diff options
37 files changed, 9996 insertions, 4805 deletions
diff --git a/target/linux/generic/backport-5.15/020-v6.1-01-mm-x86-arm64-add-arch_has_hw_pte_young.patch b/target/linux/generic/backport-5.15/020-v6.1-01-mm-x86-arm64-add-arch_has_hw_pte_young.patch index 2a4207c3b5..3bea44d865 100644 --- a/target/linux/generic/backport-5.15/020-v6.1-01-mm-x86-arm64-add-arch_has_hw_pte_young.patch +++ b/target/linux/generic/backport-5.15/020-v6.1-01-mm-x86-arm64-add-arch_has_hw_pte_young.patch @@ -1,134 +1,396 @@ -From a8e6015d9534f39abc08e6804566af059e498a60 Mon Sep 17 00:00:00 2001 +From a4103262b01a1b8704b37c01c7c813df91b7b119 Mon Sep 17 00:00:00 2001 From: Yu Zhao <yuzhao@google.com> -Date: Wed, 4 Aug 2021 01:31:34 -0600 -Subject: [PATCH 01/10] mm: x86, arm64: add arch_has_hw_pte_young() +Date: Sun, 18 Sep 2022 01:59:58 -0600 +Subject: [PATCH 01/29] mm: x86, arm64: add arch_has_hw_pte_young() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit -Some architectures automatically set the accessed bit in PTEs, e.g., -x86 and arm64 v8.2. On architectures that do not have this capability, -clearing the accessed bit in a PTE triggers a page fault following the -TLB miss of this PTE. +Patch series "Multi-Gen LRU Framework", v14. -Being aware of this capability can help make better decisions, i.e., -whether to limit the size of each batch of PTEs and the burst of -batches when clearing the accessed bit. +What's new +========== +1. OpenWrt, in addition to Android, Arch Linux Zen, Armbian, ChromeOS, + Liquorix, post-factum and XanMod, is now shipping MGLRU on 5.15. +2. Fixed long-tailed direct reclaim latency seen on high-memory (TBs) + machines. The old direct reclaim backoff, which tries to enforce a + minimum fairness among all eligible memcgs, over-swapped by about + (total_mem>>DEF_PRIORITY)-nr_to_reclaim. The new backoff, which + pulls the plug on swapping once the target is met, trades some + fairness for curtailed latency: + https://lore.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com/ +3. Fixed minior build warnings and conflicts. More comments and nits. +TLDR +==== +The current page reclaim is too expensive in terms of CPU usage and it +often makes poor choices about what to evict. This patchset offers an +alternative solution that is performant, versatile and +straightforward. + +Patchset overview +================= +The design and implementation overview is in patch 14: +https://lore.kernel.org/r/20220918080010.2920238-15-yuzhao@google.com/ + +01. mm: x86, arm64: add arch_has_hw_pte_young() +02. mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG +Take advantage of hardware features when trying to clear the accessed +bit in many PTEs. + +03. mm/vmscan.c: refactor shrink_node() +04. Revert "include/linux/mm_inline.h: fold __update_lru_size() into + its sole caller" +Minor refactors to improve readability for the following patches. + +05. mm: multi-gen LRU: groundwork +Adds the basic data structure and the functions that insert pages to +and remove pages from the multi-gen LRU (MGLRU) lists. + +06. mm: multi-gen LRU: minimal implementation +A minimal implementation without optimizations. + +07. mm: multi-gen LRU: exploit locality in rmap +Exploits spatial locality to improve efficiency when using the rmap. + +08. mm: multi-gen LRU: support page table walks +Further exploits spatial locality by optionally scanning page tables. + +09. mm: multi-gen LRU: optimize multiple memcgs +Optimizes the overall performance for multiple memcgs running mixed +types of workloads. + +10. mm: multi-gen LRU: kill switch +Adds a kill switch to enable or disable MGLRU at runtime. + +11. mm: multi-gen LRU: thrashing prevention +12. mm: multi-gen LRU: debugfs interface +Provide userspace with features like thrashing prevention, working set +estimation and proactive reclaim. + +13. mm: multi-gen LRU: admin guide +14. mm: multi-gen LRU: design doc +Add an admin guide and a design doc. + +Benchmark results +================= +Independent lab results +----------------------- +Based on the popularity of searches [01] and the memory usage in +Google's public cloud, the most popular open-source memory-hungry +applications, in alphabetical order, are: + Apache Cassandra Memcached + Apache Hadoop MongoDB + Apache Spark PostgreSQL + MariaDB (MySQL) Redis + +An independent lab evaluated MGLRU with the most widely used benchmark +suites for the above applications. They posted 960 data points along +with kernel metrics and perf profiles collected over more than 500 +hours of total benchmark time. Their final reports show that, with 95% +confidence intervals (CIs), the above applications all performed +significantly better for at least part of their benchmark matrices. + +On 5.14: +1. Apache Spark [02] took 95% CIs [9.28, 11.19]% and [12.20, 14.93]% + less wall time to sort three billion random integers, respectively, + under the medium- and the high-concurrency conditions, when + overcommitting memory. There were no statistically significant + changes in wall time for the rest of the benchmark matrix. +2. MariaDB [03] achieved 95% CIs [5.24, 10.71]% and [20.22, 25.97]% + more transactions per minute (TPM), respectively, under the medium- + and the high-concurrency conditions, when overcommitting memory. + There were no statistically significant changes in TPM for the rest + of the benchmark matrix. +3. Memcached [04] achieved 95% CIs [23.54, 32.25]%, [20.76, 41.61]% + and [21.59, 30.02]% more operations per second (OPS), respectively, + for sequential access, random access and Gaussian (distribution) + access, when THP=always; 95% CIs [13.85, 15.97]% and + [23.94, 29.92]% more OPS, respectively, for random access and + Gaussian access, when THP=never. There were no statistically + significant changes in OPS for the rest of the benchmark matrix. +4. MongoDB [05] achieved 95% CIs [2.23, 3.44]%, [6.97, 9.73]% and + [2.16, 3.55]% more operations per second (OPS), respectively, for + exponential (distribution) access, random access and Zipfian + (distribution) access, when underutilizing memory; 95% CIs + [8.83, 10.03]%, [21.12, 23.14]% and [5.53, 6.46]% more OPS, + respectively, for exponential access, random access and Zipfian + access, when overcommitting memory. + +On 5.15: +5. Apache Cassandra [06] achieved 95% CIs [1.06, 4.10]%, [1.94, 5.43]% + and [4.11, 7.50]% more operations per second (OPS), respectively, + for exponential (distribution) access, random access and Zipfian + (distribution) access, when swap was off; 95% CIs [0.50, 2.60]%, + [6.51, 8.77]% and [3.29, 6.75]% more OPS, respectively, for + exponential access, random access and Zipfian access, when swap was + on. +6. Apache Hadoop [07] took 95% CIs [5.31, 9.69]% and [2.02, 7.86]% + less average wall time to finish twelve parallel TeraSort jobs, + respectively, under the medium- and the high-concurrency + conditions, when swap was on. There were no statistically + significant changes in average wall time for the rest of the + benchmark matrix. +7. PostgreSQL [08] achieved 95% CI [1.75, 6.42]% more transactions per + minute (TPM) under the high-concurrency condition, when swap was + off; 95% CIs [12.82, 18.69]% and [22.70, 46.86]% more TPM, + respectively, under the medium- and the high-concurrency + conditions, when swap was on. There were no statistically + significant changes in TPM for the rest of the benchmark matrix. +8. Redis [09] achieved 95% CIs [0.58, 5.94]%, [6.55, 14.58]% and + [11.47, 19.36]% more total operations per second (OPS), + respectively, for sequential access, random access and Gaussian + (distribution) access, when THP=always; 95% CIs [1.27, 3.54]%, + [10.11, 14.81]% and [8.75, 13.64]% more total OPS, respectively, + for sequential access, random access and Gaussian access, when + THP=never. + +Our lab results +--------------- +To supplement the above results, we ran the following benchmark suites +on 5.16-rc7 and found no regressions [10]. + fs_fio_bench_hdd_mq pft + fs_lmbench pgsql-hammerdb + fs_parallelio redis + fs_postmark stream + hackbench sysbenchthread + kernbench tpcc_spark + memcached unixbench + multichase vm-scalability + mutilate will-it-scale + nginx + +[01] https://trends.google.com +[02] https://lore.kernel.org/r/20211102002002.92051-1-bot@edi.works/ +[03] https://lore.kernel.org/r/20211009054315.47073-1-bot@edi.works/ +[04] https://lore.kernel.org/r/20211021194103.65648-1-bot@edi.works/ +[05] https://lore.kernel.org/r/20211109021346.50266-1-bot@edi.works/ +[06] https://lore.kernel.org/r/20211202062806.80365-1-bot@edi.works/ +[07] https://lore.kernel.org/r/20211209072416.33606-1-bot@edi.works/ +[08] https://lore.kernel.org/r/20211218071041.24077-1-bot@edi.works/ +[09] https://lore.kernel.org/r/20211122053248.57311-1-bot@edi.works/ +[10] https://lore.kernel.org/r/20220104202247.2903702-1-yuzhao@google.com/ + +Read-world applications +======================= +Third-party testimonials +------------------------ +Konstantin reported [11]: + I have Archlinux with 8G RAM + zswap + swap. While developing, I + have lots of apps opened such as multiple LSP-servers for different + langs, chats, two browsers, etc... Usually, my system gets quickly + to a point of SWAP-storms, where I have to kill LSP-servers, + restart browsers to free memory, etc, otherwise the system lags + heavily and is barely usable. + + 1.5 day ago I migrated from 5.11.15 kernel to 5.12 + the LRU + patchset, and I started up by opening lots of apps to create memory + pressure, and worked for a day like this. Till now I had not a + single SWAP-storm, and mind you I got 3.4G in SWAP. I was never + getting to the point of 3G in SWAP before without a single + SWAP-storm. + +Vaibhav from IBM reported [12]: + In a synthetic MongoDB Benchmark, seeing an average of ~19% + throughput improvement on POWER10(Radix MMU + 64K Page Size) with + MGLRU patches on top of 5.16 kernel for MongoDB + YCSB across + three different request distributions, namely, Exponential, Uniform + and Zipfan. + +Shuang from U of Rochester reported [13]: + With the MGLRU, fio achieved 95% CIs [38.95, 40.26]%, [4.12, 6.64]% + and [9.26, 10.36]% higher throughput, respectively, for random + access, Zipfian (distribution) access and Gaussian (distribution) + access, when the average number of jobs per CPU is 1; 95% CIs + [42.32, 49.15]%, [9.44, 9.89]% and [20.99, 22.86]% higher + throughput, respectively, for random access, Zipfian access and + Gaussian access, when the average number of jobs per CPU is 2. + +Daniel from Michigan Tech reported [14]: + With Memcached allocating ~100GB of byte-addressable Optante, + performance improvement in terms of throughput (measured as queries + per second) was about 10% for a series of workloads. + +Large-scale deployments +----------------------- +We've rolled out MGLRU to tens of millions of ChromeOS users and +about a million Android users. Google's fleetwide profiling [15] shows +an overall 40% decrease in kswapd CPU usage, in addition to +improvements in other UX metrics, e.g., an 85% decrease in the number +of low-memory kills at the 75th percentile and an 18% decrease in +app launch time at the 50th percentile. + +The downstream kernels that have been using MGLRU include: +1. Android [16] +2. Arch Linux Zen [17] +3. Armbian [18] +4. ChromeOS [19] +5. Liquorix [20] +6. OpenWrt [21] +7. post-factum [22] +8. XanMod [23] + +[11] https://lore.kernel.org/r/140226722f2032c86301fbd326d91baefe3d7d23.camel@yandex.ru/ +[12] https://lore.kernel.org/r/87czj3mux0.fsf@vajain21.in.ibm.com/ +[13] https://lore.kernel.org/r/20220105024423.26409-1-szhai2@cs.rochester.edu/ +[14] https://lore.kernel.org/r/CA+4-3vksGvKd18FgRinxhqHetBS1hQekJE2gwco8Ja-bJWKtFw@mail.gmail.com/ +[15] https://dl.acm.org/doi/10.1145/2749469.2750392 +[16] https://android.com +[17] https://archlinux.org +[18] https://armbian.com +[19] https://chromium.org +[20] https://liquorix.net +[21] https://openwrt.org +[22] https://codeberg.org/pf-kernel +[23] https://xanmod.org + +Summary +======= +The facts are: +1. The independent lab results and the real-world applications + indicate substantial improvements; there are no known regressions. +2. Thrashing prevention, working set estimation and proactive reclaim + work out of the box; there are no equivalent solutions. +3. There is a lot of new code; no smaller changes have been + demonstrated similar effects. + +Our options, accordingly, are: +1. Given the amount of evidence, the reported improvements will likely + materialize for a wide range of workloads. +2. Gauging the interest from the past discussions, the new features + will likely be put to use for both personal computers and data + centers. +3. Based on Google's track record, the new code will likely be well + maintained in the long term. It'd be more difficult if not + impossible to achieve similar effects with other approaches. + +This patch (of 14): + +Some architectures automatically set the accessed bit in PTEs, e.g., x86 +and arm64 v8.2. On architectures that do not have this capability, +clearing the accessed bit in a PTE usually triggers a page fault following +the TLB miss of this PTE (to emulate the accessed bit). + +Being aware of this capability can help make better decisions, e.g., +whether to spread the work out over a period of time to reduce bursty page +faults when trying to clear the accessed bit in many PTEs. + +Note that theoretically this capability can be unreliable, e.g., +hotplugged CPUs might be different from builtin ones. Therefore it should +not be used in architecture-independent code that involves correctness, +e.g., to determine whether TLB flushes are required (in combination with +the accessed bit). + +Link: https://lkml.kernel.org/r/20220918080010.2920238-1-yuzhao@google.com +Link: https://lkml.kernel.org/r/20220918080010.2920238-2-yuzhao@google.com Signed-off-by: Yu Zhao <yuzhao@google.com> -Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a +Reviewed-by: Barry Song <baohua@kernel.org> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Acked-by: Will Deacon <will@kernel.org> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: linux-arm-kernel@lists.infradead.org +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- - arch/arm64/include/asm/cpufeature.h | 5 +++++ - arch/arm64/include/asm/pgtable.h | 13 ++++++++----- - arch/arm64/kernel/cpufeature.c | 10 ++++++++++ - arch/arm64/tools/cpucaps | 1 + - arch/x86/include/asm/pgtable.h | 6 +++--- - include/linux/pgtable.h | 13 +++++++++++++ - mm/memory.c | 14 +------------- - 7 files changed, 41 insertions(+), 21 deletions(-) - ---- a/arch/arm64/include/asm/cpufeature.h -+++ b/arch/arm64/include/asm/cpufeature.h -@@ -808,6 +808,11 @@ static inline bool system_supports_tlb_r - cpus_have_const_cap(ARM64_HAS_TLB_RANGE); - } - -+static inline bool system_has_hw_af(void) -+{ -+ return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF); -+} -+ - extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); - - static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) + arch/arm64/include/asm/pgtable.h | 14 ++------------ + arch/x86/include/asm/pgtable.h | 6 +++--- + include/linux/pgtable.h | 13 +++++++++++++ + mm/memory.c | 14 +------------- + 4 files changed, 19 insertions(+), 28 deletions(-) + +diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h +index ed57717cd004..874827fc7bc6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h -@@ -999,13 +999,16 @@ static inline void update_mmu_cache(stru +@@ -999,23 +999,13 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, * page after fork() + CoW for pfn mappings. We don't always have a * hardware-managed access flag on arm64. */ -static inline bool arch_faults_on_old_pte(void) -+static inline bool arch_has_hw_pte_young(bool local) - { +-{ - WARN_ON(preemptible()); -+ if (local) { -+ WARN_ON(preemptible()); -+ return cpu_has_hw_af(); -+ } - +- - return !cpu_has_hw_af(); -+ return system_has_hw_af(); - } +-} -#define arch_faults_on_old_pte arch_faults_on_old_pte -+#define arch_has_hw_pte_young arch_has_hw_pte_young ++#define arch_has_hw_pte_young cpu_has_hw_af /* * Experimentally, it's cheap to set the access flag in hardware and we -@@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pt + * benefit from prefaulting mappings as 'old' to start with. */ - static inline bool arch_wants_old_prefaulted_pte(void) - { +-static inline bool arch_wants_old_prefaulted_pte(void) +-{ - return !arch_faults_on_old_pte(); -+ return arch_has_hw_pte_young(true); - } - #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte +-} +-#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte ++#define arch_wants_old_prefaulted_pte cpu_has_hw_af ---- a/arch/arm64/kernel/cpufeature.c -+++ b/arch/arm64/kernel/cpufeature.c -@@ -2197,6 +2197,16 @@ static const struct arm64_cpu_capabiliti - .matches = has_hw_dbm, - .cpu_enable = cpu_enable_hw_dbm, - }, -+ { -+ .desc = "Hardware update of the Access flag", -+ .type = ARM64_CPUCAP_SYSTEM_FEATURE, -+ .capability = ARM64_HW_AF, -+ .sys_reg = SYS_ID_AA64MMFR1_EL1, -+ .sign = FTR_UNSIGNED, -+ .field_pos = ID_AA64MMFR1_HADBS_SHIFT, -+ .min_field_value = 1, -+ .matches = has_cpuid_feature, -+ }, - #endif - { - .desc = "CRC32 instructions", ---- a/arch/arm64/tools/cpucaps -+++ b/arch/arm64/tools/cpucaps -@@ -35,6 +35,7 @@ HAS_STAGE2_FWB - HAS_SYSREG_GIC_CPUIF - HAS_TLB_RANGE - HAS_VIRT_HOST_EXTN -+HW_AF - HW_DBM - KVM_PROTECTED_MODE - MISMATCHED_CACHE_TYPE + #endif /* !__ASSEMBLY__ */ + +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 448cd01eb3ec..3908780fc408 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h -@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c +@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_check(void) return boot_cpu_has_bug(X86_BUG_L1TF); } -#define arch_faults_on_old_pte arch_faults_on_old_pte -static inline bool arch_faults_on_old_pte(void) +#define arch_has_hw_pte_young arch_has_hw_pte_young -+static inline bool arch_has_hw_pte_young(bool local) ++static inline bool arch_has_hw_pte_young(void) { - return false; + return true; } #endif /* __ASSEMBLY__ */ +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index d468efcf48f4..2f1188980baf 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h -@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young +@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef arch_has_hw_pte_young +/* -+ * Return whether the accessed bit is supported by the local CPU or all CPUs. ++ * Return whether the accessed bit is supported on the local CPU. + * -+ * Those arches which have hw access flag feature need to implement their own -+ * helper. By default, "false" means pagefault will be hit on old pte. ++ * This stub assumes accessing through an old PTE triggers a page fault. ++ * Architectures that automatically set the access bit should overwrite it. + */ -+static inline bool arch_has_hw_pte_young(bool local) ++static inline bool arch_has_hw_pte_young(void) +{ + return false; +} @@ -137,6 +399,8 @@ Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, +diff --git a/mm/memory.c b/mm/memory.c +index a4d0f744a458..392b7326a2d2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -121,18 +121,6 @@ int randomize_va_space __read_mostly = @@ -158,12 +422,15 @@ Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { -@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct +@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src, * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. */ - if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { -+ if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) { ++ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-02-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch b/target/linux/generic/backport-5.15/020-v6.1-02-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch index 785af275f5..60ce9c07cc 100644 --- a/target/linux/generic/backport-5.15/020-v6.1-02-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch +++ b/target/linux/generic/backport-5.15/020-v6.1-02-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch @@ -1,64 +1,112 @@ -From f8b663bbfa30af5515e222fd74df20ea4e8393a2 Mon Sep 17 00:00:00 2001 +From 493de1c4b0f2cd909169401da8c445f6c8a7e29d Mon Sep 17 00:00:00 2001 From: Yu Zhao <yuzhao@google.com> -Date: Sat, 26 Sep 2020 21:17:18 -0600 -Subject: [PATCH 02/10] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG +Date: Sun, 18 Sep 2022 01:59:59 -0600 +Subject: [PATCH 02/29] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit -Some architectures support the accessed bit on non-leaf PMD entries, -e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using -it as part of linear address translation [1]. As an optimization, page -table walkers who are interested in the accessed bit can skip the PTEs -under a non-leaf PMD entry if the accessed bit is cleared on this PMD -entry. +Some architectures support the accessed bit in non-leaf PMD entries, e.g., +x86 sets the accessed bit in a non-leaf PMD entry when using it as part of +linear address translation [1]. Page table walkers that clear the +accessed bit may use this capability to reduce their search space. -Although an inline function may be preferable, this capability is -added as a configuration option to look consistent when used with the -existing macros. +Note that: +1. Although an inline function is preferable, this capability is added + as a configuration option for consistency with the existing macros. +2. Due to the little interest in other varieties, this capability was + only tested on Intel and AMD CPUs. + +Thanks to the following developers for their efforts [2][3]. + Randy Dunlap <rdunlap@infradead.org> + Stephen Rothwell <sfr@canb.auug.org.au> [1]: Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3 (June 2021), section 4.8 +[2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/ +[3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/ +Link: https://lkml.kernel.org/r/20220918080010.2920238-3-yuzhao@google.com Signed-off-by: Yu Zhao <yuzhao@google.com> +Reviewed-by: Barry Song <baohua@kernel.org> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> -Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8 +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- - arch/Kconfig | 9 +++++++++ + arch/Kconfig | 8 ++++++++ arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable.h | 3 ++- arch/x86/mm/pgtable.c | 5 ++++- include/linux/pgtable.h | 4 ++-- - 5 files changed, 18 insertions(+), 4 deletions(-) + 5 files changed, 17 insertions(+), 4 deletions(-) +diff --git a/arch/Kconfig b/arch/Kconfig +index 5987363b41c2..62d55b7ccca1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig -@@ -1295,6 +1295,15 @@ config ARCH_HAS_ELFCORE_COMPAT +@@ -1295,6 +1295,14 @@ config ARCH_HAS_ELFCORE_COMPAT config ARCH_HAS_PARANOID_L1D_FLUSH bool +config ARCH_HAS_NONLEAF_PMD_YOUNG + bool -+ depends on PGTABLE_LEVELS > 2 + help -+ Architectures that select this are able to set the accessed bit on -+ non-leaf PMD entries in addition to leaf PTE entries where pages are -+ mapped. For them, page table walkers that clear the accessed bit may -+ stop at non-leaf PMD entries if they do not see the accessed bit. ++ Architectures that select this option are capable of setting the ++ accessed bit in non-leaf PMD entries when using them as part of linear ++ address translations. Page table walkers that clear the accessed bit ++ may use this capability to reduce their search space. + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index a08ce6360382..38e1d231d52a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -84,6 +84,7 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL -+ select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64 ++ select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_COPY_MC if X86_64 select ARCH_HAS_SET_MEMORY +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 3908780fc408..01a1763123ff 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h -@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad +@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) static inline int pmd_bad(pmd_t pmd) { @@ -68,9 +116,11 @@ Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8 } static inline unsigned long pages_to_mb(unsigned long npg) +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index 3481b35cb4ec..a224193d84bf 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c -@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_ +@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, return ret; } @@ -79,7 +129,7 @@ Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8 int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { -@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_ +@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, return ret; } @@ -89,9 +139,11 @@ Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8 int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index 2f1188980baf..e6889556e0bf 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h -@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo +@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG @@ -100,7 +152,7 @@ Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8 static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) -@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo +@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, BUILD_BUG(); return 0; } @@ -109,3 +161,6 @@ Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8 #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-03-mm-vmscan.c-refactor-shrink_node.patch b/target/linux/generic/backport-5.15/020-v6.1-03-mm-vmscan.c-refactor-shrink_node.patch index 31161e2a0f..1b9a70dbc1 100644 --- a/target/linux/generic/backport-5.15/020-v6.1-03-mm-vmscan.c-refactor-shrink_node.patch +++ b/target/linux/generic/backport-5.15/020-v6.1-03-mm-vmscan.c-refactor-shrink_node.patch @@ -1,21 +1,60 @@ -From a810f8e2f1bdd0707eaf05c8b4ba84a3ff2801bd Mon Sep 17 00:00:00 2001 +From 9e17efd11450d3d2069adaa3c58db9ac8ebd1c66 Mon Sep 17 00:00:00 2001 From: Yu Zhao <yuzhao@google.com> -Date: Sun, 27 Sep 2020 20:49:08 -0600 -Subject: [PATCH 03/10] mm/vmscan.c: refactor shrink_node() +Date: Sun, 18 Sep 2022 02:00:00 -0600 +Subject: [PATCH 03/29] mm/vmscan.c: refactor shrink_node() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit -This patch refactors shrink_node(). This will make the upcoming -changes to mm/vmscan.c more readable. +This patch refactors shrink_node() to improve readability for the upcoming +changes to mm/vmscan.c. +Link: https://lkml.kernel.org/r/20220918080010.2920238-4-yuzhao@google.com Signed-off-by: Yu Zhao <yuzhao@google.com> +Reviewed-by: Barry Song <baohua@kernel.org> +Reviewed-by: Miaohe Lin <linmiaohe@huawei.com> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> -Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04 +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- - mm/vmscan.c | 186 +++++++++++++++++++++++++++------------------------- - 1 file changed, 98 insertions(+), 88 deletions(-) + mm/vmscan.c | 198 +++++++++++++++++++++++++++------------------------- + 1 file changed, 104 insertions(+), 94 deletions(-) +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 201acea81804..dc5f0381513f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -2497,6 +2497,103 @@ enum scan_balance { +@@ -2497,6 +2497,109 @@ enum scan_balance { SCAN_FILE, }; @@ -27,6 +66,12 @@ Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04 + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* ++ * Flush the memory cgroup stats, so that we read accurate per-memcg ++ * lruvec stats for heuristics. ++ */ ++ mem_cgroup_flush_stats(); ++ ++ /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&target_lruvec->lru_lock); @@ -119,7 +164,7 @@ Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04 /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined -@@ -2965,7 +3062,6 @@ static void shrink_node(pg_data_t *pgdat +@@ -2965,109 +3068,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long nr_reclaimed, nr_scanned; struct lruvec *target_lruvec; bool reclaimable = false; @@ -127,7 +172,15 @@ Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); -@@ -2981,93 +3077,7 @@ again: + again: +- /* +- * Flush the memory cgroup stats, so that we read accurate per-memcg +- * lruvec stats for heuristics. +- */ +- mem_cgroup_flush_stats(); +- + memset(&sc->nr, 0, sizeof(sc->nr)); + nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; @@ -222,3 +275,6 @@ Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04 shrink_node_memcgs(pgdat, sc); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-04-Revert-include-linux-mm_inline.h-fold-__update_lru_s.patch b/target/linux/generic/backport-5.15/020-v6.1-04-Revert-include-linux-mm_inline.h-fold-__update_lru_s.patch new file mode 100644 index 0000000000..24b5c8f797 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-04-Revert-include-linux-mm_inline.h-fold-__update_lru_s.patch @@ -0,0 +1,87 @@ +From 03705be42114db7cc5bd6eb7bf7e8703c94d4880 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 18 Sep 2022 02:00:01 -0600 +Subject: [PATCH 04/29] Revert "include/linux/mm_inline.h: fold + __update_lru_size() into its sole caller" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This patch undoes the following refactor: commit 289ccba18af4 +("include/linux/mm_inline.h: fold __update_lru_size() into its sole +caller") + +The upcoming changes to include/linux/mm_inline.h will reuse +__update_lru_size(). + +Link: https://lkml.kernel.org/r/20220918080010.2920238-5-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Reviewed-by: Miaohe Lin <linmiaohe@huawei.com> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Barry Song <baohua@kernel.org> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/mm_inline.h | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index 355ea1ee32bd..a822d6b690a5 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -24,7 +24,7 @@ static inline int page_is_file_lru(struct page *page) + return !PageSwapBacked(page); + } + +-static __always_inline void update_lru_size(struct lruvec *lruvec, ++static __always_inline void __update_lru_size(struct lruvec *lruvec, + enum lru_list lru, enum zone_type zid, + int nr_pages) + { +@@ -33,6 +33,13 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); + __mod_zone_page_state(&pgdat->node_zones[zid], + NR_ZONE_LRU_BASE + lru, nr_pages); ++} ++ ++static __always_inline void update_lru_size(struct lruvec *lruvec, ++ enum lru_list lru, enum zone_type zid, ++ long nr_pages) ++{ ++ __update_lru_size(lruvec, lru, zid, nr_pages); + #ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); + #endif +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-04-mm-multigenerational-lru-groundwork.patch b/target/linux/generic/backport-5.15/020-v6.1-04-mm-multigenerational-lru-groundwork.patch deleted file mode 100644 index aec67c23eb..0000000000 --- a/target/linux/generic/backport-5.15/020-v6.1-04-mm-multigenerational-lru-groundwork.patch +++ /dev/null @@ -1,996 +0,0 @@ -From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001 -From: Yu Zhao <yuzhao@google.com> -Date: Mon, 25 Jan 2021 21:12:33 -0700 -Subject: [PATCH 04/10] mm: multigenerational lru: groundwork - -For each lruvec, evictable pages are divided into multiple -generations. The youngest generation number is stored in -lrugen->max_seq for both anon and file types as they are aged on an -equal footing. The oldest generation numbers are stored in -lrugen->min_seq[] separately for anon and file types as clean file -pages can be evicted regardless of swap constraints. These three -variables are monotonically increasing. Generation numbers are -truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into -page->flags. The sliding window technique is used to prevent truncated -generation numbers from overlapping. Each truncated generation number -is an index to -lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. - -The framework comprises two conceptually independent components: the -aging, which produces young generations, and the eviction, which -consumes old generations. Both can be invoked independently from user -space for the purpose of working set estimation and proactive reclaim. - -The protection of hot pages and the selection of cold pages are based -on page access types and patterns. There are two access types: one via -page tables and the other via file descriptors. The protection of the -former type is by design stronger because: - 1) The uncertainty in determining the access patterns of the former - type is higher due to the coalesced nature of the accessed bit. - 2) The cost of evicting the former type is higher due to the TLB - flushes required and the likelihood of involving I/O. - 3) The penalty of under-protecting the former type is higher because - applications usually do not prepare themselves for major faults like - they do for blocked I/O. For example, client applications commonly - dedicate blocked I/O to separate threads to avoid UI janks that - negatively affect user experience. - -There are also two access patterns: one with temporal locality and the -other without. The latter pattern, e.g., random and sequential, needs -to be explicitly excluded to avoid weakening the protection of the -former pattern. Generally the former type follows the former pattern -unless MADV_SEQUENTIAL is specified and the latter type follows the -latter pattern unless outlying refaults have been observed. - -Upon faulting, a page is added to the youngest generation, which -provides the strongest protection as the eviction will not consider -this page before the aging has scanned it at least twice. The first -scan clears the accessed bit set during the initial fault. And the -second scan makes sure this page has not been used since the first -scan. A page from any other generations is brought back to the -youngest generation whenever the aging finds the accessed bit set on -any of the PTEs mapping this page. - -Unmapped pages are initially added to the oldest generation and then -conditionally protected by tiers. This is done later [PATCH 07/10]. - -Signed-off-by: Yu Zhao <yuzhao@google.com> -Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> -Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7 ---- - fs/fuse/dev.c | 3 +- - include/linux/cgroup.h | 15 +- - include/linux/mm.h | 36 ++++ - include/linux/mm_inline.h | 182 ++++++++++++++++++++ - include/linux/mmzone.h | 70 ++++++++ - include/linux/page-flags-layout.h | 19 ++- - include/linux/page-flags.h | 4 +- - include/linux/sched.h | 3 + - kernel/bounds.c | 3 + - kernel/cgroup/cgroup-internal.h | 1 - - mm/huge_memory.c | 3 +- - mm/memcontrol.c | 1 + - mm/memory.c | 7 + - mm/mm_init.c | 6 +- - mm/page_alloc.c | 1 + - mm/swap.c | 9 +- - mm/swapfile.c | 2 + - mm/vmscan.c | 268 ++++++++++++++++++++++++++++++ - 18 files changed, 618 insertions(+), 15 deletions(-) - ---- a/fs/fuse/dev.c -+++ b/fs/fuse/dev.c -@@ -785,7 +785,8 @@ static int fuse_check_page(struct page * - 1 << PG_active | - 1 << PG_workingset | - 1 << PG_reclaim | -- 1 << PG_waiters))) { -+ 1 << PG_waiters | -+ LRU_GEN_MASK | LRU_REFS_MASK))) { - dump_page(page, "fuse: trying to steal weird page"); - return 1; - } ---- a/include/linux/cgroup.h -+++ b/include/linux/cgroup.h -@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgr - css_put(&cgrp->self); - } - -+extern struct mutex cgroup_mutex; -+ -+static inline void cgroup_lock(void) -+{ -+ mutex_lock(&cgroup_mutex); -+} -+ -+static inline void cgroup_unlock(void) -+{ -+ mutex_unlock(&cgroup_mutex); -+} -+ - /** - * task_css_set_check - obtain a task's css_set with extra access conditions - * @task: the task to obtain css_set for -@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgr - * as locks used during the cgroup_subsys::attach() methods. - */ - #ifdef CONFIG_PROVE_RCU --extern struct mutex cgroup_mutex; - extern spinlock_t css_set_lock; - #define task_css_set_check(task, __c) \ - rcu_dereference_check((task)->cgroups, \ -@@ -708,6 +719,8 @@ struct cgroup; - static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } - static inline void css_get(struct cgroup_subsys_state *css) {} - static inline void css_put(struct cgroup_subsys_state *css) {} -+static inline void cgroup_lock(void) {} -+static inline void cgroup_unlock(void) {} - static inline int cgroup_attach_task_all(struct task_struct *from, - struct task_struct *t) { return 0; } - static inline int cgroupstats_build(struct cgroupstats *stats, ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v - #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) - #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) - #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) -+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) -+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) - - /* - * Define the bit shifts to access each section. For non-existent -@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s - loff_t const holebegin, loff_t const holelen, int even_cows) { } - #endif - -+#ifdef CONFIG_LRU_GEN -+static inline void task_enter_nonseq_fault(void) -+{ -+ WARN_ON(current->in_nonseq_fault); -+ -+ current->in_nonseq_fault = 1; -+} -+ -+static inline void task_exit_nonseq_fault(void) -+{ -+ WARN_ON(!current->in_nonseq_fault); -+ -+ current->in_nonseq_fault = 0; -+} -+ -+static inline bool task_in_nonseq_fault(void) -+{ -+ return current->in_nonseq_fault; -+} -+#else -+static inline void task_enter_nonseq_fault(void) -+{ -+} -+ -+static inline void task_exit_nonseq_fault(void) -+{ -+} -+ -+static inline bool task_in_nonseq_fault(void) -+{ -+ return false; -+} -+#endif /* CONFIG_LRU_GEN */ -+ - static inline void unmap_shared_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen) - { ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag - return lru; - } - -+#ifdef CONFIG_LRU_GEN -+ -+static inline bool lru_gen_enabled(void) -+{ -+#ifdef CONFIG_LRU_GEN_ENABLED -+ DECLARE_STATIC_KEY_TRUE(lru_gen_static_key); -+ -+ return static_branch_likely(&lru_gen_static_key); -+#else -+ DECLARE_STATIC_KEY_FALSE(lru_gen_static_key); -+ -+ return static_branch_unlikely(&lru_gen_static_key); -+#endif -+} -+ -+/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */ -+static inline int lru_gen_from_seq(unsigned long seq) -+{ -+ return seq % MAX_NR_GENS; -+} -+ -+/* The youngest and the second youngest generations are counted as active. */ -+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) -+{ -+ unsigned long max_seq = lruvec->evictable.max_seq; -+ -+ VM_BUG_ON(gen >= MAX_NR_GENS); -+ -+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); -+} -+ -+/* Update the sizes of the multigenerational lru lists. */ -+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec, -+ int old_gen, int new_gen) -+{ -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ int delta = thp_nr_pages(page); -+ enum lru_list lru = type * LRU_FILE; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ lockdep_assert_held(&lruvec->lru_lock); -+ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); -+ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); -+ VM_BUG_ON(old_gen == -1 && new_gen == -1); -+ -+ if (old_gen >= 0) -+ WRITE_ONCE(lrugen->sizes[old_gen][type][zone], -+ lrugen->sizes[old_gen][type][zone] - delta); -+ if (new_gen >= 0) -+ WRITE_ONCE(lrugen->sizes[new_gen][type][zone], -+ lrugen->sizes[new_gen][type][zone] + delta); -+ -+ if (old_gen < 0) { -+ if (lru_gen_is_active(lruvec, new_gen)) -+ lru += LRU_ACTIVE; -+ update_lru_size(lruvec, lru, zone, delta); -+ return; -+ } -+ -+ if (new_gen < 0) { -+ if (lru_gen_is_active(lruvec, old_gen)) -+ lru += LRU_ACTIVE; -+ update_lru_size(lruvec, lru, zone, -delta); -+ return; -+ } -+ -+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { -+ update_lru_size(lruvec, lru, zone, -delta); -+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); -+ } -+ -+ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); -+} -+ -+/* Add a page to one of the multigenerational lru lists. Return true on success. */ -+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ int gen; -+ unsigned long old_flags, new_flags; -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ if (PageUnevictable(page) || !lrugen->enabled[type]) -+ return false; -+ /* -+ * If a page shouldn't be considered for eviction, i.e., a page mapped -+ * upon fault during which the accessed bit is set, add it to the -+ * youngest generation. -+ * -+ * If a page can't be evicted immediately, i.e., an anon page not in -+ * swap cache or a dirty page pending writeback, add it to the second -+ * oldest generation. -+ * -+ * If a page could be evicted immediately, e.g., a clean page, add it to -+ * the oldest generation. -+ */ -+ if (PageActive(page)) -+ gen = lru_gen_from_seq(lrugen->max_seq); -+ else if ((!type && !PageSwapCache(page)) || -+ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page)))) -+ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); -+ else -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page); -+ -+ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active)); -+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; -+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ lru_gen_update_size(page, lruvec, -1, gen); -+ /* for rotate_reclaimable_page() */ -+ if (reclaiming) -+ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); -+ else -+ list_add(&page->lru, &lrugen->lists[gen][type][zone]); -+ -+ return true; -+} -+ -+/* Delete a page from one of the multigenerational lru lists. Return true on success. */ -+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ int gen; -+ unsigned long old_flags, new_flags; -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ if (!(new_flags & LRU_GEN_MASK)) -+ return false; -+ -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ -+ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+ -+ new_flags &= ~LRU_GEN_MASK; -+ /* for shrink_page_list() */ -+ if (reclaiming) -+ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); -+ else if (lru_gen_is_active(lruvec, gen)) -+ new_flags |= BIT(PG_active); -+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ lru_gen_update_size(page, lruvec, gen, -1); -+ list_del(&page->lru); -+ -+ return true; -+} -+ -+#else -+ -+static inline bool lru_gen_enabled(void) -+{ -+ return false; -+} -+ -+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ return false; -+} -+ -+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ return false; -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - static __always_inline void add_page_to_lru_list(struct page *page, - struct lruvec *lruvec) - { - enum lru_list lru = page_lru(page); - -+ if (lru_gen_add_page(page, lruvec, false)) -+ return; -+ - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add(&page->lru, &lruvec->lists[lru]); - } -@@ -93,6 +269,9 @@ static __always_inline void add_page_to_ - { - enum lru_list lru = page_lru(page); - -+ if (lru_gen_add_page(page, lruvec, true)) -+ return; -+ - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add_tail(&page->lru, &lruvec->lists[lru]); - } -@@ -100,6 +279,9 @@ static __always_inline void add_page_to_ - static __always_inline void del_page_from_lru_list(struct page *page, - struct lruvec *lruvec) - { -+ if (lru_gen_del_page(page, lruvec, false)) -+ return; -+ - list_del(&page->lru); - update_lru_size(lruvec, page_lru(page), page_zonenum(page), - -thp_nr_pages(page)); ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -294,6 +294,72 @@ enum lruvec_flags { - */ - }; - -+struct lruvec; -+ -+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) -+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) -+ -+#ifdef CONFIG_LRU_GEN -+ -+/* -+ * For each lruvec, evictable pages are divided into multiple generations. The -+ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are -+ * monotonically increasing. The sliding window technique is used to track at -+ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the -+ * window, AKA gen, indexes an array of per-type and per-zone lists for the -+ * corresponding generation. The counter in page->flags stores gen+1 while a -+ * page is on one of the multigenerational lru lists. Otherwise, it stores 0. -+ * -+ * After a page is faulted in, the aging must check the accessed bit at least -+ * twice before the eviction would consider it. The first check clears the -+ * accessed bit set during the initial fault. The second check makes sure this -+ * page hasn't been used since then. -+ */ -+#define MIN_NR_GENS 2 -+#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) -+ -+struct lrugen { -+ /* the aging increments the max generation number */ -+ unsigned long max_seq; -+ /* the eviction increments the min generation numbers */ -+ unsigned long min_seq[ANON_AND_FILE]; -+ /* the birth time of each generation in jiffies */ -+ unsigned long timestamps[MAX_NR_GENS]; -+ /* the multigenerational lru lists */ -+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* the sizes of the multigenerational lru lists in pages */ -+ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* whether the multigenerational lru is enabled */ -+ bool enabled[ANON_AND_FILE]; -+}; -+ -+#define MAX_BATCH_SIZE 8192 -+ -+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); -+void lru_gen_change_state(bool enable, bool main, bool swap); -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_init_memcg(struct mem_cgroup *memcg); -+#endif -+ -+#else /* !CONFIG_LRU_GEN */ -+ -+static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) -+{ -+} -+ -+static inline void lru_gen_change_state(bool enable, bool main, bool swap) -+{ -+} -+ -+#ifdef CONFIG_MEMCG -+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) -+{ -+} -+#endif -+ -+#endif /* CONFIG_LRU_GEN */ -+ - struct lruvec { - struct list_head lists[NR_LRU_LISTS]; - /* per lruvec lru_lock for memcg */ -@@ -311,6 +377,10 @@ struct lruvec { - unsigned long refaults[ANON_AND_FILE]; - /* Various lruvec state flags (enum lruvec_flags) */ - unsigned long flags; -+#ifdef CONFIG_LRU_GEN -+ /* unevictable pages are on LRU_UNEVICTABLE */ -+ struct lrugen evictable; -+#endif - #ifdef CONFIG_MEMCG - struct pglist_data *pgdat; - #endif ---- a/include/linux/page-flags-layout.h -+++ b/include/linux/page-flags-layout.h -@@ -26,6 +26,14 @@ - - #define ZONES_WIDTH ZONES_SHIFT - -+#ifdef CONFIG_LRU_GEN -+/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */ -+#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2) -+#else -+#define LRU_GEN_WIDTH 0 -+#define LRU_REFS_WIDTH 0 -+#endif /* CONFIG_LRU_GEN */ -+ - #ifdef CONFIG_SPARSEMEM - #include <asm/sparsemem.h> - #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) -@@ -55,7 +63,8 @@ - #define SECTIONS_WIDTH 0 - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ -+ <= BITS_PER_LONG - NR_PAGEFLAGS - #define NODES_WIDTH NODES_SHIFT - #elif defined(CONFIG_SPARSEMEM_VMEMMAP) - #error "Vmemmap: No space for nodes field in page flags" -@@ -89,8 +98,8 @@ - #define LAST_CPUPID_SHIFT 0 - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ -- <= BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ -+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS - #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT - #else - #define LAST_CPUPID_WIDTH 0 -@@ -100,8 +109,8 @@ - #define LAST_CPUPID_NOT_IN_PAGE_FLAGS - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ -- > BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ -+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS - #error "Not enough bits in page flags" - #endif - ---- a/include/linux/page-flags.h -+++ b/include/linux/page-flags.h -@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall - 1UL << PG_private | 1UL << PG_private_2 | \ - 1UL << PG_writeback | 1UL << PG_reserved | \ - 1UL << PG_slab | 1UL << PG_active | \ -- 1UL << PG_unevictable | __PG_MLOCKED) -+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) - - /* - * Flags checked when a page is prepped for return by the page allocator. -@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall - * alloc-free cycle to prevent from reusing the page. - */ - #define PAGE_FLAGS_CHECK_AT_PREP \ -- (PAGEFLAGS_MASK & ~__PG_HWPOISON) -+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) - - #define PAGE_FLAGS_PRIVATE \ - (1UL << PG_private | 1UL << PG_private_2) ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -911,6 +911,9 @@ struct task_struct { - #ifdef CONFIG_MEMCG - unsigned in_user_fault:1; - #endif -+#ifdef CONFIG_LRU_GEN -+ unsigned in_nonseq_fault:1; -+#endif - #ifdef CONFIG_COMPAT_BRK - unsigned brk_randomized:1; - #endif ---- a/kernel/bounds.c -+++ b/kernel/bounds.c -@@ -22,6 +22,9 @@ int main(void) - DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); - #endif - DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); -+#ifdef CONFIG_LRU_GEN -+ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1)); -+#endif - /* End of constants */ - - return 0; ---- a/kernel/cgroup/cgroup-internal.h -+++ b/kernel/cgroup/cgroup-internal.h -@@ -165,7 +165,6 @@ struct cgroup_mgctx { - #define DEFINE_CGROUP_MGCTX(name) \ - struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) - --extern struct mutex cgroup_mutex; - extern spinlock_t css_set_lock; - extern struct cgroup_subsys *cgroup_subsys[]; - extern struct list_head cgroup_roots; ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -2366,7 +2366,8 @@ static void __split_huge_page_tail(struc - #ifdef CONFIG_64BIT - (1L << PG_arch_2) | - #endif -- (1L << PG_dirty))); -+ (1L << PG_dirty) | -+ LRU_GEN_MASK | LRU_REFS_MASK)); - - /* ->mapping in first tail page is compound_mapcount */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -5241,6 +5241,7 @@ static struct mem_cgroup *mem_cgroup_all - memcg->deferred_split_queue.split_queue_len = 0; - #endif - idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); -+ lru_gen_init_memcg(memcg); - return memcg; - fail: - mem_cgroup_id_remove(memcg); ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are - unsigned int flags, struct pt_regs *regs) - { - vm_fault_t ret; -+ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ); - - __set_current_state(TASK_RUNNING); - -@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are - if (flags & FAULT_FLAG_USER) - mem_cgroup_enter_user_fault(); - -+ if (nonseq_fault) -+ task_enter_nonseq_fault(); -+ - if (unlikely(is_vm_hugetlb_page(vma))) - ret = hugetlb_fault(vma->vm_mm, vma, address, flags); - else - ret = __handle_mm_fault(vma, address, flags); - -+ if (nonseq_fault) -+ task_exit_nonseq_fault(); -+ - if (flags & FAULT_FLAG_USER) { - mem_cgroup_exit_user_fault(); - /* ---- a/mm/mm_init.c -+++ b/mm/mm_init.c -@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo - - shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH -- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; -+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", -- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", -+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", - SECTIONS_WIDTH, - NODES_WIDTH, - ZONES_WIDTH, - LAST_CPUPID_WIDTH, - KASAN_TAG_WIDTH, -+ LRU_GEN_WIDTH, -+ LRU_REFS_WIDTH, - NR_PAGEFLAGS); - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -7459,6 +7459,7 @@ static void __meminit pgdat_init_interna - - pgdat_page_ext_init(pgdat); - lruvec_init(&pgdat->__lruvec); -+ lru_gen_init_state(NULL, &pgdat->__lruvec); - } - - static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page) - VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); - VM_BUG_ON_PAGE(PageLRU(page), page); - -+ /* see the comment in lru_gen_add_page() */ -+ if (lru_gen_enabled() && !PageUnevictable(page) && -+ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC)) -+ SetPageActive(page); -+ - get_page(page); - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_add); -@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc - - static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) - { -- if (PageActive(page) && !PageUnevictable(page)) { -+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { - int nr_pages = thp_nr_pages(page); - - del_page_from_lru_list(page, lruvec); -@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p - */ - void deactivate_page(struct page *page) - { -- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { -+ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { - struct pagevec *pvec; - - local_lock(&lru_pvecs.lock); ---- a/mm/swapfile.c -+++ b/mm/swapfile.c -@@ -2689,6 +2689,7 @@ SYSCALL_DEFINE1(swapoff, const char __us - err = 0; - atomic_inc(&proc_poll_event); - wake_up_interruptible(&proc_poll_wait); -+ lru_gen_change_state(false, false, true); - - out_dput: - filp_close(victim, NULL); -@@ -3350,6 +3351,7 @@ SYSCALL_DEFINE2(swapon, const char __use - mutex_unlock(&swapon_mutex); - atomic_inc(&proc_poll_event); - wake_up_interruptible(&proc_poll_wait); -+ lru_gen_change_state(true, false, true); - - error = 0; - goto out; ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -50,6 +50,7 @@ - #include <linux/printk.h> - #include <linux/dax.h> - #include <linux/psi.h> -+#include <linux/memory.h> - - #include <asm/tlbflush.h> - #include <asm/div64.h> -@@ -2815,6 +2816,273 @@ static bool can_age_anon_pages(struct pg - return can_demote(pgdat->node_id, sc); - } - -+#ifdef CONFIG_LRU_GEN -+ -+/****************************************************************************** -+ * shorthand helpers -+ ******************************************************************************/ -+ -+#define for_each_gen_type_zone(gen, type, zone) \ -+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ -+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ -+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) -+ -+static int page_lru_gen(struct page *page) -+{ -+ unsigned long flags = READ_ONCE(page->flags); -+ -+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+} -+ -+static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) -+{ -+ struct pglist_data *pgdat = NODE_DATA(nid); -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) { -+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; -+ -+ if (lruvec->pgdat != pgdat) -+ lruvec->pgdat = pgdat; -+ -+ return lruvec; -+ } -+#endif -+ return pgdat ? &pgdat->__lruvec : NULL; -+} -+ -+static int get_nr_gens(struct lruvec *lruvec, int type) -+{ -+ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; -+} -+ -+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) -+{ -+ return get_nr_gens(lruvec, 1) >= MIN_NR_GENS && -+ get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) && -+ get_nr_gens(lruvec, 0) <= MAX_NR_GENS; -+} -+ -+/****************************************************************************** -+ * state change -+ ******************************************************************************/ -+ -+#ifdef CONFIG_LRU_GEN_ENABLED -+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); -+#else -+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); -+#endif -+ -+static int lru_gen_nr_swapfiles; -+ -+static bool __maybe_unused state_is_valid(struct lruvec *lruvec) -+{ -+ int gen, type, zone; -+ enum lru_list lru; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ for_each_evictable_lru(lru) { -+ type = is_file_lru(lru); -+ -+ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) -+ return false; -+ } -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) -+ return false; -+ -+ /* unlikely but not a bug when reset_batch_size() is pending */ -+ VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); -+ } -+ -+ return true; -+} -+ -+static bool fill_lists(struct lruvec *lruvec) -+{ -+ enum lru_list lru; -+ int remaining = MAX_BATCH_SIZE; -+ -+ for_each_evictable_lru(lru) { -+ int type = is_file_lru(lru); -+ bool active = is_active_lru(lru); -+ struct list_head *head = &lruvec->lists[lru]; -+ -+ if (!lruvec->evictable.enabled[type]) -+ continue; -+ -+ while (!list_empty(head)) { -+ bool success; -+ struct page *page = lru_to_page(head); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page) != active, page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ del_page_from_lru_list(page, lruvec); -+ success = lru_gen_add_page(page, lruvec, false); -+ VM_BUG_ON(!success); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+ -+ return true; -+} -+ -+static bool drain_lists(struct lruvec *lruvec) -+{ -+ int gen, type, zone; -+ int remaining = MAX_BATCH_SIZE; -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; -+ -+ if (lruvec->evictable.enabled[type]) -+ continue; -+ -+ while (!list_empty(head)) { -+ bool success; -+ struct page *page = lru_to_page(head); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ success = lru_gen_del_page(page, lruvec, false); -+ VM_BUG_ON(!success); -+ add_page_to_lru_list(page, lruvec); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+ -+ return true; -+} -+ -+/* -+ * For file page tracking, we enable/disable it according to the main switch. -+ * For anon page tracking, we only enabled it when the main switch is on and -+ * there is at least one swapfile; we disable it when there are no swapfiles -+ * regardless of the value of the main switch. Otherwise, we will eventually -+ * reach the max size of the sliding window and have to call inc_min_seq(). -+ */ -+void lru_gen_change_state(bool enable, bool main, bool swap) -+{ -+ static DEFINE_MUTEX(state_mutex); -+ -+ struct mem_cgroup *memcg; -+ -+ mem_hotplug_begin(); -+ cgroup_lock(); -+ mutex_lock(&state_mutex); -+ -+ if (swap) { -+ if (enable) -+ swap = !lru_gen_nr_swapfiles++; -+ else -+ swap = !--lru_gen_nr_swapfiles; -+ } -+ -+ if (main && enable != lru_gen_enabled()) { -+ if (enable) -+ static_branch_enable(&lru_gen_static_key); -+ else -+ static_branch_disable(&lru_gen_static_key); -+ } else if (!swap || !lru_gen_enabled()) -+ goto unlock; -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ int nid; -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ if (!lruvec) -+ continue; -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ VM_BUG_ON(!state_is_valid(lruvec)); -+ -+ lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; -+ lruvec->evictable.enabled[1] = lru_gen_enabled(); -+ -+ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) { -+ spin_unlock_irq(&lruvec->lru_lock); -+ cond_resched(); -+ spin_lock_irq(&lruvec->lru_lock); -+ } -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ } -+ -+ cond_resched(); -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+unlock: -+ mutex_unlock(&state_mutex); -+ cgroup_unlock(); -+ mem_hotplug_done(); -+} -+ -+/****************************************************************************** -+ * initialization -+ ******************************************************************************/ -+ -+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) -+{ -+ int i; -+ int gen, type, zone; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ lrugen->max_seq = MIN_NR_GENS + 1; -+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; -+ lrugen->enabled[1] = lru_gen_enabled(); -+ -+ for (i = 0; i <= MIN_NR_GENS + 1; i++) -+ lrugen->timestamps[i] = jiffies; -+ -+ for_each_gen_type_zone(gen, type, zone) -+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); -+} -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_init_memcg(struct mem_cgroup *memcg) -+{ -+ int nid; -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ lru_gen_init_state(memcg, lruvec); -+ } -+} -+#endif -+ -+static int __init init_lru_gen(void) -+{ -+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); -+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); -+ -+ return 0; -+}; -+late_initcall(init_lru_gen); -+ -+#endif /* CONFIG_LRU_GEN */ -+ - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - { - unsigned long nr[NR_LRU_LISTS]; diff --git a/target/linux/generic/backport-5.15/020-v6.1-05-mm-multi-gen-LRU-groundwork.patch b/target/linux/generic/backport-5.15/020-v6.1-05-mm-multi-gen-LRU-groundwork.patch new file mode 100644 index 0000000000..5c143f3cfa --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-05-mm-multi-gen-LRU-groundwork.patch @@ -0,0 +1,842 @@ +From a9b328add8422921a0dbbef162730800e16e8cfd Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 18 Sep 2022 02:00:02 -0600 +Subject: [PATCH 05/29] mm: multi-gen LRU: groundwork +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Evictable pages are divided into multiple generations for each lruvec. +The youngest generation number is stored in lrugen->max_seq for both +anon and file types as they are aged on an equal footing. The oldest +generation numbers are stored in lrugen->min_seq[] separately for anon +and file types as clean file pages can be evicted regardless of swap +constraints. These three variables are monotonically increasing. + +Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits +in order to fit into the gen counter in page->flags. Each truncated +generation number is an index to lrugen->lists[]. The sliding window +technique is used to track at least MIN_NR_GENS and at most +MAX_NR_GENS generations. The gen counter stores a value within [1, +MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it +stores 0. + +There are two conceptually independent procedures: "the aging", which +produces young generations, and "the eviction", which consumes old +generations. They form a closed-loop system, i.e., "the page reclaim". +Both procedures can be invoked from userspace for the purposes of working +set estimation and proactive reclaim. These techniques are commonly used +to optimize job scheduling (bin packing) in data centers [1][2]. + +To avoid confusion, the terms "hot" and "cold" will be applied to the +multi-gen LRU, as a new convention; the terms "active" and "inactive" will +be applied to the active/inactive LRU, as usual. + +The protection of hot pages and the selection of cold pages are based +on page access channels and patterns. There are two access channels: +one through page tables and the other through file descriptors. The +protection of the former channel is by design stronger because: +1. The uncertainty in determining the access patterns of the former + channel is higher due to the approximation of the accessed bit. +2. The cost of evicting the former channel is higher due to the TLB + flushes required and the likelihood of encountering the dirty bit. +3. The penalty of underprotecting the former channel is higher because + applications usually do not prepare themselves for major page + faults like they do for blocked I/O. E.g., GUI applications + commonly use dedicated I/O threads to avoid blocking rendering + threads. + +There are also two access patterns: one with temporal locality and the +other without. For the reasons listed above, the former channel is +assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is +present; the latter channel is assumed to follow the latter pattern unless +outlying refaults have been observed [3][4]. + +The next patch will address the "outlying refaults". Three macros, i.e., +LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in +this patch to make the entire patchset less diffy. + +A page is added to the youngest generation on faulting. The aging needs +to check the accessed bit at least twice before handing this page over to +the eviction. The first check takes care of the accessed bit set on the +initial fault; the second check makes sure this page has not been used +since then. This protocol, AKA second chance, requires a minimum of two +generations, hence MIN_NR_GENS. + +[1] https://dl.acm.org/doi/10.1145/3297858.3304053 +[2] https://dl.acm.org/doi/10.1145/3503222.3507731 +[3] https://lwn.net/Articles/495543/ +[4] https://lwn.net/Articles/815342/ + +Link: https://lkml.kernel.org/r/20220918080010.2920238-6-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Barry Song <baohua@kernel.org> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + fs/fuse/dev.c | 3 +- + include/linux/mm.h | 2 + + include/linux/mm_inline.h | 177 +++++++++++++++++++++++++++++- + include/linux/mmzone.h | 100 +++++++++++++++++ + include/linux/page-flags-layout.h | 13 ++- + include/linux/page-flags.h | 4 +- + include/linux/sched.h | 4 + + kernel/bounds.c | 5 + + mm/Kconfig | 8 ++ + mm/huge_memory.c | 3 +- + mm/memcontrol.c | 2 + + mm/memory.c | 25 +++++ + mm/mm_init.c | 6 +- + mm/mmzone.c | 2 + + mm/swap.c | 10 +- + mm/vmscan.c | 75 +++++++++++++ + 16 files changed, 425 insertions(+), 14 deletions(-) + +diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c +index d6b5339c56e2..4ec08f7c3e75 100644 +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *page) + 1 << PG_active | + 1 << PG_workingset | + 1 << PG_reclaim | +- 1 << PG_waiters))) { ++ 1 << PG_waiters | ++ LRU_GEN_MASK | LRU_REFS_MASK))) { + dump_page(page, "fuse: trying to steal weird page"); + return 1; + } +diff --git a/include/linux/mm.h b/include/linux/mm.h +index e4e1817bb3b8..699068f39aa0 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); + #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) + #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) ++#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) ++#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) + + /* + * Define the bit shifts to access each section. For non-existent +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index a822d6b690a5..65320d2b8f60 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -26,10 +26,13 @@ static inline int page_is_file_lru(struct page *page) + + static __always_inline void __update_lru_size(struct lruvec *lruvec, + enum lru_list lru, enum zone_type zid, +- int nr_pages) ++ long nr_pages) + { + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + ++ lockdep_assert_held(&lruvec->lru_lock); ++ WARN_ON_ONCE(nr_pages != (int)nr_pages); ++ + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); + __mod_zone_page_state(&pgdat->node_zones[zid], + NR_ZONE_LRU_BASE + lru, nr_pages); +@@ -86,11 +89,177 @@ static __always_inline enum lru_list page_lru(struct page *page) + return lru; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static inline bool lru_gen_enabled(void) ++{ ++ return true; ++} ++ ++static inline bool lru_gen_in_fault(void) ++{ ++ return current->in_lru_fault; ++} ++ ++static inline int lru_gen_from_seq(unsigned long seq) ++{ ++ return seq % MAX_NR_GENS; ++} ++ ++static inline int page_lru_gen(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) ++{ ++ unsigned long max_seq = lruvec->lrugen.max_seq; ++ ++ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); ++ ++ /* see the comment on MIN_NR_GENS */ ++ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); ++} ++ ++static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page, ++ int old_gen, int new_gen) ++{ ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ enum lru_list lru = type * LRU_INACTIVE_FILE; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); ++ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); ++ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); ++ ++ if (old_gen >= 0) ++ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], ++ lrugen->nr_pages[old_gen][type][zone] - delta); ++ if (new_gen >= 0) ++ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], ++ lrugen->nr_pages[new_gen][type][zone] + delta); ++ ++ /* addition */ ++ if (old_gen < 0) { ++ if (lru_gen_is_active(lruvec, new_gen)) ++ lru += LRU_ACTIVE; ++ __update_lru_size(lruvec, lru, zone, delta); ++ return; ++ } ++ ++ /* deletion */ ++ if (new_gen < 0) { ++ if (lru_gen_is_active(lruvec, old_gen)) ++ lru += LRU_ACTIVE; ++ __update_lru_size(lruvec, lru, zone, -delta); ++ return; ++ } ++} ++ ++static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming) ++{ ++ unsigned long seq; ++ unsigned long flags; ++ int gen = page_lru_gen(page); ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ VM_WARN_ON_ONCE_PAGE(gen != -1, page); ++ ++ if (PageUnevictable(page)) ++ return false; ++ /* ++ * There are three common cases for this page: ++ * 1. If it's hot, e.g., freshly faulted in or previously hot and ++ * migrated, add it to the youngest generation. ++ * 2. If it's cold but can't be evicted immediately, i.e., an anon page ++ * not in swapcache or a dirty page pending writeback, add it to the ++ * second oldest generation. ++ * 3. Everything else (clean, cold) is added to the oldest generation. ++ */ ++ if (PageActive(page)) ++ seq = lrugen->max_seq; ++ else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) || ++ (PageReclaim(page) && ++ (PageDirty(page) || PageWriteback(page)))) ++ seq = lrugen->min_seq[type] + 1; ++ else ++ seq = lrugen->min_seq[type]; ++ ++ gen = lru_gen_from_seq(seq); ++ flags = (gen + 1UL) << LRU_GEN_PGOFF; ++ /* see the comment on MIN_NR_GENS about PG_active */ ++ set_mask_bits(&page->flags, LRU_GEN_MASK | BIT(PG_active), flags); ++ ++ lru_gen_update_size(lruvec, page, -1, gen); ++ /* for rotate_reclaimable_page() */ ++ if (reclaiming) ++ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); ++ else ++ list_add(&page->lru, &lrugen->lists[gen][type][zone]); ++ ++ return true; ++} ++ ++static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming) ++{ ++ unsigned long flags; ++ int gen = page_lru_gen(page); ++ ++ if (gen < 0) ++ return false; ++ ++ VM_WARN_ON_ONCE_PAGE(PageActive(page), page); ++ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); ++ ++ /* for migrate_page_states() */ ++ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; ++ flags = set_mask_bits(&page->flags, LRU_GEN_MASK, flags); ++ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ ++ lru_gen_update_size(lruvec, page, gen, -1); ++ list_del(&page->lru); ++ ++ return true; ++} ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline bool lru_gen_enabled(void) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_in_fault(void) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static __always_inline void add_page_to_lru_list(struct page *page, + struct lruvec *lruvec) + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(lruvec, page, false)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add(&page->lru, &lruvec->lists[lru]); + } +@@ -100,6 +269,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(lruvec, page, true)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add_tail(&page->lru, &lruvec->lists[lru]); + } +@@ -107,6 +279,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, + static __always_inline void del_page_from_lru_list(struct page *page, + struct lruvec *lruvec) + { ++ if (lru_gen_del_page(lruvec, page, false)) ++ return; ++ + list_del(&page->lru); + update_lru_size(lruvec, page_lru(page), page_zonenum(page), + -thp_nr_pages(page)); +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 6ba100216530..0c39f72184d0 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -294,6 +294,102 @@ enum lruvec_flags { + */ + }; + ++#endif /* !__GENERATING_BOUNDS_H */ ++ ++/* ++ * Evictable pages are divided into multiple generations. The youngest and the ++ * oldest generation numbers, max_seq and min_seq, are monotonically increasing. ++ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An ++ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the ++ * corresponding generation. The gen counter in page->flags stores gen+1 while ++ * a page is on one of lrugen->lists[]. Otherwise it stores 0. ++ * ++ * A page is added to the youngest generation on faulting. The aging needs to ++ * check the accessed bit at least twice before handing this page over to the ++ * eviction. The first check takes care of the accessed bit set on the initial ++ * fault; the second check makes sure this page hasn't been used since then. ++ * This process, AKA second chance, requires a minimum of two generations, ++ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive ++ * LRU, e.g., /proc/vmstat, these two generations are considered active; the ++ * rest of generations, if they exist, are considered inactive. See ++ * lru_gen_is_active(). ++ * ++ * PG_active is always cleared while a page is on one of lrugen->lists[] so that ++ * the aging needs not to worry about it. And it's set again when a page ++ * considered active is isolated for non-reclaiming purposes, e.g., migration. ++ * See lru_gen_add_page() and lru_gen_del_page(). ++ * ++ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the ++ * number of categories of the active/inactive LRU when keeping track of ++ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits ++ * in page->flags. ++ */ ++#define MIN_NR_GENS 2U ++#define MAX_NR_GENS 4U ++ ++#ifndef __GENERATING_BOUNDS_H ++ ++struct lruvec; ++ ++#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) ++#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) ++ ++#ifdef CONFIG_LRU_GEN ++ ++enum { ++ LRU_GEN_ANON, ++ LRU_GEN_FILE, ++}; ++ ++/* ++ * The youngest generation number is stored in max_seq for both anon and file ++ * types as they are aged on an equal footing. The oldest generation numbers are ++ * stored in min_seq[] separately for anon and file types as clean file pages ++ * can be evicted regardless of swap constraints. ++ * ++ * Normally anon and file min_seq are in sync. But if swapping is constrained, ++ * e.g., out of swap space, file min_seq is allowed to advance and leave anon ++ * min_seq behind. ++ * ++ * The number of pages in each generation is eventually consistent and therefore ++ * can be transiently negative. ++ */ ++struct lru_gen_struct { ++ /* the aging increments the youngest generation number */ ++ unsigned long max_seq; ++ /* the eviction increments the oldest generation numbers */ ++ unsigned long min_seq[ANON_AND_FILE]; ++ /* the multi-gen LRU lists, lazily sorted on eviction */ ++ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the multi-gen LRU sizes, eventually consistent */ ++ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++}; ++ ++void lru_gen_init_lruvec(struct lruvec *lruvec); ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg); ++void lru_gen_exit_memcg(struct mem_cgroup *memcg); ++#endif ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_init_lruvec(struct lruvec *lruvec) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) ++{ ++} ++#endif ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct lruvec { + struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ +@@ -311,6 +407,10 @@ struct lruvec { + unsigned long refaults[ANON_AND_FILE]; + /* Various lruvec state flags (enum lruvec_flags) */ + unsigned long flags; ++#ifdef CONFIG_LRU_GEN ++ /* evictable pages divided into generations */ ++ struct lru_gen_struct lrugen; ++#endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; + #endif +diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h +index ef1e3e736e14..240905407a18 100644 +--- a/include/linux/page-flags-layout.h ++++ b/include/linux/page-flags-layout.h +@@ -55,7 +55,8 @@ + #define SECTIONS_WIDTH 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ ++ <= BITS_PER_LONG - NR_PAGEFLAGS + #define NODES_WIDTH NODES_SHIFT + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) + #error "Vmemmap: No space for nodes field in page flags" +@@ -89,8 +90,8 @@ + #define LAST_CPUPID_SHIFT 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ +- <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS + #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT + #else + #define LAST_CPUPID_WIDTH 0 +@@ -100,10 +101,12 @@ + #define LAST_CPUPID_NOT_IN_PAGE_FLAGS + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ +- > BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS + #error "Not enough bits in page flags" + #endif + ++#define LRU_REFS_WIDTH 0 ++ + #endif + #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index fbfd3fad48f2..a7d7ff4c621d 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) + 1UL << PG_private | 1UL << PG_private_2 | \ + 1UL << PG_writeback | 1UL << PG_reserved | \ + 1UL << PG_slab | 1UL << PG_active | \ +- 1UL << PG_unevictable | __PG_MLOCKED) ++ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) + + /* + * Flags checked when a page is prepped for return by the page allocator. +@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) + * alloc-free cycle to prevent from reusing the page. + */ + #define PAGE_FLAGS_CHECK_AT_PREP \ +- (PAGEFLAGS_MASK & ~__PG_HWPOISON) ++ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) + + #define PAGE_FLAGS_PRIVATE \ + (1UL << PG_private | 1UL << PG_private_2) +diff --git a/include/linux/sched.h b/include/linux/sched.h +index e418935f8db6..545f6b1ccd50 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -911,6 +911,10 @@ struct task_struct { + #ifdef CONFIG_MEMCG + unsigned in_user_fault:1; + #endif ++#ifdef CONFIG_LRU_GEN ++ /* whether the LRU algorithm may apply to this access */ ++ unsigned in_lru_fault:1; ++#endif + #ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; + #endif +diff --git a/kernel/bounds.c b/kernel/bounds.c +index 9795d75b09b2..5ee60777d8e4 100644 +--- a/kernel/bounds.c ++++ b/kernel/bounds.c +@@ -22,6 +22,11 @@ int main(void) + DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + #endif + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); ++#ifdef CONFIG_LRU_GEN ++ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1)); ++#else ++ DEFINE(LRU_GEN_WIDTH, 0); ++#endif + /* End of constants */ + + return 0; +diff --git a/mm/Kconfig b/mm/Kconfig +index c048dea7e342..0eeb27397884 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -897,6 +897,14 @@ config IO_MAPPING + config SECRETMEM + def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + ++config LRU_GEN ++ bool "Multi-Gen LRU" ++ depends on MMU ++ # make sure page->flags has enough spare bits ++ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP ++ help ++ A high performance LRU implementation to overcommit memory. ++ + source "mm/damon/Kconfig" + + endmenu +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 98ff57c8eda6..f260ef82f03a 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2366,7 +2366,8 @@ static void __split_huge_page_tail(struct page *head, int tail, + #ifdef CONFIG_64BIT + (1L << PG_arch_2) | + #endif +- (1L << PG_dirty))); ++ (1L << PG_dirty) | ++ LRU_GEN_MASK | LRU_REFS_MASK)); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index b68b2fe639fd..8b634dc72e7f 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5178,6 +5178,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) + + static void mem_cgroup_free(struct mem_cgroup *memcg) + { ++ lru_gen_exit_memcg(memcg); + memcg_wb_domain_exit(memcg); + __mem_cgroup_free(memcg); + } +@@ -5241,6 +5242,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) + memcg->deferred_split_queue.split_queue_len = 0; + #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); ++ lru_gen_init_memcg(memcg); + return memcg; + fail: + mem_cgroup_id_remove(memcg); +diff --git a/mm/memory.c b/mm/memory.c +index 392b7326a2d2..7d5be951de9e 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -4778,6 +4778,27 @@ static inline void mm_account_fault(struct pt_regs *regs, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); + } + ++#ifdef CONFIG_LRU_GEN ++static void lru_gen_enter_fault(struct vm_area_struct *vma) ++{ ++ /* the LRU algorithm doesn't apply to sequential or random reads */ ++ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); ++} ++ ++static void lru_gen_exit_fault(void) ++{ ++ current->in_lru_fault = false; ++} ++#else ++static void lru_gen_enter_fault(struct vm_area_struct *vma) ++{ ++} ++ ++static void lru_gen_exit_fault(void) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + /* + * By the time we get here, we already hold the mm semaphore + * +@@ -4809,11 +4830,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + if (flags & FAULT_FLAG_USER) + mem_cgroup_enter_user_fault(); + ++ lru_gen_enter_fault(vma); ++ + if (unlikely(is_vm_hugetlb_page(vma))) + ret = hugetlb_fault(vma->vm_mm, vma, address, flags); + else + ret = __handle_mm_fault(vma, address, flags); + ++ lru_gen_exit_fault(); ++ + if (flags & FAULT_FLAG_USER) { + mem_cgroup_exit_user_fault(); + /* +diff --git a/mm/mm_init.c b/mm/mm_init.c +index 9ddaf0e1b0ab..0d7b2bd2454a 100644 +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH +- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; ++ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", +- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", ++ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, ++ LRU_GEN_WIDTH, ++ LRU_REFS_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", +diff --git a/mm/mmzone.c b/mm/mmzone.c +index eb89d6e018e2..2ec0d7793424 100644 +--- a/mm/mmzone.c ++++ b/mm/mmzone.c +@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec) + + for_each_lru(lru) + INIT_LIST_HEAD(&lruvec->lists[lru]); ++ ++ lru_gen_init_lruvec(lruvec); + } + + #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) +diff --git a/mm/swap.c b/mm/swap.c +index af3cad4e5378..0bdc96661fb6 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page) + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + ++ /* see the comment in lru_gen_add_page() */ ++ if (lru_gen_enabled() && !PageUnevictable(page) && ++ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) ++ SetPageActive(page); ++ + get_page(page); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); +@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) + + static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) + { +- if (PageActive(page) && !PageUnevictable(page)) { ++ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + int nr_pages = thp_nr_pages(page); + + del_page_from_lru_list(page, lruvec); +@@ -661,7 +666,8 @@ void deactivate_file_page(struct page *page) + */ + void deactivate_page(struct page *page) + { +- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { ++ if (PageLRU(page) && !PageUnevictable(page) && ++ (PageActive(page) || lru_gen_enabled())) { + struct pagevec *pvec; + + local_lock(&lru_pvecs.lock); +diff --git a/mm/vmscan.c b/mm/vmscan.c +index dc5f0381513f..41826fe17eb3 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2821,6 +2821,81 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, + return can_demote(pgdat->node_id, sc); + } + ++#ifdef CONFIG_LRU_GEN ++ ++/****************************************************************************** ++ * shorthand helpers ++ ******************************************************************************/ ++ ++#define for_each_gen_type_zone(gen, type, zone) \ ++ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ ++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ ++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) ++ ++static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid) ++{ ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) { ++ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; ++ ++ /* for hotadd_new_pgdat() */ ++ if (!lruvec->pgdat) ++ lruvec->pgdat = pgdat; ++ ++ return lruvec; ++ } ++#endif ++ VM_WARN_ON_ONCE(!mem_cgroup_disabled()); ++ ++ return pgdat ? &pgdat->__lruvec : NULL; ++} ++ ++/****************************************************************************** ++ * initialization ++ ******************************************************************************/ ++ ++void lru_gen_init_lruvec(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ lrugen->max_seq = MIN_NR_GENS + 1; ++ ++ for_each_gen_type_zone(gen, type, zone) ++ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++} ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++void lru_gen_exit_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, ++ sizeof(lruvec->lrugen.nr_pages))); ++ } ++} ++#endif ++ ++static int __init init_lru_gen(void) ++{ ++ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); ++ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ ++ return 0; ++}; ++late_initcall(init_lru_gen); ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + unsigned long nr[NR_LRU_LISTS]; +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-05-mm-multigenerational-lru-mm_struct-list.patch b/target/linux/generic/backport-5.15/020-v6.1-05-mm-multigenerational-lru-mm_struct-list.patch deleted file mode 100644 index a1c6e0f75d..0000000000 --- a/target/linux/generic/backport-5.15/020-v6.1-05-mm-multigenerational-lru-mm_struct-list.patch +++ /dev/null @@ -1,760 +0,0 @@ -From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001 -From: Yu Zhao <yuzhao@google.com> -Date: Mon, 5 Apr 2021 04:17:41 -0600 -Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list - -To scan PTEs for accessed pages, a mm_struct list is maintained for -each memcg. When multiple threads traverse the same memcg->mm_list, -each of them gets a unique mm_struct and therefore they can run -walk_page_range() concurrently to reach page tables of all processes -of this memcg. - -This infrastructure also provides the following optimizations: - 1) it allows walkers to skip processes that have been sleeping since - the last walk by tracking the usage of mm_struct between context - switches. - 2) it allows walkers to add interesting items they find during a - walk to a Bloom filter so that they can skip uninteresting items - during the next walk by testing whether an item is in this Bloom - filter. - -Signed-off-by: Yu Zhao <yuzhao@google.com> -Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> -Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8 ---- - fs/exec.c | 2 + - include/linux/memcontrol.h | 4 + - include/linux/mm_inline.h | 6 + - include/linux/mm_types.h | 75 +++++++++ - include/linux/mmzone.h | 63 +++++++ - kernel/exit.c | 1 + - kernel/fork.c | 9 + - kernel/sched/core.c | 1 + - mm/memcontrol.c | 25 +++ - mm/vmscan.c | 331 +++++++++++++++++++++++++++++++++++++ - 10 files changed, 517 insertions(+) - ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m - active_mm = tsk->active_mm; - tsk->active_mm = mm; - tsk->mm = mm; -+ lru_gen_add_mm(mm); - /* - * This prevents preemption while active_mm is being loaded and - * it and mm are being updated, which could cause problems for -@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m - if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) - local_irq_enable(); - activate_mm(active_mm, mm); -+ lru_gen_activate_mm(mm); - if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) - local_irq_enable(); - tsk->mm->vmacache_seqnum = 0; ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -348,6 +348,10 @@ struct mem_cgroup { - struct deferred_split deferred_split_queue; - #endif - -+#ifdef CONFIG_LRU_GEN -+ struct lru_gen_mm_list mm_list; -+#endif -+ - struct mem_cgroup_per_node *nodeinfo[]; - }; - ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig - return seq % MAX_NR_GENS; - } - -+/* Return a proper index regardless whether we keep stats for historical generations. */ -+static inline int lru_hist_from_seq(unsigned long seq) -+{ -+ return seq % NR_HIST_GENS; -+} -+ - /* The youngest and the second youngest generations are counted as active. */ - static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) - { ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -3,6 +3,7 @@ - #define _LINUX_MM_TYPES_H - - #include <linux/mm_types_task.h> -+#include <linux/sched.h> - - #include <linux/auxvec.h> - #include <linux/list.h> -@@ -15,6 +16,8 @@ - #include <linux/page-flags-layout.h> - #include <linux/workqueue.h> - #include <linux/seqlock.h> -+#include <linux/nodemask.h> -+#include <linux/mmdebug.h> - - #include <asm/mmu.h> - -@@ -580,6 +583,18 @@ struct mm_struct { - #ifdef CONFIG_IOMMU_SUPPORT - u32 pasid; - #endif -+#ifdef CONFIG_LRU_GEN -+ struct { -+ /* the node of a global or per-memcg mm_struct list */ -+ struct list_head list; -+#ifdef CONFIG_MEMCG -+ /* points to the memcg of the owner task above */ -+ struct mem_cgroup *memcg; -+#endif -+ /* whether this mm_struct has been used since the last walk */ -+ nodemask_t nodes; -+ } lrugen; -+#endif /* CONFIG_LRU_GEN */ - } __randomize_layout; - - /* -@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru - return (struct cpumask *)&mm->cpu_bitmap; - } - -+#ifdef CONFIG_LRU_GEN -+ -+struct lru_gen_mm_list { -+ /* a global or per-memcg mm_struct list */ -+ struct list_head fifo; -+ /* protects the list above */ -+ spinlock_t lock; -+}; -+ -+void lru_gen_add_mm(struct mm_struct *mm); -+void lru_gen_del_mm(struct mm_struct *mm); -+#ifdef CONFIG_MEMCG -+void lru_gen_migrate_mm(struct mm_struct *mm); -+#endif -+ -+static inline void lru_gen_init_mm(struct mm_struct *mm) -+{ -+ INIT_LIST_HEAD(&mm->lrugen.list); -+#ifdef CONFIG_MEMCG -+ mm->lrugen.memcg = NULL; -+#endif -+ nodes_clear(mm->lrugen.nodes); -+} -+ -+/* Track the usage of each mm_struct so that we can skip inactive ones. */ -+static inline void lru_gen_activate_mm(struct mm_struct *mm) -+{ -+ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */ -+ VM_WARN_ON(list_empty(&mm->lrugen.list)); -+ -+ if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes)) -+ nodes_setall(mm->lrugen.nodes); -+} -+ -+#else /* !CONFIG_LRU_GEN */ -+ -+static inline void lru_gen_add_mm(struct mm_struct *mm) -+{ -+} -+ -+static inline void lru_gen_del_mm(struct mm_struct *mm) -+{ -+} -+ -+#ifdef CONFIG_MEMCG -+static inline void lru_gen_migrate_mm(struct mm_struct *mm) -+{ -+} -+#endif -+ -+static inline void lru_gen_init_mm(struct mm_struct *mm) -+{ -+} -+ -+static inline void lru_gen_activate_mm(struct mm_struct *mm) -+{ -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - struct mmu_gather; - extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); - extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -318,6 +318,13 @@ struct lruvec; - #define MIN_NR_GENS 2 - #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) - -+/* Whether to keep stats for historical generations. */ -+#ifdef CONFIG_LRU_GEN_STATS -+#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) -+#else -+#define NR_HIST_GENS 1U -+#endif -+ - struct lrugen { - /* the aging increments the max generation number */ - unsigned long max_seq; -@@ -333,13 +340,63 @@ struct lrugen { - bool enabled[ANON_AND_FILE]; - }; - -+enum { -+ MM_LEAF_TOTAL, /* total leaf entries */ -+ MM_LEAF_OLD, /* old leaf entries */ -+ MM_LEAF_YOUNG, /* young leaf entries */ -+ MM_NONLEAF_TOTAL, /* total non-leaf entries */ -+ MM_NONLEAF_PREV, /* previously worthy non-leaf entries */ -+ MM_NONLEAF_CUR, /* currently worthy non-leaf entries */ -+ NR_MM_STATS -+}; -+ -+/* mnemonic codes for the stats above */ -+#define MM_STAT_CODES "toydpc" -+ -+/* double buffering bloom filters */ -+#define NR_BLOOM_FILTERS 2 -+ -+struct lru_gen_mm_walk { -+ /* set to max_seq after each round of walk */ -+ unsigned long seq; -+ /* the next mm_struct on the list to walk */ -+ struct list_head *head; -+ /* the first mm_struct never walked before */ -+ struct list_head *tail; -+ /* to wait for the last walker to finish */ -+ struct wait_queue_head wait; -+ /* bloom filters flip after each round of walk */ -+ unsigned long *filters[NR_BLOOM_FILTERS]; -+ /* page table stats for debugging */ -+ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; -+ /* the number of concurrent walkers */ -+ int nr_walkers; -+}; -+ -+#define MIN_BATCH_SIZE 64 - #define MAX_BATCH_SIZE 8192 - -+struct mm_walk_args { -+ struct mem_cgroup *memcg; -+ unsigned long max_seq; -+ unsigned long start_pfn; -+ unsigned long end_pfn; -+ unsigned long next_addr; -+ unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)]; -+ int node_id; -+ int swappiness; -+ int batch_size; -+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ int mm_stats[NR_MM_STATS]; -+ bool use_filter; -+}; -+ - void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); - void lru_gen_change_state(bool enable, bool main, bool swap); - - #ifdef CONFIG_MEMCG - void lru_gen_init_memcg(struct mem_cgroup *memcg); -+void lru_gen_free_memcg(struct mem_cgroup *memcg); - #endif - - #else /* !CONFIG_LRU_GEN */ -@@ -356,6 +413,10 @@ static inline void lru_gen_change_state( - static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) - { - } -+ -+static inline void lru_gen_free_memcg(struct mem_cgroup *memcg) -+{ -+} - #endif - - #endif /* CONFIG_LRU_GEN */ -@@ -380,6 +441,8 @@ struct lruvec { - #ifdef CONFIG_LRU_GEN - /* unevictable pages are on LRU_UNEVICTABLE */ - struct lrugen evictable; -+ /* state for mm list and page table walks */ -+ struct lru_gen_mm_walk mm_walk; - #endif - #ifdef CONFIG_MEMCG - struct pglist_data *pgdat; ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -469,6 +469,7 @@ assign_new_owner: - goto retry; - } - WRITE_ONCE(mm->owner, c); -+ lru_gen_migrate_mm(mm); - task_unlock(c); - put_task_struct(c); - } ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -1083,6 +1083,7 @@ static struct mm_struct *mm_init(struct - goto fail_nocontext; - - mm->user_ns = get_user_ns(user_ns); -+ lru_gen_init_mm(mm); - return mm; - - fail_nocontext: -@@ -1125,6 +1126,7 @@ static inline void __mmput(struct mm_str - } - if (mm->binfmt) - module_put(mm->binfmt->module); -+ lru_gen_del_mm(mm); - mmdrop(mm); - } - -@@ -2622,6 +2624,13 @@ pid_t kernel_clone(struct kernel_clone_a - get_task_struct(p); - } - -+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { -+ /* lock the task to synchronize with memcg migration */ -+ task_lock(p); -+ lru_gen_add_mm(p->mm); -+ task_unlock(p); -+ } -+ - wake_up_new_task(p); - - /* forking complete and child started to run, tell ptracer */ ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -5007,6 +5007,7 @@ context_switch(struct rq *rq, struct tas - * finish_task_switch()'s mmdrop(). - */ - switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ lru_gen_activate_mm(next->mm); - - if (!prev->mm) { // from kernel - /* will mmdrop() in finish_task_switch(). */ ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -5178,6 +5178,7 @@ static void __mem_cgroup_free(struct mem - - static void mem_cgroup_free(struct mem_cgroup *memcg) - { -+ lru_gen_free_memcg(memcg); - memcg_wb_domain_exit(memcg); - __mem_cgroup_free(memcg); - } -@@ -6210,6 +6211,29 @@ static void mem_cgroup_move_task(void) - } - #endif - -+#ifdef CONFIG_LRU_GEN -+static void mem_cgroup_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *task = NULL; -+ -+ cgroup_taskset_for_each_leader(task, css, tset) -+ break; -+ -+ if (!task) -+ return; -+ -+ task_lock(task); -+ if (task->mm && task->mm->owner == task) -+ lru_gen_migrate_mm(task->mm); -+ task_unlock(task); -+} -+#else -+static void mem_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+#endif /* CONFIG_LRU_GEN */ -+ - static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) - { - if (value == PAGE_COUNTER_MAX) -@@ -6553,6 +6577,7 @@ struct cgroup_subsys memory_cgrp_subsys - .css_reset = mem_cgroup_css_reset, - .css_rstat_flush = mem_cgroup_css_rstat_flush, - .can_attach = mem_cgroup_can_attach, -+ .attach = mem_cgroup_attach, - .cancel_attach = mem_cgroup_cancel_attach, - .post_attach = mem_cgroup_move_task, - .dfl_cftypes = memory_files, ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -2864,6 +2864,306 @@ static bool __maybe_unused seq_is_valid( - } - - /****************************************************************************** -+ * mm_struct list -+ ******************************************************************************/ -+ -+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) -+{ -+ static struct lru_gen_mm_list mm_list = { -+ .fifo = LIST_HEAD_INIT(mm_list.fifo), -+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), -+ }; -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) -+ return &memcg->mm_list; -+#endif -+ return &mm_list; -+} -+ -+void lru_gen_add_mm(struct mm_struct *mm) -+{ -+ int nid; -+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); -+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); -+ -+ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm); -+#ifdef CONFIG_MEMCG -+ VM_BUG_ON_MM(mm->lrugen.memcg, mm); -+ mm->lrugen.memcg = memcg; -+#endif -+ spin_lock(&mm_list->lock); -+ -+ list_add_tail(&mm->lrugen.list, &mm_list->fifo); -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ if (!lruvec) -+ continue; -+ -+ if (lruvec->mm_walk.tail == &mm_list->fifo) -+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev; -+ } -+ -+ spin_unlock(&mm_list->lock); -+} -+ -+void lru_gen_del_mm(struct mm_struct *mm) -+{ -+ int nid; -+ struct lru_gen_mm_list *mm_list; -+ struct mem_cgroup *memcg = NULL; -+ -+ if (list_empty(&mm->lrugen.list)) -+ return; -+ -+#ifdef CONFIG_MEMCG -+ memcg = mm->lrugen.memcg; -+#endif -+ mm_list = get_mm_list(memcg); -+ -+ spin_lock(&mm_list->lock); -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ if (!lruvec) -+ continue; -+ -+ if (lruvec->mm_walk.tail == &mm->lrugen.list) -+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->next; -+ -+ if (lruvec->mm_walk.head != &mm->lrugen.list) -+ continue; -+ -+ lruvec->mm_walk.head = lruvec->mm_walk.head->next; -+ if (lruvec->mm_walk.head == &mm_list->fifo) -+ WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1); -+ } -+ -+ list_del_init(&mm->lrugen.list); -+ -+ spin_unlock(&mm_list->lock); -+ -+#ifdef CONFIG_MEMCG -+ mem_cgroup_put(mm->lrugen.memcg); -+ mm->lrugen.memcg = NULL; -+#endif -+} -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_migrate_mm(struct mm_struct *mm) -+{ -+ struct mem_cgroup *memcg; -+ -+ lockdep_assert_held(&mm->owner->alloc_lock); -+ -+ if (mem_cgroup_disabled()) -+ return; -+ -+ rcu_read_lock(); -+ memcg = mem_cgroup_from_task(mm->owner); -+ rcu_read_unlock(); -+ if (memcg == mm->lrugen.memcg) -+ return; -+ -+ VM_BUG_ON_MM(!mm->lrugen.memcg, mm); -+ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm); -+ -+ lru_gen_del_mm(mm); -+ lru_gen_add_mm(mm); -+} -+#endif -+ -+#define BLOOM_FILTER_SHIFT 15 -+ -+static inline int filter_gen_from_seq(unsigned long seq) -+{ -+ return seq % NR_BLOOM_FILTERS; -+} -+ -+static void get_item_key(void *item, int *key) -+{ -+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); -+ -+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); -+ -+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); -+ key[1] = hash >> BLOOM_FILTER_SHIFT; -+} -+ -+static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq) -+{ -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); -+ -+ filter = lruvec->mm_walk.filters[gen]; -+ if (filter) { -+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); -+ return; -+ } -+ -+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC); -+ WRITE_ONCE(lruvec->mm_walk.filters[gen], filter); -+} -+ -+static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); -+ if (!filter) -+ return; -+ -+ get_item_key(item, key); -+ -+ if (!test_bit(key[0], filter)) -+ set_bit(key[0], filter); -+ if (!test_bit(key[1], filter)) -+ set_bit(key[1], filter); -+} -+ -+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); -+ if (!filter) -+ return false; -+ -+ get_item_key(item, key); -+ -+ return test_bit(key[0], filter) && test_bit(key[1], filter); -+} -+ -+static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args) -+{ -+ int i; -+ int hist = lru_hist_from_seq(args->max_seq); -+ -+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); -+ -+ for (i = 0; i < NR_MM_STATS; i++) { -+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], -+ lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]); -+ args->mm_stats[i] = 0; -+ } -+ -+ if (!last || NR_HIST_GENS == 1) -+ return; -+ -+ hist = lru_hist_from_seq(args->max_seq + 1); -+ for (i = 0; i < NR_MM_STATS; i++) -+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0); -+} -+ -+static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args) -+{ -+ int type; -+ unsigned long size = 0; -+ -+ if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes)) -+ return true; -+ -+ if (mm_is_oom_victim(mm)) -+ return true; -+ -+ for (type = !args->swappiness; type < ANON_AND_FILE; type++) { -+ size += type ? get_mm_counter(mm, MM_FILEPAGES) : -+ get_mm_counter(mm, MM_ANONPAGES) + -+ get_mm_counter(mm, MM_SHMEMPAGES); -+ } -+ -+ if (size < MIN_BATCH_SIZE) -+ return true; -+ -+ if (!mmget_not_zero(mm)) -+ return true; -+ -+ node_clear(args->node_id, mm->lrugen.nodes); -+ -+ return false; -+} -+ -+/* To support multiple walkers that concurrently walk an mm_struct list. */ -+static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args, -+ struct mm_struct **iter) -+{ -+ bool first = false; -+ bool last = true; -+ struct mm_struct *mm = NULL; -+ struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk; -+ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg); -+ -+ if (*iter) -+ mmput_async(*iter); -+ else if (args->max_seq <= READ_ONCE(mm_walk->seq)) -+ return false; -+ -+ spin_lock(&mm_list->lock); -+ -+ VM_BUG_ON(args->max_seq > mm_walk->seq + 1); -+ VM_BUG_ON(*iter && args->max_seq < mm_walk->seq); -+ VM_BUG_ON(*iter && !mm_walk->nr_walkers); -+ -+ if (args->max_seq <= mm_walk->seq) { -+ if (!*iter) -+ last = false; -+ goto done; -+ } -+ -+ if (mm_walk->head == &mm_list->fifo) { -+ VM_BUG_ON(mm_walk->nr_walkers); -+ mm_walk->head = mm_walk->head->next; -+ first = true; -+ } -+ -+ while (!mm && mm_walk->head != &mm_list->fifo) { -+ mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list); -+ -+ mm_walk->head = mm_walk->head->next; -+ -+ if (mm_walk->tail == &mm->lrugen.list) { -+ mm_walk->tail = mm_walk->tail->next; -+ args->use_filter = false; -+ } -+ -+ if (should_skip_mm(mm, args)) -+ mm = NULL; -+ } -+ -+ if (mm_walk->head == &mm_list->fifo) -+ WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1); -+done: -+ if (*iter && !mm) -+ mm_walk->nr_walkers--; -+ if (!*iter && mm) -+ mm_walk->nr_walkers++; -+ -+ if (mm_walk->nr_walkers) -+ last = false; -+ -+ if (mm && first) -+ clear_bloom_filter(lruvec, args->max_seq + 1); -+ -+ if (*iter || last) -+ reset_mm_stats(lruvec, last, args); -+ -+ spin_unlock(&mm_list->lock); -+ -+ *iter = mm; -+ -+ return last; -+} -+ -+/****************************************************************************** - * state change - ******************************************************************************/ - -@@ -3047,6 +3347,7 @@ void lru_gen_init_state(struct mem_cgrou - int i; - int gen, type, zone; - struct lrugen *lrugen = &lruvec->evictable; -+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - - lrugen->max_seq = MIN_NR_GENS + 1; - lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; -@@ -3057,6 +3358,17 @@ void lru_gen_init_state(struct mem_cgrou - - for_each_gen_type_zone(gen, type, zone) - INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); -+ -+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) -+ spin_lock(&mm_list->lock); -+ -+ lruvec->mm_walk.seq = MIN_NR_GENS; -+ lruvec->mm_walk.head = &mm_list->fifo; -+ lruvec->mm_walk.tail = &mm_list->fifo; -+ init_waitqueue_head(&lruvec->mm_walk.wait); -+ -+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) -+ spin_unlock(&mm_list->lock); - } - - #ifdef CONFIG_MEMCG -@@ -3064,18 +3376,37 @@ void lru_gen_init_memcg(struct mem_cgrou - { - int nid; - -+ INIT_LIST_HEAD(&memcg->mm_list.fifo); -+ spin_lock_init(&memcg->mm_list.lock); -+ - for_each_node(nid) { - struct lruvec *lruvec = get_lruvec(nid, memcg); - - lru_gen_init_state(memcg, lruvec); - } - } -+ -+void lru_gen_free_memcg(struct mem_cgroup *memcg) -+{ -+ int nid; -+ -+ for_each_node(nid) { -+ int i; -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ for (i = 0; i < NR_BLOOM_FILTERS; i++) { -+ bitmap_free(lruvec->mm_walk.filters[i]); -+ lruvec->mm_walk.filters[i] = NULL; -+ } -+ } -+} - #endif - - static int __init init_lru_gen(void) - { - BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); - BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); -+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); - - return 0; - }; diff --git a/target/linux/generic/backport-5.15/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch b/target/linux/generic/backport-5.15/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch new file mode 100644 index 0000000000..1e310ae211 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch @@ -0,0 +1,1466 @@ +From b564b9471cd60ef1ee3961a224898ce4a9620d84 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 18 Sep 2022 02:00:03 -0600 +Subject: [PATCH 06/29] mm: multi-gen LRU: minimal implementation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +To avoid confusion, the terms "promotion" and "demotion" will be applied +to the multi-gen LRU, as a new convention; the terms "activation" and +"deactivation" will be applied to the active/inactive LRU, as usual. + +The aging produces young generations. Given an lruvec, it increments +max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS. The aging promotes +hot pages to the youngest generation when it finds them accessed through +page tables; the demotion of cold pages happens consequently when it +increments max_seq. Promotion in the aging path does not involve any LRU +list operations, only the updates of the gen counter and +lrugen->nr_pages[]; demotion, unless as the result of the increment of +max_seq, requires LRU list operations, e.g., lru_deactivate_fn(). The +aging has the complexity O(nr_hot_pages), since it is only interested in +hot pages. + +The eviction consumes old generations. Given an lruvec, it increments +min_seq when lrugen->lists[] indexed by min_seq%MAX_NR_GENS becomes empty. +A feedback loop modeled after the PID controller monitors refaults over +anon and file types and decides which type to evict when both types are +available from the same generation. + +The protection of pages accessed multiple times through file descriptors +takes place in the eviction path. Each generation is divided into +multiple tiers. A page accessed N times through file descriptors is in +tier order_base_2(N). Tiers do not have dedicated lrugen->lists[], only +bits in page->flags. The aforementioned feedback loop also monitors +refaults over all tiers and decides when to protect pages in which tiers +(N>1), using the first tier (N=0,1) as a baseline. The first tier +contains single-use unmapped clean pages, which are most likely the best +choices. In contrast to promotion in the aging path, the protection of a +page in the eviction path is achieved by moving this page to the next +generation, i.e., min_seq+1, if the feedback loop decides so. This +approach has the following advantages: + +1. It removes the cost of activation in the buffered access path by + inferring whether pages accessed multiple times through file + descriptors are statistically hot and thus worth protecting in the + eviction path. +2. It takes pages accessed through page tables into account and avoids + overprotecting pages accessed multiple times through file + descriptors. (Pages accessed through page tables are in the first + tier, since N=0.) +3. More tiers provide better protection for pages accessed more than + twice through file descriptors, when under heavy buffered I/O + workloads. + +Server benchmark results: + Single workload: + fio (buffered I/O): +[30, 32]% + IOPS BW + 5.19-rc1: 2673k 10.2GiB/s + patch1-6: 3491k 13.3GiB/s + + Single workload: + memcached (anon): -[4, 6]% + Ops/sec KB/sec + 5.19-rc1: 1161501.04 45177.25 + patch1-6: 1106168.46 43025.04 + + Configurations: + CPU: two Xeon 6154 + Mem: total 256G + + Node 1 was only used as a ram disk to reduce the variance in the + results. + + patch drivers/block/brd.c <<EOF + 99,100c99,100 + < gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM; + < page = alloc_page(gfp_flags); + --- + > gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE; + > page = alloc_pages_node(1, gfp_flags, 0); + EOF + + cat >>/etc/systemd/system.conf <<EOF + CPUAffinity=numa + NUMAPolicy=bind + NUMAMask=0 + EOF + + cat >>/etc/memcached.conf <<EOF + -m 184320 + -s /var/run/memcached/memcached.sock + -a 0766 + -t 36 + -B binary + EOF + + cat fio.sh + modprobe brd rd_nr=1 rd_size=113246208 + swapoff -a + mkfs.ext4 /dev/ram0 + mount -t ext4 /dev/ram0 /mnt + + mkdir /sys/fs/cgroup/user.slice/test + echo 38654705664 >/sys/fs/cgroup/user.slice/test/memory.max + echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs + fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \ + --buffered=1 --ioengine=io_uring --iodepth=128 \ + --iodepth_batch_submit=32 --iodepth_batch_complete=32 \ + --rw=randread --random_distribution=random --norandommap \ + --time_based --ramp_time=10m --runtime=5m --group_reporting + + cat memcached.sh + modprobe brd rd_nr=1 rd_size=113246208 + swapoff -a + mkswap /dev/ram0 + swapon /dev/ram0 + + memtier_benchmark -S /var/run/memcached/memcached.sock \ + -P memcache_binary -n allkeys --key-minimum=1 \ + --key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \ + --ratio 1:0 --pipeline 8 -d 2000 + + memtier_benchmark -S /var/run/memcached/memcached.sock \ + -P memcache_binary -n allkeys --key-minimum=1 \ + --key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \ + --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed + +Client benchmark results: + kswapd profiles: + 5.19-rc1 + 40.33% page_vma_mapped_walk (overhead) + 21.80% lzo1x_1_do_compress (real work) + 7.53% do_raw_spin_lock + 3.95% _raw_spin_unlock_irq + 2.52% vma_interval_tree_iter_next + 2.37% page_referenced_one + 2.28% vma_interval_tree_subtree_search + 1.97% anon_vma_interval_tree_iter_first + 1.60% ptep_clear_flush + 1.06% __zram_bvec_write + + patch1-6 + 39.03% lzo1x_1_do_compress (real work) + 18.47% page_vma_mapped_walk (overhead) + 6.74% _raw_spin_unlock_irq + 3.97% do_raw_spin_lock + 2.49% ptep_clear_flush + 2.48% anon_vma_interval_tree_iter_first + 1.92% page_referenced_one + 1.88% __zram_bvec_write + 1.48% memmove + 1.31% vma_interval_tree_iter_next + + Configurations: + CPU: single Snapdragon 7c + Mem: total 4G + + ChromeOS MemoryPressure [1] + +[1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/ + +Link: https://lkml.kernel.org/r/20220918080010.2920238-7-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Barry Song <baohua@kernel.org> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/mm_inline.h | 36 ++ + include/linux/mmzone.h | 41 ++ + include/linux/page-flags-layout.h | 5 +- + kernel/bounds.c | 2 + + mm/Kconfig | 11 + + mm/swap.c | 39 ++ + mm/vmscan.c | 792 +++++++++++++++++++++++++++++- + mm/workingset.c | 110 ++++- + 8 files changed, 1025 insertions(+), 11 deletions(-) + +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index 65320d2b8f60..58aabb1ba020 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -106,6 +106,33 @@ static inline int lru_gen_from_seq(unsigned long seq) + return seq % MAX_NR_GENS; + } + ++static inline int lru_hist_from_seq(unsigned long seq) ++{ ++ return seq % NR_HIST_GENS; ++} ++ ++static inline int lru_tier_from_refs(int refs) ++{ ++ VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); ++ ++ /* see the comment in page_lru_refs() */ ++ return order_base_2(refs + 1); ++} ++ ++static inline int page_lru_refs(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ bool workingset = flags & BIT(PG_workingset); ++ ++ /* ++ * Return the number of accesses beyond PG_referenced, i.e., N-1 if the ++ * total number of accesses is N>1, since N=0,1 both map to the first ++ * tier. lru_tier_from_refs() will account for this off-by-one. Also see ++ * the comment on MAX_NR_TIERS. ++ */ ++ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; ++} ++ + static inline int page_lru_gen(struct page *page) + { + unsigned long flags = READ_ONCE(page->flags); +@@ -158,6 +185,15 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page, + __update_lru_size(lruvec, lru, zone, -delta); + return; + } ++ ++ /* promotion */ ++ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { ++ __update_lru_size(lruvec, lru, zone, -delta); ++ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); ++ } ++ ++ /* demotion requires isolation, e.g., lru_deactivate_fn() */ ++ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); + } + + static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming) +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 0c39f72184d0..fce8945c507c 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -327,6 +327,28 @@ enum lruvec_flags { + #define MIN_NR_GENS 2U + #define MAX_NR_GENS 4U + ++/* ++ * Each generation is divided into multiple tiers. A page accessed N times ++ * through file descriptors is in tier order_base_2(N). A page in the first tier ++ * (N=0,1) is marked by PG_referenced unless it was faulted in through page ++ * tables or read ahead. A page in any other tier (N>1) is marked by ++ * PG_referenced and PG_workingset. This implies a minimum of two tiers is ++ * supported without using additional bits in page->flags. ++ * ++ * In contrast to moving across generations which requires the LRU lock, moving ++ * across tiers only involves atomic operations on page->flags and therefore ++ * has a negligible cost in the buffered access path. In the eviction path, ++ * comparisons of refaulted/(evicted+protected) from the first tier and the ++ * rest infer whether pages accessed multiple times through file descriptors ++ * are statistically hot and thus worth protecting. ++ * ++ * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the ++ * number of categories of the active/inactive LRU when keeping track of ++ * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in ++ * page->flags. ++ */ ++#define MAX_NR_TIERS 4U ++ + #ifndef __GENERATING_BOUNDS_H + + struct lruvec; +@@ -341,6 +363,16 @@ enum { + LRU_GEN_FILE, + }; + ++#define MIN_LRU_BATCH BITS_PER_LONG ++#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) ++ ++/* whether to keep historical stats from evicted generations */ ++#ifdef CONFIG_LRU_GEN_STATS ++#define NR_HIST_GENS MAX_NR_GENS ++#else ++#define NR_HIST_GENS 1U ++#endif ++ + /* + * The youngest generation number is stored in max_seq for both anon and file + * types as they are aged on an equal footing. The oldest generation numbers are +@@ -363,6 +395,15 @@ struct lru_gen_struct { + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the multi-gen LRU sizes, eventually consistent */ + long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the exponential moving average of refaulted */ ++ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the exponential moving average of evicted+protected */ ++ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the first tier doesn't need protection, hence the minus one */ ++ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; ++ /* can be modified without holding the LRU lock */ ++ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + }; + + void lru_gen_init_lruvec(struct lruvec *lruvec); +diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h +index 240905407a18..7d79818dc065 100644 +--- a/include/linux/page-flags-layout.h ++++ b/include/linux/page-flags-layout.h +@@ -106,7 +106,10 @@ + #error "Not enough bits in page flags" + #endif + +-#define LRU_REFS_WIDTH 0 ++/* see the comment on MAX_NR_TIERS */ ++#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \ ++ ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \ ++ NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH) + + #endif + #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ +diff --git a/kernel/bounds.c b/kernel/bounds.c +index 5ee60777d8e4..b529182e8b04 100644 +--- a/kernel/bounds.c ++++ b/kernel/bounds.c +@@ -24,8 +24,10 @@ int main(void) + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); + #ifdef CONFIG_LRU_GEN + DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1)); ++ DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2); + #else + DEFINE(LRU_GEN_WIDTH, 0); ++ DEFINE(__LRU_REFS_WIDTH, 0); + #endif + /* End of constants */ + +diff --git a/mm/Kconfig b/mm/Kconfig +index 0eeb27397884..62433f3cd7ae 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -897,6 +897,7 @@ config IO_MAPPING + config SECRETMEM + def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + ++# multi-gen LRU { + config LRU_GEN + bool "Multi-Gen LRU" + depends on MMU +@@ -905,6 +906,16 @@ config LRU_GEN + help + A high performance LRU implementation to overcommit memory. + ++config LRU_GEN_STATS ++ bool "Full stats for debugging" ++ depends on LRU_GEN ++ help ++ Do not enable this option unless you plan to look at historical stats ++ from evicted generations for debugging purpose. ++ ++ This option has a per-memcg and per-node memory overhead. ++# } ++ + source "mm/damon/Kconfig" + + endmenu +diff --git a/mm/swap.c b/mm/swap.c +index 0bdc96661fb6..5d227577b609 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -389,6 +389,40 @@ static void __lru_cache_activate_page(struct page *page) + local_unlock(&lru_pvecs.lock); + } + ++#ifdef CONFIG_LRU_GEN ++static void page_inc_refs(struct page *page) ++{ ++ unsigned long new_flags, old_flags = READ_ONCE(page->flags); ++ ++ if (PageUnevictable(page)) ++ return; ++ ++ if (!PageReferenced(page)) { ++ SetPageReferenced(page); ++ return; ++ } ++ ++ if (!PageWorkingset(page)) { ++ SetPageWorkingset(page); ++ return; ++ } ++ ++ /* see the comment on MAX_NR_TIERS */ ++ do { ++ new_flags = old_flags & LRU_REFS_MASK; ++ if (new_flags == LRU_REFS_MASK) ++ break; ++ ++ new_flags += BIT(LRU_REFS_PGOFF); ++ new_flags |= old_flags & ~LRU_REFS_MASK; ++ } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); ++} ++#else ++static void page_inc_refs(struct page *page) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + /* + * Mark a page as having seen activity. + * +@@ -403,6 +437,11 @@ void mark_page_accessed(struct page *page) + { + page = compound_head(page); + ++ if (lru_gen_enabled()) { ++ page_inc_refs(page); ++ return; ++ } ++ + if (!PageReferenced(page)) { + SetPageReferenced(page); + } else if (PageUnevictable(page)) { +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 41826fe17eb3..932abd24c1b3 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1142,9 +1142,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; +- mem_cgroup_swapout(page, swap); ++ ++ /* get a shadow entry before mem_cgroup_swapout() clears page_memcg() */ + if (reclaimed && !mapping_exiting(mapping)) + shadow = workingset_eviction(page, target_memcg); ++ mem_cgroup_swapout(page, swap); + __delete_from_swap_cache(page, swap, shadow); + xa_unlock_irq(&mapping->i_pages); + put_swap_page(page, swap); +@@ -2502,6 +2504,9 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) + unsigned long file; + struct lruvec *target_lruvec; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* +@@ -2827,6 +2832,17 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, + * shorthand helpers + ******************************************************************************/ + ++#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) ++ ++#define DEFINE_MAX_SEQ(lruvec) \ ++ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) ++ ++#define DEFINE_MIN_SEQ(lruvec) \ ++ unsigned long min_seq[ANON_AND_FILE] = { \ ++ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ ++ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ ++ } ++ + #define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ +@@ -2852,6 +2868,745 @@ static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int ni + return pgdat ? &pgdat->__lruvec : NULL; + } + ++static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ if (!can_demote(pgdat->node_id, sc) && ++ mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) ++ return 0; ++ ++ return mem_cgroup_swappiness(memcg); ++} ++ ++static int get_nr_gens(struct lruvec *lruvec, int type) ++{ ++ return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; ++} ++ ++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) ++{ ++ /* see the comment on lru_gen_struct */ ++ return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && ++ get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && ++ get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; ++} ++ ++/****************************************************************************** ++ * refault feedback loop ++ ******************************************************************************/ ++ ++/* ++ * A feedback loop based on Proportional-Integral-Derivative (PID) controller. ++ * ++ * The P term is refaulted/(evicted+protected) from a tier in the generation ++ * currently being evicted; the I term is the exponential moving average of the ++ * P term over the generations previously evicted, using the smoothing factor ++ * 1/2; the D term isn't supported. ++ * ++ * The setpoint (SP) is always the first tier of one type; the process variable ++ * (PV) is either any tier of the other type or any other tier of the same ++ * type. ++ * ++ * The error is the difference between the SP and the PV; the correction is to ++ * turn off protection when SP>PV or turn on protection when SP<PV. ++ * ++ * For future optimizations: ++ * 1. The D term may discount the other two terms over time so that long-lived ++ * generations can resist stale information. ++ */ ++struct ctrl_pos { ++ unsigned long refaulted; ++ unsigned long total; ++ int gain; ++}; ++ ++static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, ++ struct ctrl_pos *pos) ++{ ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ int hist = lru_hist_from_seq(lrugen->min_seq[type]); ++ ++ pos->refaulted = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ pos->total = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ pos->total += lrugen->protected[hist][type][tier - 1]; ++ pos->gain = gain; ++} ++ ++static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) ++{ ++ int hist, tier; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; ++ unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; ++ ++ lockdep_assert_held(&lruvec->lru_lock); ++ ++ if (!carryover && !clear) ++ return; ++ ++ hist = lru_hist_from_seq(seq); ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ if (carryover) { ++ unsigned long sum; ++ ++ sum = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); ++ ++ sum = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ sum += lrugen->protected[hist][type][tier - 1]; ++ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); ++ } ++ ++ if (clear) { ++ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); ++ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); ++ if (tier) ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); ++ } ++ } ++} ++ ++static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) ++{ ++ /* ++ * Return true if the PV has a limited number of refaults or a lower ++ * refaulted/total than the SP. ++ */ ++ return pv->refaulted < MIN_LRU_BATCH || ++ pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= ++ (sp->refaulted + 1) * pv->total * pv->gain; ++} ++ ++/****************************************************************************** ++ * the aging ++ ******************************************************************************/ ++ ++/* protect pages accessed multiple times through file descriptors */ ++static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming) ++{ ++ int type = page_is_file_lru(page); ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ unsigned long new_flags, old_flags = READ_ONCE(page->flags); ++ ++ VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page); ++ ++ do { ++ new_gen = (old_gen + 1) % MAX_NR_GENS; ++ ++ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); ++ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; ++ /* for end_page_writeback() */ ++ if (reclaiming) ++ new_flags |= BIT(PG_reclaim); ++ } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); ++ ++ lru_gen_update_size(lruvec, page, old_gen, new_gen); ++ ++ return new_gen; ++} ++ ++static void inc_min_seq(struct lruvec *lruvec, int type) ++{ ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ reset_ctrl_pos(lruvec, type, true); ++ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); ++} ++ ++static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) ++{ ++ int gen, type, zone; ++ bool success = false; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); ++ ++ /* find the oldest populated generation */ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { ++ gen = lru_gen_from_seq(min_seq[type]); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ if (!list_empty(&lrugen->lists[gen][type][zone])) ++ goto next; ++ } ++ ++ min_seq[type]++; ++ } ++next: ++ ; ++ } ++ ++ /* see the comment on lru_gen_struct */ ++ if (can_swap) { ++ min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); ++ min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); ++ } ++ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ if (min_seq[type] == lrugen->min_seq[type]) ++ continue; ++ ++ reset_ctrl_pos(lruvec, type, true); ++ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); ++ success = true; ++ } ++ ++ return success; ++} ++ ++static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap) ++{ ++ int prev, next; ++ int type, zone; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); ++ ++ if (max_seq != lrugen->max_seq) ++ goto unlock; ++ ++ for (type = ANON_AND_FILE - 1; type >= 0; type--) { ++ if (get_nr_gens(lruvec, type) != MAX_NR_GENS) ++ continue; ++ ++ VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap); ++ ++ inc_min_seq(lruvec, type); ++ } ++ ++ /* ++ * Update the active/inactive LRU sizes for compatibility. Both sides of ++ * the current max_seq need to be covered, since max_seq+1 can overlap ++ * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do ++ * overlap, cold/hot inversion happens. ++ */ ++ prev = lru_gen_from_seq(lrugen->max_seq - 1); ++ next = lru_gen_from_seq(lrugen->max_seq + 1); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ enum lru_list lru = type * LRU_INACTIVE_FILE; ++ long delta = lrugen->nr_pages[prev][type][zone] - ++ lrugen->nr_pages[next][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ __update_lru_size(lruvec, lru, zone, delta); ++ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); ++ } ++ } ++ ++ for (type = 0; type < ANON_AND_FILE; type++) ++ reset_ctrl_pos(lruvec, type, false); ++ ++ /* make sure preceding modifications appear */ ++ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); ++unlock: ++ spin_unlock_irq(&lruvec->lru_lock); ++} ++ ++static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, ++ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) ++{ ++ int gen, type, zone; ++ unsigned long old = 0; ++ unsigned long young = 0; ++ unsigned long total = 0; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ unsigned long seq; ++ ++ for (seq = min_seq[type]; seq <= max_seq; seq++) { ++ unsigned long size = 0; ++ ++ gen = lru_gen_from_seq(seq); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); ++ ++ total += size; ++ if (seq == max_seq) ++ young += size; ++ else if (seq + MIN_NR_GENS == max_seq) ++ old += size; ++ } ++ } ++ ++ /* try to scrape all its memory if this memcg was deleted */ ++ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; ++ ++ /* ++ * The aging tries to be lazy to reduce the overhead, while the eviction ++ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the ++ * ideal number of generations is MIN_NR_GENS+1. ++ */ ++ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) ++ return true; ++ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) ++ return false; ++ ++ /* ++ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) ++ * of the total number of pages for each generation. A reasonable range ++ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The ++ * aging cares about the upper bound of hot pages, while the eviction ++ * cares about the lower bound of cold pages. ++ */ ++ if (young * MIN_NR_GENS > total) ++ return true; ++ if (old * (MIN_NR_GENS + 2) < total) ++ return true; ++ ++ return false; ++} ++ ++static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ bool need_aging; ++ unsigned long nr_to_scan; ++ int swappiness = get_swappiness(lruvec, sc); ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_WARN_ON_ONCE(sc->memcg_low_reclaim); ++ ++ mem_cgroup_calculate_protection(NULL, memcg); ++ ++ if (mem_cgroup_below_min(memcg)) ++ return; ++ ++ need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); ++ if (need_aging) ++ inc_max_seq(lruvec, max_seq, swappiness); ++} ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ struct mem_cgroup *memcg; ++ ++ VM_WARN_ON_ONCE(!current_is_kswapd()); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ ++ age_lruvec(lruvec, sc); ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++} ++ ++/****************************************************************************** ++ * the eviction ++ ******************************************************************************/ ++ ++static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx) ++{ ++ bool success; ++ int gen = page_lru_gen(page); ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ int refs = page_lru_refs(page); ++ int tier = lru_tier_from_refs(refs); ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page); ++ ++ /* unevictable */ ++ if (!page_evictable(page)) { ++ success = lru_gen_del_page(lruvec, page, true); ++ VM_WARN_ON_ONCE_PAGE(!success, page); ++ SetPageUnevictable(page); ++ add_page_to_lru_list(page, lruvec); ++ __count_vm_events(UNEVICTABLE_PGCULLED, delta); ++ return true; ++ } ++ ++ /* dirty lazyfree */ ++ if (type == LRU_GEN_FILE && PageAnon(page) && PageDirty(page)) { ++ success = lru_gen_del_page(lruvec, page, true); ++ VM_WARN_ON_ONCE_PAGE(!success, page); ++ SetPageSwapBacked(page); ++ add_page_to_lru_list_tail(page, lruvec); ++ return true; ++ } ++ ++ /* protected */ ++ if (tier > tier_idx) { ++ int hist = lru_hist_from_seq(lrugen->min_seq[type]); ++ ++ gen = page_inc_gen(lruvec, page, false); ++ list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]); ++ ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], ++ lrugen->protected[hist][type][tier - 1] + delta); ++ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); ++ return true; ++ } ++ ++ /* waiting for writeback */ ++ if (PageLocked(page) || PageWriteback(page) || ++ (type == LRU_GEN_FILE && PageDirty(page))) { ++ gen = page_inc_gen(lruvec, page, true); ++ list_move(&page->lru, &lrugen->lists[gen][type][zone]); ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool isolate_page(struct lruvec *lruvec, struct page *page, struct scan_control *sc) ++{ ++ bool success; ++ ++ /* unmapping inhibited */ ++ if (!sc->may_unmap && page_mapped(page)) ++ return false; ++ ++ /* swapping inhibited */ ++ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && ++ (PageDirty(page) || ++ (PageAnon(page) && !PageSwapCache(page)))) ++ return false; ++ ++ /* raced with release_pages() */ ++ if (!get_page_unless_zero(page)) ++ return false; ++ ++ /* raced with another isolation */ ++ if (!TestClearPageLRU(page)) { ++ put_page(page); ++ return false; ++ } ++ ++ /* see the comment on MAX_NR_TIERS */ ++ if (!PageReferenced(page)) ++ set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); ++ ++ /* for shrink_page_list() */ ++ ClearPageReclaim(page); ++ ClearPageReferenced(page); ++ ++ success = lru_gen_del_page(lruvec, page, true); ++ VM_WARN_ON_ONCE_PAGE(!success, page); ++ ++ return true; ++} ++ ++static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, ++ int type, int tier, struct list_head *list) ++{ ++ int gen, zone; ++ enum vm_event_item item; ++ int sorted = 0; ++ int scanned = 0; ++ int isolated = 0; ++ int remaining = MAX_LRU_BATCH; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ VM_WARN_ON_ONCE(!list_empty(list)); ++ ++ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) ++ return 0; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = sc->reclaim_idx; zone >= 0; zone--) { ++ LIST_HEAD(moved); ++ int skipped = 0; ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ int delta = thp_nr_pages(page); ++ ++ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); ++ VM_WARN_ON_ONCE_PAGE(PageActive(page), page); ++ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page); ++ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); ++ ++ scanned += delta; ++ ++ if (sort_page(lruvec, page, tier)) ++ sorted += delta; ++ else if (isolate_page(lruvec, page, sc)) { ++ list_add(&page->lru, list); ++ isolated += delta; ++ } else { ++ list_move(&page->lru, &moved); ++ skipped += delta; ++ } ++ ++ if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) ++ break; ++ } ++ ++ if (skipped) { ++ list_splice(&moved, head); ++ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); ++ } ++ ++ if (!remaining || isolated >= MIN_LRU_BATCH) ++ break; ++ } ++ ++ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; ++ if (!cgroup_reclaim(sc)) { ++ __count_vm_events(item, isolated); ++ __count_vm_events(PGREFILL, sorted); ++ } ++ __count_memcg_events(memcg, item, isolated); ++ __count_memcg_events(memcg, PGREFILL, sorted); ++ __count_vm_events(PGSCAN_ANON + type, isolated); ++ ++ /* ++ * There might not be eligible pages due to reclaim_idx, may_unmap and ++ * may_writepage. Check the remaining to prevent livelock if it's not ++ * making progress. ++ */ ++ return isolated || !remaining ? scanned : 0; ++} ++ ++static int get_tier_idx(struct lruvec *lruvec, int type) ++{ ++ int tier; ++ struct ctrl_pos sp, pv; ++ ++ /* ++ * To leave a margin for fluctuations, use a larger gain factor (1:2). ++ * This value is chosen because any other tier would have at least twice ++ * as many refaults as the first tier. ++ */ ++ read_ctrl_pos(lruvec, type, 0, 1, &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, 2, &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ return tier - 1; ++} ++ ++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) ++{ ++ int type, tier; ++ struct ctrl_pos sp, pv; ++ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; ++ ++ /* ++ * Compare the first tier of anon with that of file to determine which ++ * type to scan. Also need to compare other tiers of the selected type ++ * with the first tier of the other type to determine the last tier (of ++ * the selected type) to evict. ++ */ ++ read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); ++ read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); ++ type = positive_ctrl_err(&sp, &pv); ++ ++ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ *tier_idx = tier - 1; ++ ++ return type; ++} ++ ++static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ int *type_scanned, struct list_head *list) ++{ ++ int i; ++ int type; ++ int scanned; ++ int tier = -1; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ /* ++ * Try to make the obvious choice first. When anon and file are both ++ * available from the same generation, interpret swappiness 1 as file ++ * first and 200 as anon first. ++ */ ++ if (!swappiness) ++ type = LRU_GEN_FILE; ++ else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) ++ type = LRU_GEN_ANON; ++ else if (swappiness == 1) ++ type = LRU_GEN_FILE; ++ else if (swappiness == 200) ++ type = LRU_GEN_ANON; ++ else ++ type = get_type_to_scan(lruvec, swappiness, &tier); ++ ++ for (i = !swappiness; i < ANON_AND_FILE; i++) { ++ if (tier < 0) ++ tier = get_tier_idx(lruvec, type); ++ ++ scanned = scan_pages(lruvec, sc, type, tier, list); ++ if (scanned) ++ break; ++ ++ type = !type; ++ tier = -1; ++ } ++ ++ *type_scanned = type; ++ ++ return scanned; ++} ++ ++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ int type; ++ int scanned; ++ int reclaimed; ++ LIST_HEAD(list); ++ struct page *page; ++ enum vm_event_item item; ++ struct reclaim_stat stat; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); ++ ++ scanned += try_to_inc_min_seq(lruvec, swappiness); ++ ++ if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) ++ scanned = 0; ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ if (list_empty(&list)) ++ return scanned; ++ ++ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); ++ ++ list_for_each_entry(page, &list, lru) { ++ /* restore LRU_REFS_FLAGS cleared by isolate_page() */ ++ if (PageWorkingset(page)) ++ SetPageReferenced(page); ++ ++ /* don't add rejected pages to the oldest generation */ ++ if (PageReclaim(page) && ++ (PageDirty(page) || PageWriteback(page))) ++ ClearPageActive(page); ++ else ++ SetPageActive(page); ++ } ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ move_pages_to_lru(lruvec, &list); ++ ++ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; ++ if (!cgroup_reclaim(sc)) ++ __count_vm_events(item, reclaimed); ++ __count_memcg_events(memcg, item, reclaimed); ++ __count_vm_events(PGSTEAL_ANON + type, reclaimed); ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ mem_cgroup_uncharge_list(&list); ++ free_unref_page_list(&list); ++ ++ sc->nr_reclaimed += reclaimed; ++ ++ return scanned; ++} ++ ++static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, ++ bool can_swap) ++{ ++ bool need_aging; ++ unsigned long nr_to_scan; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (mem_cgroup_below_min(memcg) || ++ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) ++ return 0; ++ ++ need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); ++ if (!need_aging) ++ return nr_to_scan; ++ ++ /* skip the aging path at the default priority */ ++ if (sc->priority == DEF_PRIORITY) ++ goto done; ++ ++ /* leave the work to lru_gen_age_node() */ ++ if (current_is_kswapd()) ++ return 0; ++ ++ inc_max_seq(lruvec, max_seq, can_swap); ++done: ++ return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ unsigned long scanned = 0; ++ ++ lru_add_drain(); ++ ++ blk_start_plug(&plug); ++ ++ while (true) { ++ int delta; ++ int swappiness; ++ unsigned long nr_to_scan; ++ ++ if (sc->may_swap) ++ swappiness = get_swappiness(lruvec, sc); ++ else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) ++ swappiness = 1; ++ else ++ swappiness = 0; ++ ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); ++ if (!nr_to_scan) ++ break; ++ ++ delta = evict_pages(lruvec, sc, swappiness); ++ if (!delta) ++ break; ++ ++ scanned += delta; ++ if (scanned >= nr_to_scan) ++ break; ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++} ++ + /****************************************************************************** + * initialization + ******************************************************************************/ +@@ -2894,6 +3649,16 @@ static int __init init_lru_gen(void) + }; + late_initcall(init_lru_gen); + ++#else /* !CONFIG_LRU_GEN */ ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -2907,6 +3672,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + bool proportional_reclaim; + struct blk_plug plug; + ++ if (lru_gen_enabled()) { ++ lru_gen_shrink_lruvec(lruvec, sc); ++ return; ++ } ++ + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ +@@ -3372,6 +4142,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) + struct lruvec *target_lruvec; + unsigned long refaults; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[0] = refaults; +@@ -3736,12 +4509,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + } + #endif + +-static void age_active_anon(struct pglist_data *pgdat, +- struct scan_control *sc) ++static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { + struct mem_cgroup *memcg; + struct lruvec *lruvec; + ++ if (lru_gen_enabled()) { ++ lru_gen_age_node(pgdat, sc); ++ return; ++ } ++ + if (!can_age_anon_pages(pgdat, sc)) + return; + +@@ -4058,12 +4835,11 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) + sc.may_swap = !nr_boost_reclaim; + + /* +- * Do some background aging of the anon list, to give +- * pages a chance to be referenced before reclaiming. All +- * pages are rotated regardless of classzone as this is +- * about consistent aging. ++ * Do some background aging, to give pages a chance to be ++ * referenced before reclaiming. All pages are rotated ++ * regardless of classzone as this is about consistent aging. + */ +- age_active_anon(pgdat, &sc); ++ kswapd_age_node(pgdat, &sc); + + /* + * If we're getting trouble reclaiming, start doing writepage +diff --git a/mm/workingset.c b/mm/workingset.c +index 880d882f3325..aeba62cebf8c 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly; + static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) + { +- eviction >>= bucket_order; + eviction &= EVICTION_MASK; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; +@@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, + + *memcgidp = memcgid; + *pgdat = NODE_DATA(nid); +- *evictionp = entry << bucket_order; ++ *evictionp = entry; + *workingsetp = workingset; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static void *lru_gen_eviction(struct page *page) ++{ ++ int hist; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lru_gen_struct *lrugen; ++ int type = page_is_file_lru(page); ++ int delta = thp_nr_pages(page); ++ int refs = page_lru_refs(page); ++ int tier = lru_tier_from_refs(refs); ++ struct mem_cgroup *memcg = page_memcg(page); ++ struct pglist_data *pgdat = page_pgdat(page); ++ ++ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); ++ ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->lrugen; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); ++ ++ hist = lru_hist_from_seq(min_seq); ++ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); ++ ++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); ++} ++ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++ int hist, tier, refs; ++ int memcg_id; ++ bool workingset; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lru_gen_struct *lrugen; ++ struct mem_cgroup *memcg; ++ struct pglist_data *pgdat; ++ int type = page_is_file_lru(page); ++ int delta = thp_nr_pages(page); ++ ++ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); ++ ++ if (pgdat != page_pgdat(page)) ++ return; ++ ++ rcu_read_lock(); ++ ++ memcg = page_memcg_rcu(page); ++ if (memcg_id != mem_cgroup_id(memcg)) ++ goto unlock; ++ ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->lrugen; ++ ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) ++ goto unlock; ++ ++ hist = lru_hist_from_seq(min_seq); ++ /* see the comment in page_lru_refs() */ ++ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; ++ tier = lru_tier_from_refs(refs); ++ ++ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); ++ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); ++ ++ /* ++ * Count the following two cases as stalls: ++ * 1. For pages accessed through page tables, hotter pages pushed out ++ * hot pages which refaulted immediately. ++ * 2. For pages accessed multiple times through file descriptors, ++ * numbers of accesses might have been out of the range. ++ */ ++ if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) { ++ SetPageWorkingset(page); ++ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); ++ } ++unlock: ++ rcu_read_unlock(); ++} ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static void *lru_gen_eviction(struct page *page) ++{ ++ return NULL; ++} ++ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + /** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @lruvec: the lruvec that was aged +@@ -264,10 +360,14 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + ++ if (lru_gen_enabled()) ++ return lru_gen_eviction(page); ++ + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->nonresident_age); ++ eviction >>= bucket_order; + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); + } +@@ -296,7 +396,13 @@ void workingset_refault(struct page *page, void *shadow) + bool workingset; + int memcgid; + ++ if (lru_gen_enabled()) { ++ lru_gen_refault(page, shadow); ++ return; ++ } ++ + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); ++ eviction <<= bucket_order; + + rcu_read_lock(); + /* +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-06-mm-multigenerational-lru-aging.patch b/target/linux/generic/backport-5.15/020-v6.1-06-mm-multigenerational-lru-aging.patch deleted file mode 100644 index 2ff49681c7..0000000000 --- a/target/linux/generic/backport-5.15/020-v6.1-06-mm-multigenerational-lru-aging.patch +++ /dev/null @@ -1,1176 +0,0 @@ -From 8217cd2238c40cf77208aa27a7cc09879e685890 Mon Sep 17 00:00:00 2001 -From: Yu Zhao <yuzhao@google.com> -Date: Mon, 5 Apr 2021 04:35:07 -0600 -Subject: [PATCH 06/10] mm: multigenerational lru: aging - -The aging produces young generations. Given an lruvec, the aging -traverses lruvec_memcg()->mm_list and calls walk_page_range() to scan -PTEs for accessed pages. Upon finding one, the aging updates its -generation number to max_seq (modulo MAX_NR_GENS). After each round of -traversal, the aging increments max_seq. The aging is due when -min_seq[] reaches max_seq-1. - -The aging uses the following optimizations when walking page tables: - 1) It skips non-leaf PMD entries that have the accessed bit cleared - when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. - 2) It does not zigzag between a PGD table and the same PMD or PTE - table spanning multiple VMAs. In other words, it finishes all the - VMAs within the range of the same PMD or PTE table before it returns - to this PGD table. This optimizes workloads that have large numbers - of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5. - -Signed-off-by: Yu Zhao <yuzhao@google.com> -Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> -Change-Id: I3ae8abc3100d023cecb3a699d86020ae6fc10a45 ---- - include/linux/memcontrol.h | 3 + - include/linux/mmzone.h | 9 + - include/linux/oom.h | 16 + - include/linux/swap.h | 3 + - mm/memcontrol.c | 5 + - mm/oom_kill.c | 4 +- - mm/rmap.c | 8 + - mm/vmscan.c | 948 +++++++++++++++++++++++++++++++++++++ - 8 files changed, 994 insertions(+), 2 deletions(-) - ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -1367,10 +1367,13 @@ mem_cgroup_print_oom_meminfo(struct mem_ - - static inline void lock_page_memcg(struct page *page) - { -+ /* to match page_memcg_rcu() */ -+ rcu_read_lock(); - } - - static inline void unlock_page_memcg(struct page *page) - { -+ rcu_read_unlock(); - } - - static inline void mem_cgroup_handle_over_high(void) ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -295,6 +295,7 @@ enum lruvec_flags { - }; - - struct lruvec; -+struct page_vma_mapped_walk; - - #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) - #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) -@@ -393,6 +394,7 @@ struct mm_walk_args { - - void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); - void lru_gen_change_state(bool enable, bool main, bool swap); -+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); - - #ifdef CONFIG_MEMCG - void lru_gen_init_memcg(struct mem_cgroup *memcg); -@@ -409,6 +411,10 @@ static inline void lru_gen_change_state( - { - } - -+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+{ -+} -+ - #ifdef CONFIG_MEMCG - static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) - { -@@ -1028,6 +1034,9 @@ typedef struct pglist_data { - - unsigned long flags; - -+#ifdef CONFIG_LRU_GEN -+ struct mm_walk_args mm_walk_args; -+#endif - ZONE_PADDING(_pad2_) - - /* Per-node vmstats */ ---- a/include/linux/oom.h -+++ b/include/linux/oom.h -@@ -57,6 +57,22 @@ struct oom_control { - extern struct mutex oom_lock; - extern struct mutex oom_adj_mutex; - -+#ifdef CONFIG_MMU -+extern struct task_struct *oom_reaper_list; -+extern struct wait_queue_head oom_reaper_wait; -+ -+static inline bool oom_reaping_in_progress(void) -+{ -+ /* racy check to see if oom reaping could be in progress */ -+ return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait); -+} -+#else -+static inline bool oom_reaping_in_progress(void) -+{ -+ return false; -+} -+#endif -+ - static inline void set_current_oom_origin(void) - { - current->signal->oom_flag_origin = true; ---- a/include/linux/swap.h -+++ b/include/linux/swap.h -@@ -137,6 +137,9 @@ union swap_header { - */ - struct reclaim_state { - unsigned long reclaimed_slab; -+#ifdef CONFIG_LRU_GEN -+ struct mm_walk_args *mm_walk_args; -+#endif - }; - - #ifdef __KERNEL__ ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -1304,12 +1304,17 @@ void mem_cgroup_update_lru_size(struct l - *lru_size += nr_pages; - - size = *lru_size; -+#ifdef CONFIG_LRU_GEN -+ /* unlikely but not a bug when reset_batch_size() is pending */ -+ VM_WARN_ON(size + MAX_BATCH_SIZE < 0); -+#else - if (WARN_ONCE(size < 0, - "%s(%p, %d, %d): lru_size %ld\n", - __func__, lruvec, lru, nr_pages, size)) { - VM_BUG_ON(1); - *lru_size = 0; - } -+#endif - - if (nr_pages > 0) - *lru_size += nr_pages; ---- a/mm/oom_kill.c -+++ b/mm/oom_kill.c -@@ -508,8 +508,8 @@ bool process_shares_mm(struct task_struc - * victim (if that is possible) to help the OOM killer to move on. - */ - static struct task_struct *oom_reaper_th; --static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); --static struct task_struct *oom_reaper_list; -+DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); -+struct task_struct *oom_reaper_list; - static DEFINE_SPINLOCK(oom_reaper_lock); - - bool __oom_reap_task_mm(struct mm_struct *mm) ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -73,6 +73,7 @@ - #include <linux/page_idle.h> - #include <linux/memremap.h> - #include <linux/userfaultfd_k.h> -+#include <linux/mm_inline.h> - - #include <asm/tlbflush.h> - -@@ -793,6 +794,13 @@ static bool page_referenced_one(struct p - } - - if (pvmw.pte) { -+ /* the multigenerational lru exploits the spatial locality */ -+ if (lru_gen_enabled() && pte_young(*pvmw.pte) && -+ !(vma->vm_flags & VM_SEQ_READ)) { -+ lru_gen_look_around(&pvmw); -+ referenced++; -+ } -+ - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) { - /* ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -51,6 +51,8 @@ - #include <linux/dax.h> - #include <linux/psi.h> - #include <linux/memory.h> -+#include <linux/pagewalk.h> -+#include <linux/shmem_fs.h> - - #include <asm/tlbflush.h> - #include <asm/div64.h> -@@ -2822,6 +2824,15 @@ static bool can_age_anon_pages(struct pg - * shorthand helpers - ******************************************************************************/ - -+#define DEFINE_MAX_SEQ(lruvec) \ -+ unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq) -+ -+#define DEFINE_MIN_SEQ(lruvec) \ -+ unsigned long min_seq[ANON_AND_FILE] = { \ -+ READ_ONCE((lruvec)->evictable.min_seq[0]), \ -+ READ_ONCE((lruvec)->evictable.min_seq[1]), \ -+ } -+ - #define for_each_gen_type_zone(gen, type, zone) \ - for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ - for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ -@@ -2834,6 +2845,12 @@ static int page_lru_gen(struct page *pag - return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; - } - -+static int get_swappiness(struct mem_cgroup *memcg) -+{ -+ return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? -+ mem_cgroup_swappiness(memcg) : 0; -+} -+ - static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) - { - struct pglist_data *pgdat = NODE_DATA(nid); -@@ -3164,6 +3181,926 @@ done: - } - - /****************************************************************************** -+ * the aging -+ ******************************************************************************/ -+ -+static int page_update_gen(struct page *page, int gen) -+{ -+ unsigned long old_flags, new_flags; -+ -+ VM_BUG_ON(gen >= MAX_NR_GENS); -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ -+ if (!(new_flags & LRU_GEN_MASK)) { -+ new_flags |= BIT(PG_referenced); -+ continue; -+ } -+ -+ new_flags &= ~LRU_GEN_MASK; -+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; -+ } while (new_flags != old_flags && -+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+} -+ -+static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ int old_gen, new_gen; -+ unsigned long old_flags, new_flags; -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ old_gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page); -+ -+ new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+ /* page_update_gen() has updated this page? */ -+ if (new_gen >= 0 && new_gen != old_gen) { -+ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); -+ return; -+ } -+ -+ new_gen = (old_gen + 1) % MAX_NR_GENS; -+ -+ new_flags &= ~LRU_GEN_MASK; -+ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; -+ /* for end_page_writeback() */ -+ if (reclaiming) -+ new_flags |= BIT(PG_reclaim); -+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ lru_gen_update_size(page, lruvec, old_gen, new_gen); -+ if (reclaiming) -+ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); -+ else -+ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); -+} -+ -+static void update_batch_size(struct page *page, int old_gen, int new_gen, -+ struct mm_walk_args *args) -+{ -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ int delta = thp_nr_pages(page); -+ -+ VM_BUG_ON(old_gen >= MAX_NR_GENS); -+ VM_BUG_ON(new_gen >= MAX_NR_GENS); -+ -+ args->batch_size++; -+ -+ args->nr_pages[old_gen][type][zone] -= delta; -+ args->nr_pages[new_gen][type][zone] += delta; -+} -+ -+static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args) -+{ -+ int gen, type, zone; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ args->batch_size = 0; -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ enum lru_list lru = type * LRU_FILE; -+ int delta = args->nr_pages[gen][type][zone]; -+ -+ if (!delta) -+ continue; -+ -+ args->nr_pages[gen][type][zone] = 0; -+ WRITE_ONCE(lrugen->sizes[gen][type][zone], -+ lrugen->sizes[gen][type][zone] + delta); -+ -+ if (lru_gen_is_active(lruvec, gen)) -+ lru += LRU_ACTIVE; -+ update_lru_size(lruvec, lru, zone, delta); -+ } -+} -+ -+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk) -+{ -+ struct address_space *mapping; -+ struct vm_area_struct *vma = walk->vma; -+ struct mm_walk_args *args = walk->private; -+ -+ if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) || -+ (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ))) -+ return true; -+ -+ if (vma_is_anonymous(vma)) -+ return !args->swappiness; -+ -+ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) -+ return true; -+ -+ mapping = vma->vm_file->f_mapping; -+ if (!mapping->a_ops->writepage) -+ return true; -+ -+ return (shmem_mapping(mapping) && !args->swappiness) || mapping_unevictable(mapping); -+} -+ -+/* -+ * Some userspace memory allocators create many single-page VMAs. So instead of -+ * returning back to the PGD table for each of such VMAs, we finish at least an -+ * entire PMD table and therefore avoid many zigzags. -+ */ -+static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size, -+ unsigned long *start, unsigned long *end) -+{ -+ unsigned long next = round_up(*end, size); -+ -+ VM_BUG_ON(mask & size); -+ VM_BUG_ON(*start >= *end); -+ VM_BUG_ON((next & mask) != (*start & mask)); -+ -+ while (walk->vma) { -+ if (next >= walk->vma->vm_end) { -+ walk->vma = walk->vma->vm_next; -+ continue; -+ } -+ -+ if ((next & mask) != (walk->vma->vm_start & mask)) -+ return false; -+ -+ if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) { -+ walk->vma = walk->vma->vm_next; -+ continue; -+ } -+ -+ *start = max(next, walk->vma->vm_start); -+ next = (next | ~mask) + 1; -+ /* rounded-up boundaries can wrap to 0 */ -+ *end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end; -+ -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, -+ struct mm_walk *walk) -+{ -+ int i; -+ pte_t *pte; -+ spinlock_t *ptl; -+ unsigned long addr; -+ int worth = 0; -+ struct mm_walk_args *args = walk->private; -+ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); -+ -+ VM_BUG_ON(pmd_leaf(*pmd)); -+ -+ pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl); -+ arch_enter_lazy_mmu_mode(); -+restart: -+ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ struct page *page; -+ unsigned long pfn = pte_pfn(pte[i]); -+ -+ args->mm_stats[MM_LEAF_TOTAL]++; -+ -+ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) -+ continue; -+ -+ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) -+ continue; -+ -+ if (!pte_young(pte[i])) { -+ args->mm_stats[MM_LEAF_OLD]++; -+ continue; -+ } -+ -+ VM_BUG_ON(!pfn_valid(pfn)); -+ if (pfn < args->start_pfn || pfn >= args->end_pfn) -+ continue; -+ -+ page = compound_head(pfn_to_page(pfn)); -+ if (page_to_nid(page) != args->node_id) -+ continue; -+ -+ if (page_memcg_rcu(page) != args->memcg) -+ continue; -+ -+ VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end); -+ if (!ptep_test_and_clear_young(walk->vma, addr, pte + i)) -+ continue; -+ -+ args->mm_stats[MM_LEAF_YOUNG]++; -+ -+ if (pte_dirty(pte[i]) && !PageDirty(page) && -+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) -+ set_page_dirty(page); -+ -+ old_gen = page_update_gen(page, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(page, old_gen, new_gen, args); -+ -+ worth++; -+ } -+ -+ if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end)) -+ goto restart; -+ -+ arch_leave_lazy_mmu_mode(); -+ pte_unmap_unlock(pte, ptl); -+ -+ return worth >= MIN_BATCH_SIZE / 2; -+} -+ -+/* -+ * We scan PMD entries in two passes. The first pass reaches to PTE tables and -+ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD -+ * entries and needs to take the PMD lock. -+ */ -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, -+ struct vm_area_struct *vma, struct mm_walk *walk) -+{ -+ int i; -+ pmd_t *pmd; -+ spinlock_t *ptl; -+ struct mm_walk_args *args = walk->private; -+ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); -+ -+ VM_BUG_ON(pud_leaf(*pud)); -+ -+ start = (start & PUD_MASK) + offset * PMD_SIZE; -+ pmd = pmd_offset(pud, start); -+ ptl = pmd_lock(walk->mm, pmd); -+ arch_enter_lazy_mmu_mode(); -+ -+ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) { -+ struct page *page; -+ unsigned long pfn = pmd_pfn(pmd[i]); -+ unsigned long addr = start + i * PMD_SIZE; -+ -+ if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i])) -+ continue; -+ -+ if (WARN_ON_ONCE(pmd_devmap(pmd[i]))) -+ continue; -+ -+ if (!pmd_trans_huge(pmd[i])) { -+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) -+ pmdp_test_and_clear_young(vma, addr, pmd + i); -+ continue; -+ } -+ -+ VM_BUG_ON(!pfn_valid(pfn)); -+ if (pfn < args->start_pfn || pfn >= args->end_pfn) -+ continue; -+ -+ page = pfn_to_page(pfn); -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ if (page_to_nid(page) != args->node_id) -+ continue; -+ -+ if (page_memcg_rcu(page) != args->memcg) -+ continue; -+ -+ VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end); -+ if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) -+ continue; -+ -+ args->mm_stats[MM_LEAF_YOUNG]++; -+ -+ if (pmd_dirty(pmd[i]) && !PageDirty(page) && -+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) -+ set_page_dirty(page); -+ -+ old_gen = page_update_gen(page, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(page, old_gen, new_gen, args); -+ } -+ -+ arch_leave_lazy_mmu_mode(); -+ spin_unlock(ptl); -+ -+ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); -+} -+#else -+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, -+ struct vm_area_struct *vma, struct mm_walk *walk) -+{ -+} -+#endif -+ -+static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, -+ struct mm_walk *walk) -+{ -+ int i; -+ pmd_t *pmd; -+ unsigned long next; -+ unsigned long addr; -+ struct vm_area_struct *vma; -+ int offset = -1; -+ bool reset = false; -+ struct mm_walk_args *args = walk->private; -+ struct lruvec *lruvec = get_lruvec(args->node_id, args->memcg); -+ -+ VM_BUG_ON(pud_leaf(*pud)); -+ -+ pmd = pmd_offset(pud, start & PUD_MASK); -+restart: -+ vma = walk->vma; -+ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { -+ pmd_t val = pmd_read_atomic(pmd + i); -+ -+ /* for pmd_read_atomic() */ -+ barrier(); -+ -+ next = pmd_addr_end(addr, end); -+ -+ if (!pmd_present(val)) { -+ args->mm_stats[MM_LEAF_TOTAL]++; -+ continue; -+ } -+ -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+ if (pmd_trans_huge(val)) { -+ unsigned long pfn = pmd_pfn(val); -+ -+ args->mm_stats[MM_LEAF_TOTAL]++; -+ -+ if (is_huge_zero_pmd(val)) -+ continue; -+ -+ if (!pmd_young(val)) { -+ args->mm_stats[MM_LEAF_OLD]++; -+ continue; -+ } -+ -+ if (pfn < args->start_pfn || pfn >= args->end_pfn) -+ continue; -+ -+ if (offset < 0) -+ offset = i; -+ else if (i - offset >= MIN_BATCH_SIZE) { -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+ offset = i; -+ } -+ __set_bit(i - offset, args->bitmap); -+ reset = true; -+ continue; -+ } -+#endif -+ args->mm_stats[MM_NONLEAF_TOTAL]++; -+ -+#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG -+ if (!pmd_young(val)) -+ continue; -+ -+ if (offset < 0) -+ offset = i; -+ else if (i - offset >= MIN_BATCH_SIZE) { -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+ offset = i; -+ reset = false; -+ } -+ __set_bit(i - offset, args->bitmap); -+#endif -+ if (args->use_filter && !test_bloom_filter(lruvec, args->max_seq, pmd + i)) -+ continue; -+ -+ args->mm_stats[MM_NONLEAF_PREV]++; -+ -+ if (!walk_pte_range(&val, addr, next, walk)) -+ continue; -+ -+ args->mm_stats[MM_NONLEAF_CUR]++; -+ -+ set_bloom_filter(lruvec, args->max_seq + 1, pmd + i); -+ } -+ -+ if (reset) { -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+ offset = -1; -+ reset = false; -+ } -+ -+ if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end)) -+ goto restart; -+ -+ if (offset >= 0) -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+} -+ -+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, -+ struct mm_walk *walk) -+{ -+ int i; -+ pud_t *pud; -+ unsigned long addr; -+ unsigned long next; -+ struct mm_walk_args *args = walk->private; -+ -+ VM_BUG_ON(p4d_leaf(*p4d)); -+ -+ pud = pud_offset(p4d, start & P4D_MASK); -+restart: -+ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { -+ pud_t val = READ_ONCE(pud[i]); -+ -+ next = pud_addr_end(addr, end); -+ -+ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) -+ continue; -+ -+ walk_pmd_range(&val, addr, next, walk); -+ -+ if (args->batch_size >= MAX_BATCH_SIZE) { -+ end = (addr | ~PUD_MASK) + 1; -+ goto done; -+ } -+ } -+ -+ if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end)) -+ goto restart; -+ -+ end = round_up(end, P4D_SIZE); -+done: -+ /* rounded-up boundaries can wrap to 0 */ -+ args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0; -+ -+ return -EAGAIN; -+} -+ -+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct mm_walk_args *args) -+{ -+ static const struct mm_walk_ops mm_walk_ops = { -+ .test_walk = should_skip_vma, -+ .p4d_entry = walk_pud_range, -+ }; -+ -+ int err; -+ -+ args->next_addr = FIRST_USER_ADDRESS; -+ -+ do { -+ unsigned long start = args->next_addr; -+ unsigned long end = mm->highest_vm_end; -+ -+ err = -EBUSY; -+ -+ rcu_read_lock(); -+#ifdef CONFIG_MEMCG -+ if (args->memcg && atomic_read(&args->memcg->moving_account)) -+ goto contended; -+#endif -+ if (!mmap_read_trylock(mm)) -+ goto contended; -+ -+ err = walk_page_range(mm, start, end, &mm_walk_ops, args); -+ -+ mmap_read_unlock(mm); -+ -+ if (args->batch_size) { -+ spin_lock_irq(&lruvec->lru_lock); -+ reset_batch_size(lruvec, args); -+ spin_unlock_irq(&lruvec->lru_lock); -+ } -+contended: -+ rcu_read_unlock(); -+ -+ cond_resched(); -+ } while (err == -EAGAIN && args->next_addr && !mm_is_oom_victim(mm)); -+} -+ -+static struct mm_walk_args *alloc_mm_walk_args(void) -+{ -+ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) -+ return kvzalloc(sizeof(struct mm_walk_args), GFP_KERNEL); -+ -+ return current->reclaim_state->mm_walk_args; -+} -+ -+static void free_mm_walk_args(struct mm_walk_args *args) -+{ -+ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) -+ kvfree(args); -+} -+ -+static bool inc_min_seq(struct lruvec *lruvec, int type) -+{ -+ int gen, zone; -+ int remaining = MAX_BATCH_SIZE; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ if (get_nr_gens(lruvec, type) != MAX_NR_GENS) -+ return true; -+ -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ struct list_head *head = &lrugen->lists[gen][type][zone]; -+ -+ while (!list_empty(head)) { -+ struct page *page = lru_to_page(head); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ page_inc_gen(page, lruvec, false); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+ -+ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); -+ -+ return true; -+} -+ -+static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) -+{ -+ int gen, type, zone; -+ bool success = false; -+ struct lrugen *lrugen = &lruvec->evictable; -+ DEFINE_MIN_SEQ(lruvec); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ while (lrugen->max_seq - min_seq[type] >= MIN_NR_GENS) { -+ gen = lru_gen_from_seq(min_seq[type]); -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ if (!list_empty(&lrugen->lists[gen][type][zone])) -+ goto next; -+ } -+ -+ min_seq[type]++; -+ } -+next: -+ ; -+ } -+ -+ min_seq[0] = min(min_seq[0], min_seq[1]); -+ if (swappiness) -+ min_seq[1] = max(min_seq[0], lrugen->min_seq[1]); -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ if (min_seq[type] == lrugen->min_seq[type]) -+ continue; -+ -+ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); -+ success = true; -+ } -+ -+ return success; -+} -+ -+static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq) -+{ -+ int gen, type, zone; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ if (max_seq != lrugen->max_seq) -+ goto unlock; -+ -+ if (!try_to_inc_min_seq(lruvec, true)) { -+ for (type = ANON_AND_FILE - 1; type >= 0; type--) { -+ while (!inc_min_seq(lruvec, type)) { -+ spin_unlock_irq(&lruvec->lru_lock); -+ cond_resched(); -+ spin_lock_irq(&lruvec->lru_lock); -+ } -+ } -+ } -+ -+ gen = lru_gen_from_seq(lrugen->max_seq - 1); -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ enum lru_list lru = type * LRU_FILE; -+ long delta = lrugen->sizes[gen][type][zone]; -+ -+ if (!delta) -+ continue; -+ -+ WARN_ON_ONCE(delta != (int)delta); -+ -+ update_lru_size(lruvec, lru, zone, delta); -+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); -+ } -+ } -+ -+ gen = lru_gen_from_seq(lrugen->max_seq + 1); -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ enum lru_list lru = type * LRU_FILE; -+ long delta = lrugen->sizes[gen][type][zone]; -+ -+ if (!delta) -+ continue; -+ -+ WARN_ON_ONCE(delta != (int)delta); -+ -+ update_lru_size(lruvec, lru, zone, -delta); -+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); -+ } -+ } -+ -+ WRITE_ONCE(lrugen->timestamps[gen], jiffies); -+ /* make sure all preceding modifications appear first */ -+ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); -+unlock: -+ spin_unlock_irq(&lruvec->lru_lock); -+} -+ -+/* Main function used by the foreground, the background and the user-triggered aging. */ -+static bool try_to_inc_max_seq(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long max_seq, bool use_filter) -+{ -+ bool last; -+ struct mm_walk_args *args; -+ struct mm_struct *mm = NULL; -+ struct lrugen *lrugen = &lruvec->evictable; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ int nid = pgdat->node_id; -+ -+ VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq)); -+ -+ /* -+ * If we are not from run_aging() and clearing the accessed bit may -+ * trigger page faults, then don't proceed to clearing all accessed -+ * PTEs. Instead, fallback to lru_gen_look_around(), which only clears a -+ * handful of accessed PTEs. This is less efficient but causes fewer -+ * page faults on CPUs that don't have the capability. -+ */ -+ if ((current->flags & PF_MEMALLOC) && !arch_has_hw_pte_young(false)) { -+ inc_max_seq(lruvec, max_seq); -+ return true; -+ } -+ -+ args = alloc_mm_walk_args(); -+ if (!args) -+ return false; -+ -+ args->memcg = memcg; -+ args->max_seq = max_seq; -+ args->start_pfn = pgdat->node_start_pfn; -+ args->end_pfn = pgdat_end_pfn(pgdat); -+ args->node_id = nid; -+ args->swappiness = swappiness; -+ args->use_filter = use_filter; -+ -+ do { -+ last = get_next_mm(lruvec, args, &mm); -+ if (mm) -+ walk_mm(lruvec, mm, args); -+ -+ cond_resched(); -+ } while (mm); -+ -+ free_mm_walk_args(args); -+ -+ if (!last) { -+ /* don't wait unless we may have trouble reclaiming */ -+ if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2) -+ wait_event_killable(lruvec->mm_walk.wait, -+ max_seq < READ_ONCE(lrugen->max_seq)); -+ -+ return max_seq < READ_ONCE(lrugen->max_seq); -+ } -+ -+ VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq)); -+ -+ inc_max_seq(lruvec, max_seq); -+ /* either we see any waiters or they will see updated max_seq */ -+ if (wq_has_sleeper(&lruvec->mm_walk.wait)) -+ wake_up_all(&lruvec->mm_walk.wait); -+ -+ wakeup_flusher_threads(WB_REASON_VMSCAN); -+ -+ return true; -+} -+ -+static long get_nr_evictable(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long max_seq, unsigned long *min_seq, bool *low) -+{ -+ int gen, type, zone; -+ long max = 0; -+ long min = 0; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ for (type = !swappiness; type < ANON_AND_FILE; type++) { -+ unsigned long seq; -+ -+ for (seq = min_seq[type]; seq <= max_seq; seq++) { -+ long size = 0; -+ -+ gen = lru_gen_from_seq(seq); -+ -+ for (zone = 0; zone <= sc->reclaim_idx; zone++) -+ size += READ_ONCE(lrugen->sizes[gen][type][zone]); -+ -+ max += size; -+ if (type && max_seq - seq >= MIN_NR_GENS) -+ min += size; -+ } -+ } -+ -+ *low = max_seq - min_seq[1] <= MIN_NR_GENS && min < MIN_BATCH_SIZE; -+ -+ return max > 0 ? max : 0; -+} -+ -+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, -+ unsigned long min_ttl) -+{ -+ bool low; -+ long nr_to_scan; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ int swappiness = get_swappiness(memcg); -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (mem_cgroup_below_min(memcg)) -+ return false; -+ -+ if (min_ttl) { -+ int gen = lru_gen_from_seq(min_seq[1]); -+ unsigned long birth = READ_ONCE(lruvec->evictable.timestamps[gen]); -+ -+ if (time_is_after_jiffies(birth + min_ttl)) -+ return false; -+ } -+ -+ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); -+ if (!nr_to_scan) -+ return false; -+ -+ nr_to_scan >>= sc->priority; -+ -+ if (!mem_cgroup_online(memcg)) -+ nr_to_scan++; -+ -+ if (nr_to_scan && low && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim)) -+ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true); -+ -+ return true; -+} -+ -+/* Protect the working set accessed within the last N milliseconds. */ -+static unsigned long lru_gen_min_ttl __read_mostly; -+ -+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ struct mem_cgroup *memcg; -+ bool success = false; -+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); -+ -+ VM_BUG_ON(!current_is_kswapd()); -+ -+ if (!sc->force_deactivate) { -+ sc->force_deactivate = 1; -+ return; -+ } -+ -+ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ -+ if (age_lruvec(lruvec, sc, min_ttl)) -+ success = true; -+ -+ cond_resched(); -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+ -+ if (!success && mutex_trylock(&oom_lock)) { -+ struct oom_control oc = { -+ .gfp_mask = sc->gfp_mask, -+ .order = sc->order, -+ }; -+ -+ /* to avoid overkilling */ -+ if (!oom_reaping_in_progress()) -+ out_of_memory(&oc); -+ -+ mutex_unlock(&oom_lock); -+ } -+ -+ current->reclaim_state->mm_walk_args = NULL; -+} -+ -+/* Scan the vicinity of an accessed PTE when shrink_page_list() uses the rmap. */ -+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+{ -+ int i; -+ pte_t *pte; -+ struct page *page; -+ int old_gen, new_gen; -+ unsigned long start; -+ unsigned long end; -+ unsigned long addr; -+ struct mm_walk_args *args; -+ int worth = 0; -+ struct mem_cgroup *memcg = page_memcg(pvmw->page); -+ struct pglist_data *pgdat = page_pgdat(pvmw->page); -+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ DEFINE_MAX_SEQ(lruvec); -+ -+ lockdep_assert_held(pvmw->ptl); -+ VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page); -+ -+ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; -+ if (!args) -+ return; -+ -+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); -+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; -+ -+ if (end - start > MIN_BATCH_SIZE * PAGE_SIZE) { -+ if (pvmw->address - start < MIN_BATCH_SIZE * PAGE_SIZE / 2) -+ end = start + MIN_BATCH_SIZE * PAGE_SIZE; -+ else if (end - pvmw->address < MIN_BATCH_SIZE * PAGE_SIZE / 2) -+ start = end - MIN_BATCH_SIZE * PAGE_SIZE; -+ else { -+ start = pvmw->address - MIN_BATCH_SIZE * PAGE_SIZE / 2; -+ end = pvmw->address + MIN_BATCH_SIZE * PAGE_SIZE / 2; -+ } -+ } -+ -+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; -+ new_gen = lru_gen_from_seq(max_seq); -+ -+ lock_page_memcg(pvmw->page); -+ arch_enter_lazy_mmu_mode(); -+ -+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ unsigned long pfn = pte_pfn(pte[i]); -+ -+ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) -+ continue; -+ -+ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) -+ continue; -+ -+ VM_BUG_ON(!pfn_valid(pfn)); -+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) -+ continue; -+ -+ worth++; -+ -+ if (!pte_young(pte[i])) -+ continue; -+ -+ page = compound_head(pfn_to_page(pfn)); -+ if (page_to_nid(page) != pgdat->node_id) -+ continue; -+ -+ if (page_memcg_rcu(page) != memcg) -+ continue; -+ -+ VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end); -+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) -+ continue; -+ -+ if (pte_dirty(pte[i]) && !PageDirty(page) && -+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) -+ __set_bit(i, args->bitmap); -+ -+ old_gen = page_update_gen(page, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(page, old_gen, new_gen, args); -+ } -+ -+ arch_leave_lazy_mmu_mode(); -+ unlock_page_memcg(pvmw->page); -+ -+ if (worth >= MIN_BATCH_SIZE / 2) -+ set_bloom_filter(lruvec, max_seq, pvmw->pmd); -+ -+ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) -+ set_page_dirty(pte_page(pte[i])); -+ -+ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); -+} -+ -+/****************************************************************************** - * state change - ******************************************************************************/ - -@@ -3412,6 +4349,12 @@ static int __init init_lru_gen(void) - }; - late_initcall(init_lru_gen); - -+#else -+ -+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+} -+ - #endif /* CONFIG_LRU_GEN */ - - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -@@ -4266,6 +5209,11 @@ static void age_active_anon(struct pglis - struct mem_cgroup *memcg; - struct lruvec *lruvec; - -+ if (lru_gen_enabled()) { -+ lru_gen_age_node(pgdat, sc); -+ return; -+ } -+ - if (!can_age_anon_pages(pgdat, sc)) - return; - diff --git a/target/linux/generic/backport-5.15/020-v6.1-07-mm-multi-gen-LRU-exploit-locality-in-rmap.patch b/target/linux/generic/backport-5.15/020-v6.1-07-mm-multi-gen-LRU-exploit-locality-in-rmap.patch new file mode 100644 index 0000000000..e0c6380b5f --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-07-mm-multi-gen-LRU-exploit-locality-in-rmap.patch @@ -0,0 +1,508 @@ +From e4277535f6d6708bb19b88c4bad155832671d69b Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 18 Sep 2022 02:00:04 -0600 +Subject: [PATCH 07/29] mm: multi-gen LRU: exploit locality in rmap +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Searching the rmap for PTEs mapping each page on an LRU list (to test and +clear the accessed bit) can be expensive because pages from different VMAs +(PA space) are not cache friendly to the rmap (VA space). For workloads +mostly using mapped pages, searching the rmap can incur the highest CPU +cost in the reclaim path. + +This patch exploits spatial locality to reduce the trips into the rmap. +When shrink_page_list() walks the rmap and finds a young PTE, a new +function lru_gen_look_around() scans at most BITS_PER_LONG-1 adjacent +PTEs. On finding another young PTE, it clears the accessed bit and +updates the gen counter of the page mapped by this PTE to +(max_seq%MAX_NR_GENS)+1. + +Server benchmark results: + Single workload: + fio (buffered I/O): no change + + Single workload: + memcached (anon): +[3, 5]% + Ops/sec KB/sec + patch1-6: 1106168.46 43025.04 + patch1-7: 1147696.57 44640.29 + + Configurations: + no change + +Client benchmark results: + kswapd profiles: + patch1-6 + 39.03% lzo1x_1_do_compress (real work) + 18.47% page_vma_mapped_walk (overhead) + 6.74% _raw_spin_unlock_irq + 3.97% do_raw_spin_lock + 2.49% ptep_clear_flush + 2.48% anon_vma_interval_tree_iter_first + 1.92% page_referenced_one + 1.88% __zram_bvec_write + 1.48% memmove + 1.31% vma_interval_tree_iter_next + + patch1-7 + 48.16% lzo1x_1_do_compress (real work) + 8.20% page_vma_mapped_walk (overhead) + 7.06% _raw_spin_unlock_irq + 2.92% ptep_clear_flush + 2.53% __zram_bvec_write + 2.11% do_raw_spin_lock + 2.02% memmove + 1.93% lru_gen_look_around + 1.56% free_unref_page_list + 1.40% memset + + Configurations: + no change + +Link: https://lkml.kernel.org/r/20220918080010.2920238-8-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Acked-by: Barry Song <baohua@kernel.org> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/memcontrol.h | 31 +++++++ + include/linux/mmzone.h | 6 ++ + mm/internal.h | 1 + + mm/memcontrol.c | 1 + + mm/rmap.c | 7 ++ + mm/swap.c | 4 +- + mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++ + 7 files changed, 232 insertions(+), 2 deletions(-) + +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index 4f189b17dafc..8d6a0329bc59 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -442,6 +442,7 @@ static inline struct obj_cgroup *__page_objcg(struct page *page) + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference ++ * - mem_cgroup_trylock_pages() + * + * For a kmem page a caller should hold an rcu read lock to protect memcg + * associated with a kmem page from being released. +@@ -497,6 +498,7 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference ++ * - mem_cgroup_trylock_pages() + * + * For a kmem page a caller should hold an rcu read lock to protect memcg + * associated with a kmem page from being released. +@@ -953,6 +955,23 @@ void unlock_page_memcg(struct page *page); + + void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); + ++/* try to stablize page_memcg() for all the pages in a memcg */ ++static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) ++{ ++ rcu_read_lock(); ++ ++ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) ++ return true; ++ ++ rcu_read_unlock(); ++ return false; ++} ++ ++static inline void mem_cgroup_unlock_pages(void) ++{ ++ rcu_read_unlock(); ++} ++ + /* idx can be of type enum memcg_stat_item or node_stat_item */ + static inline void mod_memcg_state(struct mem_cgroup *memcg, + int idx, int val) +@@ -1369,6 +1388,18 @@ static inline void unlock_page_memcg(struct page *page) + { + } + ++static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) ++{ ++ /* to match page_memcg_rcu() */ ++ rcu_read_lock(); ++ return true; ++} ++ ++static inline void mem_cgroup_unlock_pages(void) ++{ ++ rcu_read_unlock(); ++} ++ + static inline void mem_cgroup_handle_over_high(void) + { + } +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index fce8945c507c..4db2b877fcf9 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -352,6 +352,7 @@ enum lruvec_flags { + #ifndef __GENERATING_BOUNDS_H + + struct lruvec; ++struct page_vma_mapped_walk; + + #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) + #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +@@ -407,6 +408,7 @@ struct lru_gen_struct { + }; + + void lru_gen_init_lruvec(struct lruvec *lruvec); ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + + #ifdef CONFIG_MEMCG + void lru_gen_init_memcg(struct mem_cgroup *memcg); +@@ -419,6 +421,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) + { + } + ++static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++} ++ + #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { +diff --git a/mm/internal.h b/mm/internal.h +index cf3cb933eba3..5c73246a092e 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -35,6 +35,7 @@ + void page_writeback_init(void); + + vm_fault_t do_swap_page(struct vm_fault *vmf); ++void activate_page(struct page *page); + + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 8b634dc72e7f..cc3431c5d9ba 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2798,6 +2798,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference ++ * - mem_cgroup_trylock_pages() + */ + page->memcg_data = (unsigned long)memcg; + } +diff --git a/mm/rmap.c b/mm/rmap.c +index 330b361a460e..22a86122732e 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -73,6 +73,7 @@ + #include <linux/page_idle.h> + #include <linux/memremap.h> + #include <linux/userfaultfd_k.h> ++#include <linux/mm_inline.h> + + #include <asm/tlbflush.h> + +@@ -793,6 +794,12 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, + } + + if (pvmw.pte) { ++ if (lru_gen_enabled() && pte_young(*pvmw.pte) && ++ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { ++ lru_gen_look_around(&pvmw); ++ referenced++; ++ } ++ + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + /* +diff --git a/mm/swap.c b/mm/swap.c +index 5d227577b609..966ff2d83343 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -325,7 +325,7 @@ static bool need_activate_page_drain(int cpu) + return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; + } + +-static void activate_page(struct page *page) ++void activate_page(struct page *page) + { + page = compound_head(page); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { +@@ -345,7 +345,7 @@ static inline void activate_page_drain(int cpu) + { + } + +-static void activate_page(struct page *page) ++void activate_page(struct page *page) + { + struct lruvec *lruvec; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 932abd24c1b3..1d0b25ae378c 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1409,6 +1409,11 @@ static unsigned int shrink_page_list(struct list_head *page_list, + if (!sc->may_unmap && page_mapped(page)) + goto keep_locked; + ++ /* page_update_gen() tried to promote this page? */ ++ if (lru_gen_enabled() && !ignore_references && ++ page_mapped(page) && PageReferenced(page)) ++ goto keep_locked; ++ + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + +@@ -2990,6 +2995,29 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) + * the aging + ******************************************************************************/ + ++/* promote pages accessed through page tables */ ++static int page_update_gen(struct page *page, int gen) ++{ ++ unsigned long new_flags, old_flags = READ_ONCE(page->flags); ++ ++ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); ++ VM_WARN_ON_ONCE(!rcu_read_lock_held()); ++ ++ do { ++ /* lru_gen_del_page() has isolated this page? */ ++ if (!(old_flags & LRU_GEN_MASK)) { ++ /* for shrink_page_list() */ ++ new_flags = old_flags | BIT(PG_referenced); ++ continue; ++ } ++ ++ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); ++ ++ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ + /* protect pages accessed multiple times through file descriptors */ + static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming) + { +@@ -3001,6 +3029,11 @@ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaimin + VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page); + + do { ++ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ /* page_update_gen() has promoted this page? */ ++ if (new_gen >= 0 && new_gen != old_gen) ++ return new_gen; ++ + new_gen = (old_gen + 1) % MAX_NR_GENS; + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); +@@ -3015,6 +3048,43 @@ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaimin + return new_gen; + } + ++static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) ++{ ++ unsigned long pfn = pte_pfn(pte); ++ ++ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); ++ ++ if (!pte_present(pte) || is_zero_pfn(pfn)) ++ return -1; ++ ++ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) ++ return -1; ++ ++ if (WARN_ON_ONCE(!pfn_valid(pfn))) ++ return -1; ++ ++ return pfn; ++} ++ ++static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg, ++ struct pglist_data *pgdat) ++{ ++ struct page *page; ++ ++ /* try to avoid unnecessary memory loads */ ++ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) ++ return NULL; ++ ++ page = compound_head(pfn_to_page(pfn)); ++ if (page_to_nid(page) != pgdat->node_id) ++ return NULL; ++ ++ if (page_memcg_rcu(page) != memcg) ++ return NULL; ++ ++ return page; ++} ++ + static void inc_min_seq(struct lruvec *lruvec, int type) + { + struct lru_gen_struct *lrugen = &lruvec->lrugen; +@@ -3214,6 +3284,114 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + } + ++/* ++ * This function exploits spatial locality when shrink_page_list() walks the ++ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. ++ */ ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++ int i; ++ pte_t *pte; ++ unsigned long start; ++ unsigned long end; ++ unsigned long addr; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; ++ struct page *page = pvmw->page; ++ struct mem_cgroup *memcg = page_memcg(page); ++ struct pglist_data *pgdat = page_pgdat(page); ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ DEFINE_MAX_SEQ(lruvec); ++ int old_gen, new_gen = lru_gen_from_seq(max_seq); ++ ++ lockdep_assert_held(pvmw->ptl); ++ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); ++ ++ if (spin_is_contended(pvmw->ptl)) ++ return; ++ ++ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); ++ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; ++ ++ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { ++ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) ++ end = start + MIN_LRU_BATCH * PAGE_SIZE; ++ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) ++ start = end - MIN_LRU_BATCH * PAGE_SIZE; ++ else { ++ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; ++ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; ++ } ++ } ++ ++ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; ++ ++ rcu_read_lock(); ++ arch_enter_lazy_mmu_mode(); ++ ++ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ unsigned long pfn; ++ ++ pfn = get_pte_pfn(pte[i], pvmw->vma, addr); ++ if (pfn == -1) ++ continue; ++ ++ if (!pte_young(pte[i])) ++ continue; ++ ++ page = get_pfn_page(pfn, memcg, pgdat); ++ if (!page) ++ continue; ++ ++ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) ++ VM_WARN_ON_ONCE(true); ++ ++ if (pte_dirty(pte[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && ++ !PageSwapCache(page))) ++ set_page_dirty(page); ++ ++ old_gen = page_lru_gen(page); ++ if (old_gen < 0) ++ SetPageReferenced(page); ++ else if (old_gen != new_gen) ++ __set_bit(i, bitmap); ++ } ++ ++ arch_leave_lazy_mmu_mode(); ++ rcu_read_unlock(); ++ ++ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { ++ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { ++ page = pte_page(pte[i]); ++ activate_page(page); ++ } ++ return; ++ } ++ ++ /* page_update_gen() requires stable page_memcg() */ ++ if (!mem_cgroup_trylock_pages(memcg)) ++ return; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); ++ ++ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { ++ page = compound_head(pte_page(pte[i])); ++ if (page_memcg_rcu(page) != memcg) ++ continue; ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen < 0 || old_gen == new_gen) ++ continue; ++ ++ lru_gen_update_size(lruvec, page, old_gen, new_gen); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ mem_cgroup_unlock_pages(); ++} ++ + /****************************************************************************** + * the eviction + ******************************************************************************/ +@@ -3250,6 +3428,12 @@ static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx) + return true; + } + ++ /* promoted */ ++ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { ++ list_move(&page->lru, &lrugen->lists[gen][type][zone]); ++ return true; ++ } ++ + /* protected */ + if (tier > tier_idx) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch b/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch deleted file mode 100644 index a75fedecaa..0000000000 --- a/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch +++ /dev/null @@ -1,1002 +0,0 @@ -From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001 -From: Yu Zhao <yuzhao@google.com> -Date: Tue, 29 Jun 2021 20:46:47 -0600 -Subject: [PATCH 07/10] mm: multigenerational lru: eviction - -The eviction consumes old generations. Given an lruvec, the eviction -scans pages on lrugen->lists indexed by anon and file min_seq[] -(modulo MAX_NR_GENS). It first tries to select a type based on the -values of min_seq[]. If they are equal, it selects the type that has -a lower refaulted %. The eviction sorts a page according to its -updated generation number if the aging has found this page accessed. -It also moves a page to the next generation if this page is from an -upper tier that has a higher refaulted % than the base tier. The -eviction increments min_seq[] of a selected type when it finds -lrugen->lists indexed by min_seq[] of this selected type are empty. - -Each generation is divided into multiple tiers. Tiers represent -different ranges of numbers of accesses from file descriptors only. -Pages accessed N times via file descriptors belong to tier -order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers, -and they require additional MAX_NR_TIERS-2 bits in page->flags. In -contrast to moving between generations which requires list operations, -moving between tiers only involves operations on page->flags and -therefore has a negligible cost. A feedback loop modeled after the PID -controller monitors refaulted % across all tiers and decides when to -protect pages from which tiers. - -Unmapped pages are initially added to the oldest generation and then -conditionally protected by tiers. Each tier keeps track of how many -pages from it have refaulted. Tier 0 is the base tier and pages from -it are evicted unconditionally because there are no better candidates. -Pages from an upper tier are either evicted or moved to the next -generation, depending on whether this upper tier has a higher -refaulted % than the base tier. This model has the following -advantages: - 1) It removes the cost in the buffered access path and reduces the - overall cost of protection because pages are conditionally protected - in the reclaim path. - 2) It takes mapped pages into account and avoids overprotecting - pages accessed multiple times via file descriptors. - 3 Additional tiers improve the protection of pages accessed more - than twice. - -Signed-off-by: Yu Zhao <yuzhao@google.com> -Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> -Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac ---- - include/linux/mm_inline.h | 10 + - include/linux/mmzone.h | 33 +++ - mm/swap.c | 42 +++ - mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++- - mm/workingset.c | 120 ++++++++- - 5 files changed, 757 insertions(+), 3 deletions(-) - ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi - return seq % NR_HIST_GENS; - } - -+/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */ -+static inline int lru_tier_from_refs(int refs) -+{ -+ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH)); -+ -+ return order_base_2(refs + 1); -+} -+ - /* The youngest and the second youngest generations are counted as active. */ - static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) - { -@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru - gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; - - new_flags &= ~LRU_GEN_MASK; -+ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS) -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); - /* for shrink_page_list() */ - if (reclaiming) - new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -319,6 +319,30 @@ struct page_vma_mapped_walk; - #define MIN_NR_GENS 2 - #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) - -+/* -+ * Each generation is divided into multiple tiers. Tiers represent different -+ * ranges of numbers of accesses from file descriptors, i.e., -+ * mark_page_accessed(). In contrast to moving between generations which -+ * requires the lru lock, moving between tiers only involves an atomic -+ * operation on page->flags and therefore has a negligible cost. -+ * -+ * The purposes of tiers are to: -+ * 1) estimate whether pages accessed multiple times via file descriptors are -+ * more active than pages accessed only via page tables by separating the two -+ * access types into upper tiers and the base tier, and comparing refaulted % -+ * across all tiers. -+ * 2) improve buffered io performance by deferring the protection of pages -+ * accessed multiple times until the eviction. That is the protection happens -+ * in the reclaim path, not the access path. -+ * -+ * Pages accessed N times via file descriptors belong to tier order_base_2(N). -+ * The base tier may be marked by PageReferenced(). All upper tiers are marked -+ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are -+ * used to support more than one upper tier. -+ */ -+#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) -+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) -+ - /* Whether to keep stats for historical generations. */ - #ifdef CONFIG_LRU_GEN_STATS - #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) -@@ -337,6 +361,15 @@ struct lrugen { - struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; - /* the sizes of the multigenerational lru lists in pages */ - unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* the exponential moving average of refaulted */ -+ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; -+ /* the exponential moving average of protected+evicted */ -+ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; -+ /* the base tier isn't protected, hence the minus one */ -+ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; -+ /* incremented without holding the lru lock */ -+ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; -+ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; - /* whether the multigenerational lru is enabled */ - bool enabled[ANON_AND_FILE]; - }; ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st - local_unlock(&lru_pvecs.lock); - } - -+#ifdef CONFIG_LRU_GEN -+static void page_inc_refs(struct page *page) -+{ -+ unsigned long refs; -+ unsigned long old_flags, new_flags; -+ -+ if (PageUnevictable(page)) -+ return; -+ -+ /* see the comment on MAX_NR_TIERS */ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ -+ if (!(new_flags & BIT(PG_referenced))) { -+ new_flags |= BIT(PG_referenced); -+ continue; -+ } -+ -+ if (!(new_flags & BIT(PG_workingset))) { -+ new_flags |= BIT(PG_workingset); -+ continue; -+ } -+ -+ refs = new_flags & LRU_REFS_MASK; -+ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK); -+ -+ new_flags &= ~LRU_REFS_MASK; -+ new_flags |= refs; -+ } while (new_flags != old_flags && -+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+} -+#else -+static void page_inc_refs(struct page *page) -+{ -+} -+#endif /* CONFIG_LRU_GEN */ -+ - /* - * Mark a page as having seen activity. - * -@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag - { - page = compound_head(page); - -+ if (lru_gen_enabled()) { -+ page_inc_refs(page); -+ return; -+ } -+ - if (!PageReferenced(page)) { - SetPageReferenced(page); - } else if (PageUnevictable(page)) { ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre - - if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page_private(page) }; -- mem_cgroup_swapout(page, swap); -+ -+ /* get a shadow entry before page_memcg() is cleared */ - if (reclaimed && !mapping_exiting(mapping)) - shadow = workingset_eviction(page, target_memcg); -+ mem_cgroup_swapout(page, swap); - __delete_from_swap_cache(page, swap, shadow); - xa_unlock_irq(&mapping->i_pages); - put_swap_page(page, swap); -@@ -1410,6 +1412,11 @@ retry: - if (!sc->may_unmap && page_mapped(page)) - goto keep_locked; - -+ /* lru_gen_look_around() has updated this page? */ -+ if (lru_gen_enabled() && !ignore_references && -+ page_mapped(page) && PageReferenced(page)) -+ goto keep_locked; -+ - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); - -@@ -2505,6 +2512,9 @@ static void prepare_scan_count(pg_data_t - unsigned long file; - struct lruvec *target_lruvec; - -+ if (lru_gen_enabled()) -+ return; -+ - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - - /* -@@ -2845,6 +2855,17 @@ static int page_lru_gen(struct page *pag - return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; - } - -+static int page_lru_tier(struct page *page) -+{ -+ int refs; -+ unsigned long flags = READ_ONCE(page->flags); -+ -+ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ? -+ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0; -+ -+ return lru_tier_from_refs(refs); -+} -+ - static int get_swappiness(struct mem_cgroup *memcg) - { - return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? -@@ -3181,6 +3202,91 @@ done: - } - - /****************************************************************************** -+ * refault feedback loop -+ ******************************************************************************/ -+ -+/* -+ * A feedback loop modeled after the PID controller. Currently supports the -+ * proportional (P) and the integral (I) terms; the derivative (D) term can be -+ * added if necessary. The setpoint (SP) is the desired position; the process -+ * variable (PV) is the measured position. The error is the difference between -+ * the SP and the PV. A positive error results in a positive control output -+ * correction, which, in our case, is to allow eviction. -+ * -+ * The P term is refaulted % of the current generation being evicted. The I -+ * term is the exponential moving average of refaulted % of previously evicted -+ * generations, using the smoothing factor 1/2. -+ * -+ * Our goal is to maintain proportional refaulted % across all tiers. -+ */ -+struct ctrl_pos { -+ unsigned long refaulted; -+ unsigned long total; -+ int gain; -+}; -+ -+static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, -+ struct ctrl_pos *pos) -+{ -+ struct lrugen *lrugen = &lruvec->evictable; -+ int hist = lru_hist_from_seq(lrugen->min_seq[type]); -+ -+ pos->refaulted = lrugen->avg_refaulted[type][tier] + -+ atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ pos->total = lrugen->avg_total[type][tier] + -+ atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ pos->total += lrugen->protected[hist][type][tier - 1]; -+ pos->gain = gain; -+} -+ -+static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type) -+{ -+ int tier; -+ int hist = lru_hist_from_seq(gen); -+ struct lrugen *lrugen = &lruvec->evictable; -+ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); -+ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; -+ -+ if (!carryover && !clear) -+ return; -+ -+ for (tier = 0; tier < MAX_NR_TIERS; tier++) { -+ if (carryover) { -+ unsigned long sum; -+ -+ sum = lrugen->avg_refaulted[type][tier] + -+ atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); -+ -+ sum = lrugen->avg_total[type][tier] + -+ atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ sum += lrugen->protected[hist][type][tier - 1]; -+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); -+ } -+ -+ if (clear) { -+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); -+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); -+ if (tier) -+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); -+ } -+ } -+} -+ -+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) -+{ -+ /* -+ * Allow eviction if the PV has a limited number of refaulted pages or a -+ * lower refaulted % than the SP. -+ */ -+ return pv->refaulted < MIN_BATCH_SIZE || -+ pv->refaulted * max(sp->total, 1UL) * sp->gain <= -+ sp->refaulted * max(pv->total, 1UL) * pv->gain; -+} -+ -+/****************************************************************************** - * the aging - ******************************************************************************/ - -@@ -3200,6 +3306,7 @@ static int page_update_gen(struct page * - - new_flags &= ~LRU_GEN_MASK; - new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); - } while (new_flags != old_flags && - cmpxchg(&page->flags, old_flags, new_flags) != old_flags); - -@@ -3231,6 +3338,7 @@ static void page_inc_gen(struct page *pa - - new_flags &= ~LRU_GEN_MASK; - new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); - /* for end_page_writeback() */ - if (reclaiming) - new_flags |= BIT(PG_reclaim); -@@ -3722,6 +3830,7 @@ static bool inc_min_seq(struct lruvec *l - } - } - -+ reset_ctrl_pos(lruvec, gen, type); - WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); - - return true; -@@ -3759,6 +3868,8 @@ next: - if (min_seq[type] == lrugen->min_seq[type]) - continue; - -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ reset_ctrl_pos(lruvec, gen, type); - WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); - success = true; - } -@@ -3820,6 +3931,9 @@ static void inc_max_seq(struct lruvec *l - } - } - -+ for (type = 0; type < ANON_AND_FILE; type++) -+ reset_ctrl_pos(lruvec, gen, type); -+ - WRITE_ONCE(lrugen->timestamps[gen], jiffies); - /* make sure all preceding modifications appear first */ - smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); -@@ -4101,6 +4215,433 @@ void lru_gen_look_around(struct page_vma - } - - /****************************************************************************** -+ * the eviction -+ ******************************************************************************/ -+ -+static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx) -+{ -+ bool success; -+ int gen = page_lru_gen(page); -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ int tier = page_lru_tier(page); -+ int delta = thp_nr_pages(page); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page); -+ -+ /* an mlocked page? */ -+ if (!page_evictable(page)) { -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ SetPageUnevictable(page); -+ add_page_to_lru_list(page, lruvec); -+ __count_vm_events(UNEVICTABLE_PGCULLED, delta); -+ return true; -+ } -+ -+ /* a lazy-free page that has been written into? */ -+ if (type && PageDirty(page) && PageAnon(page)) { -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ SetPageSwapBacked(page); -+ add_page_to_lru_list_tail(page, lruvec); -+ return true; -+ } -+ -+ /* page_update_gen() has updated this page? */ -+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { -+ list_move(&page->lru, &lrugen->lists[gen][type][zone]); -+ return true; -+ } -+ -+ /* protect this page if its tier has a higher refaulted % */ -+ if (tier > tier_idx) { -+ int hist = lru_hist_from_seq(gen); -+ -+ page_inc_gen(page, lruvec, false); -+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], -+ lrugen->protected[hist][type][tier - 1] + delta); -+ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); -+ return true; -+ } -+ -+ /* mark this page for reclaim if it's pending writeback */ -+ if (PageWriteback(page) || (type && PageDirty(page))) { -+ page_inc_gen(page, lruvec, true); -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc) -+{ -+ bool success; -+ -+ if (!sc->may_unmap && page_mapped(page)) -+ return false; -+ -+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && -+ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page)))) -+ return false; -+ -+ if (!get_page_unless_zero(page)) -+ return false; -+ -+ if (!TestClearPageLRU(page)) { -+ put_page(page); -+ return false; -+ } -+ -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ -+ return true; -+} -+ -+static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, -+ int type, int tier, struct list_head *list) -+{ -+ int gen, zone; -+ enum vm_event_item item; -+ int sorted = 0; -+ int scanned = 0; -+ int isolated = 0; -+ int remaining = MAX_BATCH_SIZE; -+ struct lrugen *lrugen = &lruvec->evictable; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ -+ VM_BUG_ON(!list_empty(list)); -+ -+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) -+ return 0; -+ -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ for (zone = sc->reclaim_idx; zone >= 0; zone--) { -+ LIST_HEAD(moved); -+ int skipped = 0; -+ struct list_head *head = &lrugen->lists[gen][type][zone]; -+ -+ while (!list_empty(head)) { -+ struct page *page = lru_to_page(head); -+ int delta = thp_nr_pages(page); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ scanned += delta; -+ -+ if (sort_page(page, lruvec, tier)) -+ sorted += delta; -+ else if (isolate_page(page, lruvec, sc)) { -+ list_add(&page->lru, list); -+ isolated += delta; -+ } else { -+ list_move(&page->lru, &moved); -+ skipped += delta; -+ } -+ -+ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE) -+ break; -+ } -+ -+ if (skipped) { -+ list_splice(&moved, head); -+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); -+ } -+ -+ if (!remaining || isolated >= MIN_BATCH_SIZE) -+ break; -+ } -+ -+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; -+ if (!cgroup_reclaim(sc)) { -+ __count_vm_events(item, isolated); -+ __count_vm_events(PGREFILL, sorted); -+ } -+ __count_memcg_events(memcg, item, isolated); -+ __count_memcg_events(memcg, PGREFILL, sorted); -+ __count_vm_events(PGSCAN_ANON + type, isolated); -+ -+ /* -+ * We may have trouble finding eligible pages due to reclaim_idx, -+ * may_unmap and may_writepage. Check `remaining` to make sure we won't -+ * be stuck if we aren't making enough progress. -+ */ -+ return isolated || !remaining ? scanned : 0; -+} -+ -+static int get_tier_idx(struct lruvec *lruvec, int type) -+{ -+ int tier; -+ struct ctrl_pos sp, pv; -+ -+ /* -+ * Ideally we don't want to evict upper tiers that have higher refaulted -+ * %. However, we need to leave a margin for the fluctuation in -+ * refaulted %. So we use a larger gain factor to make sure upper tiers -+ * are indeed more active. We choose 2 because the lowest upper tier -+ * would have twice of refaulted % of the base tier, according to their -+ * numbers of accesses. -+ */ -+ read_ctrl_pos(lruvec, type, 0, 1, &sp); -+ for (tier = 1; tier < MAX_NR_TIERS; tier++) { -+ read_ctrl_pos(lruvec, type, tier, 2, &pv); -+ if (!positive_ctrl_err(&sp, &pv)) -+ break; -+ } -+ -+ return tier - 1; -+} -+ -+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) -+{ -+ int type, tier; -+ struct ctrl_pos sp, pv; -+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; -+ -+ /* -+ * Compare refaulted % between the base tiers of anon and file to -+ * determine which type to evict. Also need to compare refaulted % of -+ * the upper tiers of the selected type with that of the base tier of -+ * the other type to determine which tier of the selected type to evict. -+ */ -+ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp); -+ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv); -+ type = positive_ctrl_err(&sp, &pv); -+ -+ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); -+ for (tier = 1; tier < MAX_NR_TIERS; tier++) { -+ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); -+ if (!positive_ctrl_err(&sp, &pv)) -+ break; -+ } -+ -+ *tier_idx = tier - 1; -+ -+ return type; -+} -+ -+static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ int *type_scanned, struct list_head *list) -+{ -+ int i; -+ int type; -+ int scanned; -+ int tier = -1; -+ DEFINE_MIN_SEQ(lruvec); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ /* -+ * Try to select a type based on generations and swappiness, and if that -+ * fails, fall back to get_type_to_scan(). When anon and file are both -+ * available from the same generation, swappiness 200 is interpreted as -+ * anon first and swappiness 1 is interpreted as file first. -+ */ -+ if (!swappiness) -+ type = 1; -+ else if (min_seq[0] < min_seq[1]) -+ type = 0; -+ else if (swappiness == 1) -+ type = 1; -+ else if (swappiness == 200) -+ type = 0; -+ else -+ type = get_type_to_scan(lruvec, swappiness, &tier); -+ -+ for (i = !swappiness; i < ANON_AND_FILE; i++) { -+ if (tier < 0) -+ tier = get_tier_idx(lruvec, type); -+ -+ scanned = scan_pages(lruvec, sc, type, tier, list); -+ if (scanned) -+ break; -+ -+ type = !type; -+ tier = -1; -+ } -+ -+ *type_scanned = type; -+ -+ return scanned; -+} -+ -+/* Main function used by the foreground, the background and the user-triggered eviction. */ -+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) -+{ -+ int type; -+ int scanned; -+ int reclaimed; -+ LIST_HEAD(list); -+ struct page *page; -+ enum vm_event_item item; -+ struct reclaim_stat stat; -+ struct mm_walk_args *args; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); -+ -+ if (try_to_inc_min_seq(lruvec, swappiness)) -+ scanned++; -+ -+ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS) -+ scanned = 0; -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ -+ if (list_empty(&list)) -+ return scanned; -+ -+ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); -+ /* -+ * We need to prevent rejected pages from being added back to the same -+ * lists they were isolated from. Otherwise we may risk looping on them -+ * forever. -+ */ -+ list_for_each_entry(page, &list, lru) { -+ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page))) -+ SetPageActive(page); -+ -+ ClearPageReferenced(page); -+ ClearPageWorkingset(page); -+ } -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ move_pages_to_lru(lruvec, &list); -+ -+ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; -+ if (args && args->batch_size) -+ reset_batch_size(lruvec, args); -+ -+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; -+ if (!cgroup_reclaim(sc)) -+ __count_vm_events(item, reclaimed); -+ __count_memcg_events(memcg, item, reclaimed); -+ __count_vm_events(PGSTEAL_ANON + type, reclaimed); -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ -+ mem_cgroup_uncharge_list(&list); -+ free_unref_page_list(&list); -+ -+ sc->nr_reclaimed += reclaimed; -+ -+ return scanned; -+} -+ -+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) -+{ -+ bool low; -+ long nr_to_scan; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ int priority = sc->priority; -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (mem_cgroup_below_min(memcg) || -+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) -+ return 0; -+ -+ if (sc->nr_reclaimed >= sc->nr_to_reclaim) { -+ priority = DEF_PRIORITY; -+ sc->force_deactivate = 0; -+ } -+ -+ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); -+ if (!nr_to_scan) -+ return 0; -+ -+ nr_to_scan >>= priority; -+ -+ if (!mem_cgroup_online(memcg)) -+ nr_to_scan++; -+ -+ if (!nr_to_scan) -+ return 0; -+ -+ if (current_is_kswapd()) { -+ /* leave the work to lru_gen_age_node() */ -+ if (max_seq - min_seq[1] < MIN_NR_GENS) -+ return 0; -+ -+ if (!low) -+ sc->force_deactivate = 0; -+ -+ return nr_to_scan; -+ } -+ -+ if (max_seq - min_seq[1] >= MIN_NR_GENS) -+ return nr_to_scan; -+ -+ /* move onto slab and other memcgs if we haven't tried them all */ -+ if (!sc->force_deactivate) { -+ sc->skipped_deactivate = 1; -+ return 0; -+ } -+ -+ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0; -+} -+ -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+ struct blk_plug plug; -+ long scanned = 0; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ lru_add_drain(); -+ -+ if (current_is_kswapd()) -+ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; -+ -+ blk_start_plug(&plug); -+ -+ while (true) { -+ int delta; -+ int swappiness; -+ long nr_to_scan; -+ -+ if (sc->may_swap) -+ swappiness = get_swappiness(memcg); -+ else if (!cgroup_reclaim(sc) && get_swappiness(memcg)) -+ swappiness = 1; -+ else -+ swappiness = 0; -+ -+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); -+ if (!nr_to_scan) -+ break; -+ -+ delta = evict_pages(lruvec, sc, swappiness); -+ if (!delta) -+ break; -+ -+ scanned += delta; -+ if (scanned >= nr_to_scan) -+ break; -+ -+ cond_resched(); -+ } -+ -+ blk_finish_plug(&plug); -+ -+ if (current_is_kswapd()) -+ current->reclaim_state->mm_walk_args = NULL; -+} -+ -+/****************************************************************************** - * state change - ******************************************************************************/ - -@@ -4355,6 +4896,10 @@ static void lru_gen_age_node(struct pgli - { - } - -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+} -+ - #endif /* CONFIG_LRU_GEN */ - - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -@@ -4368,6 +4913,11 @@ static void shrink_lruvec(struct lruvec - bool proportional_reclaim; - struct blk_plug plug; - -+ if (lru_gen_enabled()) { -+ lru_gen_shrink_lruvec(lruvec, sc); -+ return; -+ } -+ - get_scan_count(lruvec, sc, nr); - - /* Record the original scan target for proportional adjustments later */ -@@ -4839,6 +5389,9 @@ static void snapshot_refaults(struct mem - struct lruvec *target_lruvec; - unsigned long refaults; - -+ if (lru_gen_enabled()) -+ return; -+ - target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); - target_lruvec->refaults[0] = refaults; ---- a/mm/workingset.c -+++ b/mm/workingset.c -@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_ - static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, - bool workingset) - { -- eviction >>= bucket_order; - eviction &= EVICTION_MASK; - eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; - eviction = (eviction << NODES_SHIFT) | pgdat->node_id; -@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow, - - *memcgidp = memcgid; - *pgdat = NODE_DATA(nid); -- *evictionp = entry << bucket_order; -+ *evictionp = entry; - *workingsetp = workingset; - } - -+#ifdef CONFIG_LRU_GEN -+ -+static int page_lru_refs(struct page *page) -+{ -+ unsigned long flags = READ_ONCE(page->flags); -+ -+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); -+ -+ /* see the comment on MAX_NR_TIERS */ -+ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0; -+} -+ -+/* Return a token to be stored in the shadow entry of a page being evicted. */ -+static void *lru_gen_eviction(struct page *page) -+{ -+ int hist, tier; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lrugen *lrugen; -+ int type = page_is_file_lru(page); -+ int refs = page_lru_refs(page); -+ int delta = thp_nr_pages(page); -+ bool workingset = PageWorkingset(page); -+ struct mem_cgroup *memcg = page_memcg(page); -+ struct pglist_data *pgdat = page_pgdat(page); -+ -+ lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->evictable; -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ token = (min_seq << LRU_REFS_WIDTH) | refs; -+ -+ hist = lru_hist_from_seq(min_seq); -+ tier = lru_tier_from_refs(refs + workingset); -+ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); -+ -+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); -+} -+ -+/* Count a refaulted page based on the token stored in its shadow entry. */ -+static void lru_gen_refault(struct page *page, void *shadow) -+{ -+ int hist, tier, refs; -+ int memcg_id; -+ bool workingset; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lrugen *lrugen; -+ struct mem_cgroup *memcg; -+ struct pglist_data *pgdat; -+ int type = page_is_file_lru(page); -+ int delta = thp_nr_pages(page); -+ -+ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); -+ if (page_pgdat(page) != pgdat) -+ return; -+ -+ rcu_read_lock(); -+ memcg = page_memcg_rcu(page); -+ if (mem_cgroup_id(memcg) != memcg_id) -+ goto unlock; -+ -+ refs = token & (BIT(LRU_REFS_WIDTH) - 1); -+ if (refs && !workingset) -+ goto unlock; -+ -+ token >>= LRU_REFS_WIDTH; -+ lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->evictable; -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) -+ goto unlock; -+ -+ hist = lru_hist_from_seq(min_seq); -+ tier = lru_tier_from_refs(refs + workingset); -+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); -+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); -+ -+ /* -+ * Tiers don't offer any protection to pages accessed via page tables. -+ * That's what generations do. Tiers can't fully protect pages after -+ * their numbers of accesses has exceeded the max value. Conservatively -+ * count these two conditions as stalls even though they might not -+ * indicate any real memory pressure. -+ */ -+ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) { -+ SetPageWorkingset(page); -+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); -+ } -+unlock: -+ rcu_read_unlock(); -+} -+ -+#else -+ -+static void *lru_gen_eviction(struct page *page) -+{ -+ return NULL; -+} -+ -+static void lru_gen_refault(struct page *page, void *shadow) -+{ -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - /** - * workingset_age_nonresident - age non-resident entries as LRU ages - * @lruvec: the lruvec that was aged -@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p - VM_BUG_ON_PAGE(page_count(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - -+ if (lru_gen_enabled()) -+ return lru_gen_eviction(page); -+ - lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - /* XXX: target_memcg can be NULL, go through lruvec */ - memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); - eviction = atomic_long_read(&lruvec->nonresident_age); -+ eviction >>= bucket_order; - workingset_age_nonresident(lruvec, thp_nr_pages(page)); - return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); - } -@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag - bool workingset; - int memcgid; - -+ if (lru_gen_enabled()) { -+ lru_gen_refault(page, shadow); -+ return; -+ } -+ - unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); -+ eviction <<= bucket_order; - - rcu_read_lock(); - /* diff --git a/target/linux/generic/backport-5.15/020-v6.1-08-mm-multi-gen-LRU-support-page-table-walks.patch b/target/linux/generic/backport-5.15/020-v6.1-08-mm-multi-gen-LRU-support-page-table-walks.patch new file mode 100644 index 0000000000..64de2c0c82 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-08-mm-multi-gen-LRU-support-page-table-walks.patch @@ -0,0 +1,1710 @@ +From 05223c4e80b34e29f2255c04ffebc2c4475e7593 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 18 Sep 2022 02:00:05 -0600 +Subject: [PATCH 08/29] mm: multi-gen LRU: support page table walks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +To further exploit spatial locality, the aging prefers to walk page tables +to search for young PTEs and promote hot pages. A kill switch will be +added in the next patch to disable this behavior. When disabled, the +aging relies on the rmap only. + +NB: this behavior has nothing similar with the page table scanning in the +2.4 kernel [1], which searches page tables for old PTEs, adds cold pages +to swapcache and unmaps them. + +To avoid confusion, the term "iteration" specifically means the traversal +of an entire mm_struct list; the term "walk" will be applied to page +tables and the rmap, as usual. + +An mm_struct list is maintained for each memcg, and an mm_struct follows +its owner task to the new memcg when this task is migrated. Given an +lruvec, the aging iterates lruvec_memcg()->mm_list and calls +walk_page_range() with each mm_struct on this list to promote hot pages +before it increments max_seq. + +When multiple page table walkers iterate the same list, each of them gets +a unique mm_struct; therefore they can run concurrently. Page table +walkers ignore any misplaced pages, e.g., if an mm_struct was migrated, +pages it left in the previous memcg will not be promoted when its current +memcg is under reclaim. Similarly, page table walkers will not promote +pages from nodes other than the one under reclaim. + +This patch uses the following optimizations when walking page tables: +1. It tracks the usage of mm_struct's between context switches so that + page table walkers can skip processes that have been sleeping since + the last iteration. +2. It uses generational Bloom filters to record populated branches so + that page table walkers can reduce their search space based on the + query results, e.g., to skip page tables containing mostly holes or + misplaced pages. +3. It takes advantage of the accessed bit in non-leaf PMD entries when + CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. +4. It does not zigzag between a PGD table and the same PMD table + spanning multiple VMAs. IOW, it finishes all the VMAs within the + range of the same PMD table before it returns to a PGD table. This + improves the cache performance for workloads that have large + numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5. + +Server benchmark results: + Single workload: + fio (buffered I/O): no change + + Single workload: + memcached (anon): +[8, 10]% + Ops/sec KB/sec + patch1-7: 1147696.57 44640.29 + patch1-8: 1245274.91 48435.66 + + Configurations: + no change + +Client benchmark results: + kswapd profiles: + patch1-7 + 48.16% lzo1x_1_do_compress (real work) + 8.20% page_vma_mapped_walk (overhead) + 7.06% _raw_spin_unlock_irq + 2.92% ptep_clear_flush + 2.53% __zram_bvec_write + 2.11% do_raw_spin_lock + 2.02% memmove + 1.93% lru_gen_look_around + 1.56% free_unref_page_list + 1.40% memset + + patch1-8 + 49.44% lzo1x_1_do_compress (real work) + 6.19% page_vma_mapped_walk (overhead) + 5.97% _raw_spin_unlock_irq + 3.13% get_pfn_page + 2.85% ptep_clear_flush + 2.42% __zram_bvec_write + 2.08% do_raw_spin_lock + 1.92% memmove + 1.44% alloc_zspage + 1.36% memset + + Configurations: + no change + +Thanks to the following developers for their efforts [3]. + kernel test robot <lkp@intel.com> + +[1] https://lwn.net/Articles/23732/ +[2] https://llvm.org/docs/ScudoHardenedAllocator.html +[3] https://lore.kernel.org/r/202204160827.ekEARWQo-lkp@intel.com/ + +Link: https://lkml.kernel.org/r/20220918080010.2920238-9-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Barry Song <baohua@kernel.org> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + fs/exec.c | 2 + + include/linux/memcontrol.h | 5 + + include/linux/mm_types.h | 76 +++ + include/linux/mmzone.h | 56 +- + include/linux/swap.h | 4 + + kernel/exit.c | 1 + + kernel/fork.c | 9 + + kernel/sched/core.c | 1 + + mm/memcontrol.c | 25 + + mm/vmscan.c | 1010 +++++++++++++++++++++++++++++++++++- + 10 files changed, 1172 insertions(+), 17 deletions(-) + +diff --git a/fs/exec.c b/fs/exec.c +index 881390b44cfd..1afa15a07d26 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *mm) + active_mm = tsk->active_mm; + tsk->active_mm = mm; + tsk->mm = mm; ++ lru_gen_add_mm(mm); + /* + * This prevents preemption while active_mm is being loaded and + * it and mm are being updated, which could cause problems for +@@ -1028,6 +1029,7 @@ static int exec_mmap(struct mm_struct *mm) + tsk->mm->vmacache_seqnum = 0; + vmacache_flush(tsk); + task_unlock(tsk); ++ lru_gen_use_mm(mm); + if (old_mm) { + mmap_read_unlock(old_mm); + BUG_ON(active_mm != old_mm); +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index 8d6a0329bc59..3736405cbcf6 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -348,6 +348,11 @@ struct mem_cgroup { + struct deferred_split deferred_split_queue; + #endif + ++#ifdef CONFIG_LRU_GEN ++ /* per-memcg mm_struct list */ ++ struct lru_gen_mm_list mm_list; ++#endif ++ + struct mem_cgroup_per_node *nodeinfo[]; + }; + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 7f8ee09c711f..33c142d31261 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -580,6 +580,22 @@ struct mm_struct { + #ifdef CONFIG_IOMMU_SUPPORT + u32 pasid; + #endif ++#ifdef CONFIG_LRU_GEN ++ struct { ++ /* this mm_struct is on lru_gen_mm_list */ ++ struct list_head list; ++ /* ++ * Set when switching to this mm_struct, as a hint of ++ * whether it has been used since the last time per-node ++ * page table walkers cleared the corresponding bits. ++ */ ++ unsigned long bitmap; ++#ifdef CONFIG_MEMCG ++ /* points to the memcg of "owner" above */ ++ struct mem_cgroup *memcg; ++#endif ++ } lru_gen; ++#endif /* CONFIG_LRU_GEN */ + } __randomize_layout; + + /* +@@ -606,6 +622,66 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) + return (struct cpumask *)&mm->cpu_bitmap; + } + ++#ifdef CONFIG_LRU_GEN ++ ++struct lru_gen_mm_list { ++ /* mm_struct list for page table walkers */ ++ struct list_head fifo; ++ /* protects the list above */ ++ spinlock_t lock; ++}; ++ ++void lru_gen_add_mm(struct mm_struct *mm); ++void lru_gen_del_mm(struct mm_struct *mm); ++#ifdef CONFIG_MEMCG ++void lru_gen_migrate_mm(struct mm_struct *mm); ++#endif ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++ INIT_LIST_HEAD(&mm->lru_gen.list); ++ mm->lru_gen.bitmap = 0; ++#ifdef CONFIG_MEMCG ++ mm->lru_gen.memcg = NULL; ++#endif ++} ++ ++static inline void lru_gen_use_mm(struct mm_struct *mm) ++{ ++ /* ++ * When the bitmap is set, page reclaim knows this mm_struct has been ++ * used since the last time it cleared the bitmap. So it might be worth ++ * walking the page tables of this mm_struct to clear the accessed bit. ++ */ ++ WRITE_ONCE(mm->lru_gen.bitmap, -1); ++} ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_add_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_del_mm(struct mm_struct *mm) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++} ++#endif ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_use_mm(struct mm_struct *mm) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 4db2b877fcf9..659bab633bdf 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -385,7 +385,7 @@ enum { + * min_seq behind. + * + * The number of pages in each generation is eventually consistent and therefore +- * can be transiently negative. ++ * can be transiently negative when reset_batch_size() is pending. + */ + struct lru_gen_struct { + /* the aging increments the youngest generation number */ +@@ -407,6 +407,53 @@ struct lru_gen_struct { + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + }; + ++enum { ++ MM_LEAF_TOTAL, /* total leaf entries */ ++ MM_LEAF_OLD, /* old leaf entries */ ++ MM_LEAF_YOUNG, /* young leaf entries */ ++ MM_NONLEAF_TOTAL, /* total non-leaf entries */ ++ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ ++ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ ++ NR_MM_STATS ++}; ++ ++/* double-buffering Bloom filters */ ++#define NR_BLOOM_FILTERS 2 ++ ++struct lru_gen_mm_state { ++ /* set to max_seq after each iteration */ ++ unsigned long seq; ++ /* where the current iteration continues (inclusive) */ ++ struct list_head *head; ++ /* where the last iteration ended (exclusive) */ ++ struct list_head *tail; ++ /* to wait for the last page table walker to finish */ ++ struct wait_queue_head wait; ++ /* Bloom filters flip after each iteration */ ++ unsigned long *filters[NR_BLOOM_FILTERS]; ++ /* the mm stats for debugging */ ++ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; ++ /* the number of concurrent page table walkers */ ++ int nr_walkers; ++}; ++ ++struct lru_gen_mm_walk { ++ /* the lruvec under reclaim */ ++ struct lruvec *lruvec; ++ /* unstable max_seq from lru_gen_struct */ ++ unsigned long max_seq; ++ /* the next address within an mm to scan */ ++ unsigned long next_addr; ++ /* to batch promoted pages */ ++ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* to batch the mm stats */ ++ int mm_stats[NR_MM_STATS]; ++ /* total batched items */ ++ int batched; ++ bool can_swap; ++ bool force_scan; ++}; ++ + void lru_gen_init_lruvec(struct lruvec *lruvec); + void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + +@@ -457,6 +504,8 @@ struct lruvec { + #ifdef CONFIG_LRU_GEN + /* evictable pages divided into generations */ + struct lru_gen_struct lrugen; ++ /* to concurrently iterate lru_gen_mm_list */ ++ struct lru_gen_mm_state mm_state; + #endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; +@@ -1042,6 +1091,11 @@ typedef struct pglist_data { + + unsigned long flags; + ++#ifdef CONFIG_LRU_GEN ++ /* kswap mm walk data */ ++ struct lru_gen_mm_walk mm_walk; ++#endif ++ + ZONE_PADDING(_pad2_) + + /* Per-node vmstats */ +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 4efd267e2937..e970fca4f178 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -137,6 +137,10 @@ union swap_header { + */ + struct reclaim_state { + unsigned long reclaimed_slab; ++#ifdef CONFIG_LRU_GEN ++ /* per-thread mm walk data */ ++ struct lru_gen_mm_walk *mm_walk; ++#endif + }; + + #ifdef __KERNEL__ +diff --git a/kernel/exit.c b/kernel/exit.c +index 80efdfda6662..06b477395012 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -469,6 +469,7 @@ void mm_update_next_owner(struct mm_struct *mm) + goto retry; + } + WRITE_ONCE(mm->owner, c); ++ lru_gen_migrate_mm(mm); + task_unlock(c); + put_task_struct(c); + } +diff --git a/kernel/fork.c b/kernel/fork.c +index 68eab6ce3085..d8f37ecdde87 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1083,6 +1083,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + goto fail_nocontext; + + mm->user_ns = get_user_ns(user_ns); ++ lru_gen_init_mm(mm); + return mm; + + fail_nocontext: +@@ -1125,6 +1126,7 @@ static inline void __mmput(struct mm_struct *mm) + } + if (mm->binfmt) + module_put(mm->binfmt->module); ++ lru_gen_del_mm(mm); + mmdrop(mm); + } + +@@ -2622,6 +2624,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) + get_task_struct(p); + } + ++ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { ++ /* lock the task to synchronize with memcg migration */ ++ task_lock(p); ++ lru_gen_add_mm(p->mm); ++ task_unlock(p); ++ } ++ + wake_up_new_task(p); + + /* forking complete and child started to run, tell ptracer */ +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index c1458fa8beb3..fe4d60474d4a 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -5007,6 +5007,7 @@ context_switch(struct rq *rq, struct task_struct *prev, + * finish_task_switch()'s mmdrop(). + */ + switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ lru_gen_use_mm(next->mm); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index cc3431c5d9ba..ed87d1256f0e 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -6212,6 +6212,30 @@ static void mem_cgroup_move_task(void) + } + #endif + ++#ifdef CONFIG_LRU_GEN ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++ struct task_struct *task; ++ struct cgroup_subsys_state *css; ++ ++ /* find the first leader if there is any */ ++ cgroup_taskset_for_each_leader(task, css, tset) ++ break; ++ ++ if (!task) ++ return; ++ ++ task_lock(task); ++ if (task->mm && READ_ONCE(task->mm->owner) == task) ++ lru_gen_migrate_mm(task->mm); ++ task_unlock(task); ++} ++#else ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) + { + if (value == PAGE_COUNTER_MAX) +@@ -6555,6 +6579,7 @@ struct cgroup_subsys memory_cgrp_subsys = { + .css_reset = mem_cgroup_css_reset, + .css_rstat_flush = mem_cgroup_css_rstat_flush, + .can_attach = mem_cgroup_can_attach, ++ .attach = mem_cgroup_attach, + .cancel_attach = mem_cgroup_cancel_attach, + .post_attach = mem_cgroup_move_task, + .dfl_cftypes = memory_files, +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 1d0b25ae378c..a7844c689522 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -50,6 +50,8 @@ + #include <linux/printk.h> + #include <linux/dax.h> + #include <linux/psi.h> ++#include <linux/pagewalk.h> ++#include <linux/shmem_fs.h> + + #include <asm/tlbflush.h> + #include <asm/div64.h> +@@ -2853,7 +2855,7 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) + +-static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid) ++static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) + { + struct pglist_data *pgdat = NODE_DATA(nid); + +@@ -2898,6 +2900,371 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; + } + ++/****************************************************************************** ++ * mm_struct list ++ ******************************************************************************/ ++ ++static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) ++{ ++ static struct lru_gen_mm_list mm_list = { ++ .fifo = LIST_HEAD_INIT(mm_list.fifo), ++ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), ++ }; ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) ++ return &memcg->mm_list; ++#endif ++ VM_WARN_ON_ONCE(!mem_cgroup_disabled()); ++ ++ return &mm_list; ++} ++ ++void lru_gen_add_mm(struct mm_struct *mm) ++{ ++ int nid; ++ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ ++ VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); ++#ifdef CONFIG_MEMCG ++ VM_WARN_ON_ONCE(mm->lru_gen.memcg); ++ mm->lru_gen.memcg = memcg; ++#endif ++ spin_lock(&mm_list->lock); ++ ++ for_each_node_state(nid, N_MEMORY) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ if (!lruvec) ++ continue; ++ ++ /* the first addition since the last iteration */ ++ if (lruvec->mm_state.tail == &mm_list->fifo) ++ lruvec->mm_state.tail = &mm->lru_gen.list; ++ } ++ ++ list_add_tail(&mm->lru_gen.list, &mm_list->fifo); ++ ++ spin_unlock(&mm_list->lock); ++} ++ ++void lru_gen_del_mm(struct mm_struct *mm) ++{ ++ int nid; ++ struct lru_gen_mm_list *mm_list; ++ struct mem_cgroup *memcg = NULL; ++ ++ if (list_empty(&mm->lru_gen.list)) ++ return; ++ ++#ifdef CONFIG_MEMCG ++ memcg = mm->lru_gen.memcg; ++#endif ++ mm_list = get_mm_list(memcg); ++ ++ spin_lock(&mm_list->lock); ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ if (!lruvec) ++ continue; ++ ++ /* where the last iteration ended (exclusive) */ ++ if (lruvec->mm_state.tail == &mm->lru_gen.list) ++ lruvec->mm_state.tail = lruvec->mm_state.tail->next; ++ ++ /* where the current iteration continues (inclusive) */ ++ if (lruvec->mm_state.head != &mm->lru_gen.list) ++ continue; ++ ++ lruvec->mm_state.head = lruvec->mm_state.head->next; ++ /* the deletion ends the current iteration */ ++ if (lruvec->mm_state.head == &mm_list->fifo) ++ WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); ++ } ++ ++ list_del_init(&mm->lru_gen.list); ++ ++ spin_unlock(&mm_list->lock); ++ ++#ifdef CONFIG_MEMCG ++ mem_cgroup_put(mm->lru_gen.memcg); ++ mm->lru_gen.memcg = NULL; ++#endif ++} ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++ struct mem_cgroup *memcg; ++ struct task_struct *task = rcu_dereference_protected(mm->owner, true); ++ ++ VM_WARN_ON_ONCE(task->mm != mm); ++ lockdep_assert_held(&task->alloc_lock); ++ ++ /* for mm_update_next_owner() */ ++ if (mem_cgroup_disabled()) ++ return; ++ ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_task(task); ++ rcu_read_unlock(); ++ if (memcg == mm->lru_gen.memcg) ++ return; ++ ++ VM_WARN_ON_ONCE(!mm->lru_gen.memcg); ++ VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); ++ ++ lru_gen_del_mm(mm); ++ lru_gen_add_mm(mm); ++} ++#endif ++ ++/* ++ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when ++ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of ++ * bits in a bitmap, k is the number of hash functions and n is the number of ++ * inserted items. ++ * ++ * Page table walkers use one of the two filters to reduce their search space. ++ * To get rid of non-leaf entries that no longer have enough leaf entries, the ++ * aging uses the double-buffering technique to flip to the other filter each ++ * time it produces a new generation. For non-leaf entries that have enough ++ * leaf entries, the aging carries them over to the next generation in ++ * walk_pmd_range(); the eviction also report them when walking the rmap ++ * in lru_gen_look_around(). ++ * ++ * For future optimizations: ++ * 1. It's not necessary to keep both filters all the time. The spare one can be ++ * freed after the RCU grace period and reallocated if needed again. ++ * 2. And when reallocating, it's worth scaling its size according to the number ++ * of inserted entries in the other filter, to reduce the memory overhead on ++ * small systems and false positives on large systems. ++ * 3. Jenkins' hash function is an alternative to Knuth's. ++ */ ++#define BLOOM_FILTER_SHIFT 15 ++ ++static inline int filter_gen_from_seq(unsigned long seq) ++{ ++ return seq % NR_BLOOM_FILTERS; ++} ++ ++static void get_item_key(void *item, int *key) ++{ ++ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); ++ ++ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); ++ ++ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); ++ key[1] = hash >> BLOOM_FILTER_SHIFT; ++} ++ ++static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) ++{ ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = lruvec->mm_state.filters[gen]; ++ if (filter) { ++ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); ++ return; ++ } ++ ++ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), ++ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); ++ WRITE_ONCE(lruvec->mm_state.filters[gen], filter); ++} ++ ++static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_state.filters[gen]); ++ if (!filter) ++ return; ++ ++ get_item_key(item, key); ++ ++ if (!test_bit(key[0], filter)) ++ set_bit(key[0], filter); ++ if (!test_bit(key[1], filter)) ++ set_bit(key[1], filter); ++} ++ ++static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_state.filters[gen]); ++ if (!filter) ++ return true; ++ ++ get_item_key(item, key); ++ ++ return test_bit(key[0], filter) && test_bit(key[1], filter); ++} ++ ++static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) ++{ ++ int i; ++ int hist; ++ ++ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); ++ ++ if (walk) { ++ hist = lru_hist_from_seq(walk->max_seq); ++ ++ for (i = 0; i < NR_MM_STATS; i++) { ++ WRITE_ONCE(lruvec->mm_state.stats[hist][i], ++ lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); ++ walk->mm_stats[i] = 0; ++ } ++ } ++ ++ if (NR_HIST_GENS > 1 && last) { ++ hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); ++ ++ for (i = 0; i < NR_MM_STATS; i++) ++ WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); ++ } ++} ++ ++static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) ++{ ++ int type; ++ unsigned long size = 0; ++ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); ++ int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); ++ ++ if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) ++ return true; ++ ++ clear_bit(key, &mm->lru_gen.bitmap); ++ ++ for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { ++ size += type ? get_mm_counter(mm, MM_FILEPAGES) : ++ get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ } ++ ++ if (size < MIN_LRU_BATCH) ++ return true; ++ ++ return !mmget_not_zero(mm); ++} ++ ++static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, ++ struct mm_struct **iter) ++{ ++ bool first = false; ++ bool last = true; ++ struct mm_struct *mm = NULL; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ struct lru_gen_mm_state *mm_state = &lruvec->mm_state; ++ ++ /* ++ * There are four interesting cases for this page table walker: ++ * 1. It tries to start a new iteration of mm_list with a stale max_seq; ++ * there is nothing left to do. ++ * 2. It's the first of the current generation, and it needs to reset ++ * the Bloom filter for the next generation. ++ * 3. It reaches the end of mm_list, and it needs to increment ++ * mm_state->seq; the iteration is done. ++ * 4. It's the last of the current generation, and it needs to reset the ++ * mm stats counters for the next generation. ++ */ ++ spin_lock(&mm_list->lock); ++ ++ VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); ++ VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); ++ VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); ++ ++ if (walk->max_seq <= mm_state->seq) { ++ if (!*iter) ++ last = false; ++ goto done; ++ } ++ ++ if (!mm_state->nr_walkers) { ++ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); ++ ++ mm_state->head = mm_list->fifo.next; ++ first = true; ++ } ++ ++ while (!mm && mm_state->head != &mm_list->fifo) { ++ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); ++ ++ mm_state->head = mm_state->head->next; ++ ++ /* force scan for those added after the last iteration */ ++ if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { ++ mm_state->tail = mm_state->head; ++ walk->force_scan = true; ++ } ++ ++ if (should_skip_mm(mm, walk)) ++ mm = NULL; ++ } ++ ++ if (mm_state->head == &mm_list->fifo) ++ WRITE_ONCE(mm_state->seq, mm_state->seq + 1); ++done: ++ if (*iter && !mm) ++ mm_state->nr_walkers--; ++ if (!*iter && mm) ++ mm_state->nr_walkers++; ++ ++ if (mm_state->nr_walkers) ++ last = false; ++ ++ if (*iter || last) ++ reset_mm_stats(lruvec, walk, last); ++ ++ spin_unlock(&mm_list->lock); ++ ++ if (mm && first) ++ reset_bloom_filter(lruvec, walk->max_seq + 1); ++ ++ if (*iter) ++ mmput_async(*iter); ++ ++ *iter = mm; ++ ++ return last; ++} ++ ++static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) ++{ ++ bool success = false; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ struct lru_gen_mm_state *mm_state = &lruvec->mm_state; ++ ++ spin_lock(&mm_list->lock); ++ ++ VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); ++ ++ if (max_seq > mm_state->seq && !mm_state->nr_walkers) { ++ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); ++ ++ WRITE_ONCE(mm_state->seq, mm_state->seq + 1); ++ reset_mm_stats(lruvec, NULL, true); ++ success = true; ++ } ++ ++ spin_unlock(&mm_list->lock); ++ ++ return success; ++} ++ + /****************************************************************************** + * refault feedback loop + ******************************************************************************/ +@@ -3048,6 +3415,118 @@ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaimin + return new_gen; + } + ++static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page, ++ int old_gen, int new_gen) ++{ ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ ++ VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); ++ VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); ++ ++ walk->batched++; ++ ++ walk->nr_pages[old_gen][type][zone] -= delta; ++ walk->nr_pages[new_gen][type][zone] += delta; ++} ++ ++static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) ++{ ++ int gen, type, zone; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ walk->batched = 0; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ enum lru_list lru = type * LRU_INACTIVE_FILE; ++ int delta = walk->nr_pages[gen][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ walk->nr_pages[gen][type][zone] = 0; ++ WRITE_ONCE(lrugen->nr_pages[gen][type][zone], ++ lrugen->nr_pages[gen][type][zone] + delta); ++ ++ if (lru_gen_is_active(lruvec, gen)) ++ lru += LRU_ACTIVE; ++ __update_lru_size(lruvec, lru, zone, delta); ++ } ++} ++ ++static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) ++{ ++ struct address_space *mapping; ++ struct vm_area_struct *vma = args->vma; ++ struct lru_gen_mm_walk *walk = args->private; ++ ++ if (!vma_is_accessible(vma)) ++ return true; ++ ++ if (is_vm_hugetlb_page(vma)) ++ return true; ++ ++ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) ++ return true; ++ ++ if (vma == get_gate_vma(vma->vm_mm)) ++ return true; ++ ++ if (vma_is_anonymous(vma)) ++ return !walk->can_swap; ++ ++ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) ++ return true; ++ ++ mapping = vma->vm_file->f_mapping; ++ if (mapping_unevictable(mapping)) ++ return true; ++ ++ if (shmem_mapping(mapping)) ++ return !walk->can_swap; ++ ++ /* to exclude special mappings like dax, etc. */ ++ return !mapping->a_ops->readpage; ++} ++ ++/* ++ * Some userspace memory allocators map many single-page VMAs. Instead of ++ * returning back to the PGD table for each of such VMAs, finish an entire PMD ++ * table to reduce zigzags and improve cache performance. ++ */ ++static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, ++ unsigned long *vm_start, unsigned long *vm_end) ++{ ++ unsigned long start = round_up(*vm_end, size); ++ unsigned long end = (start | ~mask) + 1; ++ ++ VM_WARN_ON_ONCE(mask & size); ++ VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); ++ ++ while (args->vma) { ++ if (start >= args->vma->vm_end) { ++ args->vma = args->vma->vm_next; ++ continue; ++ } ++ ++ if (end && end <= args->vma->vm_start) ++ return false; ++ ++ if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) { ++ args->vma = args->vma->vm_next; ++ continue; ++ } ++ ++ *vm_start = max(start, args->vma->vm_start); ++ *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; ++ ++ return true; ++ } ++ ++ return false; ++} ++ + static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) + { + unsigned long pfn = pte_pfn(pte); +@@ -3066,8 +3545,28 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned + return pfn; + } + ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) ++static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) ++{ ++ unsigned long pfn = pmd_pfn(pmd); ++ ++ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); ++ ++ if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) ++ return -1; ++ ++ if (WARN_ON_ONCE(pmd_devmap(pmd))) ++ return -1; ++ ++ if (WARN_ON_ONCE(!pfn_valid(pfn))) ++ return -1; ++ ++ return pfn; ++} ++#endif ++ + static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg, +- struct pglist_data *pgdat) ++ struct pglist_data *pgdat, bool can_swap) + { + struct page *page; + +@@ -3082,9 +3581,375 @@ static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg, + if (page_memcg_rcu(page) != memcg) + return NULL; + ++ /* file VMAs can contain anon pages from COW */ ++ if (!page_is_file_lru(page) && !can_swap) ++ return NULL; ++ + return page; + } + ++static bool suitable_to_scan(int total, int young) ++{ ++ int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); ++ ++ /* suitable if the average number of young PTEs per cacheline is >=1 */ ++ return young * n >= total; ++} ++ ++static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, ++ struct mm_walk *args) ++{ ++ int i; ++ pte_t *pte; ++ spinlock_t *ptl; ++ unsigned long addr; ++ int total = 0; ++ int young = 0; ++ struct lru_gen_mm_walk *walk = args->private; ++ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); ++ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); ++ ++ VM_WARN_ON_ONCE(pmd_leaf(*pmd)); ++ ++ ptl = pte_lockptr(args->mm, pmd); ++ if (!spin_trylock(ptl)) ++ return false; ++ ++ arch_enter_lazy_mmu_mode(); ++ ++ pte = pte_offset_map(pmd, start & PMD_MASK); ++restart: ++ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ unsigned long pfn; ++ struct page *page; ++ ++ total++; ++ walk->mm_stats[MM_LEAF_TOTAL]++; ++ ++ pfn = get_pte_pfn(pte[i], args->vma, addr); ++ if (pfn == -1) ++ continue; ++ ++ if (!pte_young(pte[i])) { ++ walk->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap); ++ if (!page) ++ continue; ++ ++ if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) ++ VM_WARN_ON_ONCE(true); ++ ++ young++; ++ walk->mm_stats[MM_LEAF_YOUNG]++; ++ ++ if (pte_dirty(pte[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && ++ !PageSwapCache(page))) ++ set_page_dirty(page); ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(walk, page, old_gen, new_gen); ++ } ++ ++ if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) ++ goto restart; ++ ++ pte_unmap(pte); ++ ++ arch_leave_lazy_mmu_mode(); ++ spin_unlock(ptl); ++ ++ return suitable_to_scan(total, young); ++} ++ ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) ++static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, ++ struct mm_walk *args, unsigned long *bitmap, unsigned long *start) ++{ ++ int i; ++ pmd_t *pmd; ++ spinlock_t *ptl; ++ struct lru_gen_mm_walk *walk = args->private; ++ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); ++ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); ++ ++ VM_WARN_ON_ONCE(pud_leaf(*pud)); ++ ++ /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ ++ if (*start == -1) { ++ *start = next; ++ return; ++ } ++ ++ i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); ++ if (i && i <= MIN_LRU_BATCH) { ++ __set_bit(i - 1, bitmap); ++ return; ++ } ++ ++ pmd = pmd_offset(pud, *start); ++ ++ ptl = pmd_lockptr(args->mm, pmd); ++ if (!spin_trylock(ptl)) ++ goto done; ++ ++ arch_enter_lazy_mmu_mode(); ++ ++ do { ++ unsigned long pfn; ++ struct page *page; ++ unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; ++ ++ pfn = get_pmd_pfn(pmd[i], vma, addr); ++ if (pfn == -1) ++ goto next; ++ ++ if (!pmd_trans_huge(pmd[i])) { ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) ++ pmdp_test_and_clear_young(vma, addr, pmd + i); ++ goto next; ++ } ++ ++ page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap); ++ if (!page) ++ goto next; ++ ++ if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) ++ goto next; ++ ++ walk->mm_stats[MM_LEAF_YOUNG]++; ++ ++ if (pmd_dirty(pmd[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && ++ !PageSwapCache(page))) ++ set_page_dirty(page); ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(walk, page, old_gen, new_gen); ++next: ++ i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; ++ } while (i <= MIN_LRU_BATCH); ++ ++ arch_leave_lazy_mmu_mode(); ++ spin_unlock(ptl); ++done: ++ *start = -1; ++ bitmap_zero(bitmap, MIN_LRU_BATCH); ++} ++#else ++static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, ++ struct mm_walk *args, unsigned long *bitmap, unsigned long *start) ++{ ++} ++#endif ++ ++static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, ++ struct mm_walk *args) ++{ ++ int i; ++ pmd_t *pmd; ++ unsigned long next; ++ unsigned long addr; ++ struct vm_area_struct *vma; ++ unsigned long pos = -1; ++ struct lru_gen_mm_walk *walk = args->private; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; ++ ++ VM_WARN_ON_ONCE(pud_leaf(*pud)); ++ ++ /* ++ * Finish an entire PMD in two passes: the first only reaches to PTE ++ * tables to avoid taking the PMD lock; the second, if necessary, takes ++ * the PMD lock to clear the accessed bit in PMD entries. ++ */ ++ pmd = pmd_offset(pud, start & PUD_MASK); ++restart: ++ /* walk_pte_range() may call get_next_vma() */ ++ vma = args->vma; ++ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { ++ pmd_t val = pmd_read_atomic(pmd + i); ++ ++ /* for pmd_read_atomic() */ ++ barrier(); ++ ++ next = pmd_addr_end(addr, end); ++ ++ if (!pmd_present(val) || is_huge_zero_pmd(val)) { ++ walk->mm_stats[MM_LEAF_TOTAL]++; ++ continue; ++ } ++ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ if (pmd_trans_huge(val)) { ++ unsigned long pfn = pmd_pfn(val); ++ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); ++ ++ walk->mm_stats[MM_LEAF_TOTAL]++; ++ ++ if (!pmd_young(val)) { ++ walk->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ /* try to avoid unnecessary memory loads */ ++ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) ++ continue; ++ ++ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ continue; ++ } ++#endif ++ walk->mm_stats[MM_NONLEAF_TOTAL]++; ++ ++#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG ++ if (!pmd_young(val)) ++ continue; ++ ++ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++#endif ++ if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) ++ continue; ++ ++ walk->mm_stats[MM_NONLEAF_FOUND]++; ++ ++ if (!walk_pte_range(&val, addr, next, args)) ++ continue; ++ ++ walk->mm_stats[MM_NONLEAF_ADDED]++; ++ ++ /* carry over to the next generation */ ++ update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); ++ } ++ ++ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); ++ ++ if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) ++ goto restart; ++} ++ ++static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, ++ struct mm_walk *args) ++{ ++ int i; ++ pud_t *pud; ++ unsigned long addr; ++ unsigned long next; ++ struct lru_gen_mm_walk *walk = args->private; ++ ++ VM_WARN_ON_ONCE(p4d_leaf(*p4d)); ++ ++ pud = pud_offset(p4d, start & P4D_MASK); ++restart: ++ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { ++ pud_t val = READ_ONCE(pud[i]); ++ ++ next = pud_addr_end(addr, end); ++ ++ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) ++ continue; ++ ++ walk_pmd_range(&val, addr, next, args); ++ ++ /* a racy check to curtail the waiting time */ ++ if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) ++ return 1; ++ ++ if (need_resched() || walk->batched >= MAX_LRU_BATCH) { ++ end = (addr | ~PUD_MASK) + 1; ++ goto done; ++ } ++ } ++ ++ if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) ++ goto restart; ++ ++ end = round_up(end, P4D_SIZE); ++done: ++ if (!end || !args->vma) ++ return 1; ++ ++ walk->next_addr = max(end, args->vma->vm_start); ++ ++ return -EAGAIN; ++} ++ ++static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) ++{ ++ static const struct mm_walk_ops mm_walk_ops = { ++ .test_walk = should_skip_vma, ++ .p4d_entry = walk_pud_range, ++ }; ++ ++ int err; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ walk->next_addr = FIRST_USER_ADDRESS; ++ ++ do { ++ err = -EBUSY; ++ ++ /* page_update_gen() requires stable page_memcg() */ ++ if (!mem_cgroup_trylock_pages(memcg)) ++ break; ++ ++ /* the caller might be holding the lock for write */ ++ if (mmap_read_trylock(mm)) { ++ err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); ++ ++ mmap_read_unlock(mm); ++ } ++ ++ mem_cgroup_unlock_pages(); ++ ++ if (walk->batched) { ++ spin_lock_irq(&lruvec->lru_lock); ++ reset_batch_size(lruvec, walk); ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++ ++ cond_resched(); ++ } while (err == -EAGAIN); ++} ++ ++static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) ++{ ++ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; ++ ++ if (pgdat && current_is_kswapd()) { ++ VM_WARN_ON_ONCE(walk); ++ ++ walk = &pgdat->mm_walk; ++ } else if (!pgdat && !walk) { ++ VM_WARN_ON_ONCE(current_is_kswapd()); ++ ++ walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); ++ } ++ ++ current->reclaim_state->mm_walk = walk; ++ ++ return walk; ++} ++ ++static void clear_mm_walk(void) ++{ ++ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; ++ ++ VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); ++ VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); ++ ++ current->reclaim_state->mm_walk = NULL; ++ ++ if (!current_is_kswapd()) ++ kfree(walk); ++} ++ + static void inc_min_seq(struct lruvec *lruvec, int type) + { + struct lru_gen_struct *lrugen = &lruvec->lrugen; +@@ -3136,7 +4001,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + return success; + } + +-static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap) ++static void inc_max_seq(struct lruvec *lruvec, bool can_swap) + { + int prev, next; + int type, zone; +@@ -3146,9 +4011,6 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_s + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + +- if (max_seq != lrugen->max_seq) +- goto unlock; +- + for (type = ANON_AND_FILE - 1; type >= 0; type--) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + continue; +@@ -3186,10 +4048,76 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_s + + /* make sure preceding modifications appear */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); +-unlock: ++ + spin_unlock_irq(&lruvec->lru_lock); + } + ++static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, ++ struct scan_control *sc, bool can_swap) ++{ ++ bool success; ++ struct lru_gen_mm_walk *walk; ++ struct mm_struct *mm = NULL; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); ++ ++ /* see the comment in iterate_mm_list() */ ++ if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { ++ success = false; ++ goto done; ++ } ++ ++ /* ++ * If the hardware doesn't automatically set the accessed bit, fallback ++ * to lru_gen_look_around(), which only clears the accessed bit in a ++ * handful of PTEs. Spreading the work out over a period of time usually ++ * is less efficient, but it avoids bursty page faults. ++ */ ++ if (!arch_has_hw_pte_young()) { ++ success = iterate_mm_list_nowalk(lruvec, max_seq); ++ goto done; ++ } ++ ++ walk = set_mm_walk(NULL); ++ if (!walk) { ++ success = iterate_mm_list_nowalk(lruvec, max_seq); ++ goto done; ++ } ++ ++ walk->lruvec = lruvec; ++ walk->max_seq = max_seq; ++ walk->can_swap = can_swap; ++ walk->force_scan = false; ++ ++ do { ++ success = iterate_mm_list(lruvec, walk, &mm); ++ if (mm) ++ walk_mm(lruvec, mm, walk); ++ ++ cond_resched(); ++ } while (mm); ++done: ++ if (!success) { ++ if (sc->priority <= DEF_PRIORITY - 2) ++ wait_event_killable(lruvec->mm_state.wait, ++ max_seq < READ_ONCE(lrugen->max_seq)); ++ ++ return max_seq < READ_ONCE(lrugen->max_seq); ++ } ++ ++ VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); ++ ++ inc_max_seq(lruvec, can_swap); ++ /* either this sees any waiters or they will see updated max_seq */ ++ if (wq_has_sleeper(&lruvec->mm_state.wait)) ++ wake_up_all(&lruvec->mm_state.wait); ++ ++ wakeup_flusher_threads(WB_REASON_VMSCAN); ++ ++ return true; ++} ++ + static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) + { +@@ -3265,7 +4193,7 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) + + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); + if (need_aging) +- inc_max_seq(lruvec, max_seq, swappiness); ++ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); + } + + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +@@ -3274,6 +4202,8 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + + VM_WARN_ON_ONCE(!current_is_kswapd()); + ++ set_mm_walk(pgdat); ++ + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); +@@ -3282,11 +4212,16 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ clear_mm_walk(); + } + + /* + * This function exploits spatial locality when shrink_page_list() walks the +- * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. ++ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If ++ * the scan was done cacheline efficiently, it adds the PMD entry pointing to ++ * the PTE table to the Bloom filter. This forms a feedback loop between the ++ * eviction and the aging. + */ + void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + { +@@ -3295,6 +4230,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + unsigned long start; + unsigned long end; + unsigned long addr; ++ struct lru_gen_mm_walk *walk; ++ int young = 0; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + struct page *page = pvmw->page; + struct mem_cgroup *memcg = page_memcg(page); +@@ -3309,6 +4246,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + if (spin_is_contended(pvmw->ptl)) + return; + ++ /* avoid taking the LRU lock under the PTL when possible */ ++ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; ++ + start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); + end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + +@@ -3338,13 +4278,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + if (!pte_young(pte[i])) + continue; + +- page = get_pfn_page(pfn, memcg, pgdat); ++ page = get_pfn_page(pfn, memcg, pgdat, !walk || walk->can_swap); + if (!page) + continue; + + if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) + VM_WARN_ON_ONCE(true); + ++ young++; ++ + if (pte_dirty(pte[i]) && !PageDirty(page) && + !(PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page))) +@@ -3360,7 +4302,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + arch_leave_lazy_mmu_mode(); + rcu_read_unlock(); + +- if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { ++ /* feedback from rmap walkers to page table walkers */ ++ if (suitable_to_scan(i, young)) ++ update_bloom_filter(lruvec, max_seq, pvmw->pmd); ++ ++ if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + page = pte_page(pte[i]); + activate_page(page); +@@ -3372,8 +4318,10 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + if (!mem_cgroup_trylock_pages(memcg)) + return; + +- spin_lock_irq(&lruvec->lru_lock); +- new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); ++ if (!walk) { ++ spin_lock_irq(&lruvec->lru_lock); ++ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); ++ } + + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + page = compound_head(pte_page(pte[i])); +@@ -3384,10 +4332,14 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + if (old_gen < 0 || old_gen == new_gen) + continue; + +- lru_gen_update_size(lruvec, page, old_gen, new_gen); ++ if (walk) ++ update_batch_size(walk, page, old_gen, new_gen); ++ else ++ lru_gen_update_size(lruvec, page, old_gen, new_gen); + } + +- spin_unlock_irq(&lruvec->lru_lock); ++ if (!walk) ++ spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_unlock_pages(); + } +@@ -3670,6 +4622,7 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp + struct page *page; + enum vm_event_item item; + struct reclaim_stat stat; ++ struct lru_gen_mm_walk *walk; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + +@@ -3706,6 +4659,10 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp + + move_pages_to_lru(lruvec, &list); + ++ walk = current->reclaim_state->mm_walk; ++ if (walk && walk->batched) ++ reset_batch_size(lruvec, walk); ++ + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (!cgroup_reclaim(sc)) + __count_vm_events(item, reclaimed); +@@ -3722,6 +4679,11 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp + return scanned; + } + ++/* ++ * For future optimizations: ++ * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg ++ * reclaim. ++ */ + static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, + bool can_swap) + { +@@ -3747,7 +4709,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * + if (current_is_kswapd()) + return 0; + +- inc_max_seq(lruvec, max_seq, can_swap); ++ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap)) ++ return nr_to_scan; + done: + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; + } +@@ -3761,6 +4724,8 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + + blk_start_plug(&plug); + ++ set_mm_walk(lruvec_pgdat(lruvec)); ++ + while (true) { + int delta; + int swappiness; +@@ -3788,6 +4753,8 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + cond_resched(); + } + ++ clear_mm_walk(); ++ + blk_finish_plug(&plug); + } + +@@ -3804,15 +4771,21 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++ ++ lruvec->mm_state.seq = MIN_NR_GENS; ++ init_waitqueue_head(&lruvec->mm_state.wait); + } + + #ifdef CONFIG_MEMCG + void lru_gen_init_memcg(struct mem_cgroup *memcg) + { ++ INIT_LIST_HEAD(&memcg->mm_list.fifo); ++ spin_lock_init(&memcg->mm_list.lock); + } + + void lru_gen_exit_memcg(struct mem_cgroup *memcg) + { ++ int i; + int nid; + + for_each_node(nid) { +@@ -3820,6 +4793,11 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) + + VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, + sizeof(lruvec->lrugen.nr_pages))); ++ ++ for (i = 0; i < NR_BLOOM_FILTERS; i++) { ++ bitmap_free(lruvec->mm_state.filters[i]); ++ lruvec->mm_state.filters[i] = NULL; ++ } + } + } + #endif +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-08-mm-multigenerational-lru-user-interface.patch b/target/linux/generic/backport-5.15/020-v6.1-08-mm-multigenerational-lru-user-interface.patch deleted file mode 100644 index f0753ea802..0000000000 --- a/target/linux/generic/backport-5.15/020-v6.1-08-mm-multigenerational-lru-user-interface.patch +++ /dev/null @@ -1,496 +0,0 @@ -From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001 -From: Yu Zhao <yuzhao@google.com> -Date: Mon, 25 Jan 2021 21:38:02 -0700 -Subject: [PATCH 08/10] mm: multigenerational lru: user interface - -Add /sys/kernel/mm/lru_gen/enabled to enable and disable the -multigenerational lru at runtime. - -Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a -given number of milliseconds. The OOM killer is invoked if this -working set cannot be kept in memory. - -Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and -invoke the aging and the eviction. This file has the following output: - memcg memcg_id memcg_path - node node_id - min_gen birth_time anon_size file_size - ... - max_gen birth_time anon_size file_size - -min_gen is the oldest generation number and max_gen is the youngest -generation number. birth_time is in milliseconds. anon_size and -file_size are in pages. - -This file takes the following input: - + memcg_id node_id max_gen [swappiness] [use_bloom_filter] - - memcg_id node_id min_gen [swappiness] [nr_to_reclaim] - -The first command line invokes the aging, which scans PTEs for -accessed pages and then creates the next generation max_gen+1. A swap -file and a non-zero swappiness, which overrides vm.swappiness, are -required to scan PTEs mapping anon pages. The second command line -invokes the eviction, which evicts generations less than or equal to -min_gen. min_gen should be less than max_gen-1 as max_gen and -max_gen-1 are not fully aged and therefore cannot be evicted. -Setting nr_to_reclaim to N limits the number of pages to evict. -Setting use_bloom_filter to 0 overrides the default behavior which -only scans PTE tables found populated. Multiple command lines are -supported, as is concatenation with delimiters "," and ";". - -Signed-off-by: Yu Zhao <yuzhao@google.com> -Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> -Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3 ---- - include/linux/nodemask.h | 1 + - mm/vmscan.c | 415 +++++++++++++++++++++++++++++++++++++++ - 2 files changed, 416 insertions(+) - ---- a/include/linux/nodemask.h -+++ b/include/linux/nodemask.h -@@ -485,6 +485,7 @@ static inline int num_node_state(enum no - #define first_online_node 0 - #define first_memory_node 0 - #define next_online_node(nid) (MAX_NUMNODES) -+#define next_memory_node(nid) (MAX_NUMNODES) - #define nr_node_ids 1U - #define nr_online_nodes 1U - ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -53,6 +53,8 @@ - #include <linux/memory.h> - #include <linux/pagewalk.h> - #include <linux/shmem_fs.h> -+#include <linux/ctype.h> -+#include <linux/debugfs.h> - - #include <asm/tlbflush.h> - #include <asm/div64.h> -@@ -4817,6 +4819,413 @@ unlock: - } - - /****************************************************************************** -+ * sysfs interface -+ ******************************************************************************/ -+ -+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); -+} -+ -+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t len) -+{ -+ unsigned int msecs; -+ -+ if (kstrtouint(buf, 10, &msecs)) -+ return -EINVAL; -+ -+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); -+ -+ return len; -+} -+ -+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( -+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl -+); -+ -+static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -+{ -+ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled()); -+} -+ -+static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t len) -+{ -+ bool enable; -+ -+ if (kstrtobool(buf, &enable)) -+ return -EINVAL; -+ -+ lru_gen_change_state(enable, true, false); -+ -+ return len; -+} -+ -+static struct kobj_attribute lru_gen_enabled_attr = __ATTR( -+ enabled, 0644, show_enable, store_enable -+); -+ -+static struct attribute *lru_gen_attrs[] = { -+ &lru_gen_min_ttl_attr.attr, -+ &lru_gen_enabled_attr.attr, -+ NULL -+}; -+ -+static struct attribute_group lru_gen_attr_group = { -+ .name = "lru_gen", -+ .attrs = lru_gen_attrs, -+}; -+ -+/****************************************************************************** -+ * debugfs interface -+ ******************************************************************************/ -+ -+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) -+{ -+ struct mem_cgroup *memcg; -+ loff_t nr_to_skip = *pos; -+ -+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL); -+ if (!m->private) -+ return ERR_PTR(-ENOMEM); -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ int nid; -+ -+ for_each_node_state(nid, N_MEMORY) { -+ if (!nr_to_skip--) -+ return get_lruvec(nid, memcg); -+ } -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+ -+ return NULL; -+} -+ -+static void lru_gen_seq_stop(struct seq_file *m, void *v) -+{ -+ if (!IS_ERR_OR_NULL(v)) -+ mem_cgroup_iter_break(NULL, lruvec_memcg(v)); -+ -+ kvfree(m->private); -+ m->private = NULL; -+} -+ -+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) -+{ -+ int nid = lruvec_pgdat(v)->node_id; -+ struct mem_cgroup *memcg = lruvec_memcg(v); -+ -+ ++*pos; -+ -+ nid = next_memory_node(nid); -+ if (nid == MAX_NUMNODES) { -+ memcg = mem_cgroup_iter(NULL, memcg, NULL); -+ if (!memcg) -+ return NULL; -+ -+ nid = first_memory_node; -+ } -+ -+ return get_lruvec(nid, memcg); -+} -+ -+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, -+ unsigned long max_seq, unsigned long *min_seq, -+ unsigned long seq) -+{ -+ int i; -+ int type, tier; -+ int hist = lru_hist_from_seq(seq); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ for (tier = 0; tier < MAX_NR_TIERS; tier++) { -+ seq_printf(m, " %10d", tier); -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ unsigned long n[3] = {}; -+ -+ if (seq == max_seq) { -+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); -+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]); -+ -+ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]); -+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { -+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); -+ -+ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]); -+ } else -+ seq_puts(m, " 0 0 0 "); -+ } -+ seq_putc(m, '\n'); -+ } -+ -+ seq_puts(m, " "); -+ for (i = 0; i < NR_MM_STATS; i++) { -+ if (seq == max_seq && NR_HIST_GENS == 1) -+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), -+ toupper(MM_STAT_CODES[i])); -+ else if (seq != max_seq && NR_HIST_GENS > 1) -+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), -+ MM_STAT_CODES[i]); -+ else -+ seq_puts(m, " 0 "); -+ } -+ seq_putc(m, '\n'); -+} -+ -+static int lru_gen_seq_show(struct seq_file *m, void *v) -+{ -+ unsigned long seq; -+ bool full = !debugfs_real_fops(m->file)->write; -+ struct lruvec *lruvec = v; -+ struct lrugen *lrugen = &lruvec->evictable; -+ int nid = lruvec_pgdat(lruvec)->node_id; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (nid == first_memory_node) { -+ const char *path = memcg ? m->private : ""; -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) -+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); -+#endif -+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); -+ } -+ -+ seq_printf(m, " node %5d\n", nid); -+ -+ if (!full) -+ seq = min_seq[0]; -+ else if (max_seq >= MAX_NR_GENS) -+ seq = max_seq - MAX_NR_GENS + 1; -+ else -+ seq = 0; -+ -+ for (; seq <= max_seq; seq++) { -+ int gen, type, zone; -+ unsigned int msecs; -+ -+ gen = lru_gen_from_seq(seq); -+ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen])); -+ -+ seq_printf(m, " %10lu %10u", seq, msecs); -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ long size = 0; -+ -+ if (seq < min_seq[type]) { -+ seq_puts(m, " -0 "); -+ continue; -+ } -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) -+ size += READ_ONCE(lrugen->sizes[gen][type][zone]); -+ -+ seq_printf(m, " %10lu ", max(size, 0L)); -+ } -+ -+ seq_putc(m, '\n'); -+ -+ if (full) -+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); -+ } -+ -+ return 0; -+} -+ -+static const struct seq_operations lru_gen_seq_ops = { -+ .start = lru_gen_seq_start, -+ .stop = lru_gen_seq_stop, -+ .next = lru_gen_seq_next, -+ .show = lru_gen_seq_show, -+}; -+ -+static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long seq, bool use_filter) -+{ -+ DEFINE_MAX_SEQ(lruvec); -+ -+ if (seq == max_seq) -+ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter); -+ -+ return seq > max_seq ? -EINVAL : 0; -+} -+ -+static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long seq, unsigned long nr_to_reclaim) -+{ -+ struct blk_plug plug; -+ int err = -EINTR; -+ DEFINE_MAX_SEQ(lruvec); -+ -+ if (seq >= max_seq - 1) -+ return -EINVAL; -+ -+ sc->nr_reclaimed = 0; -+ -+ blk_start_plug(&plug); -+ -+ while (!signal_pending(current)) { -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim || -+ !evict_pages(lruvec, sc, swappiness)) { -+ err = 0; -+ break; -+ } -+ -+ cond_resched(); -+ } -+ -+ blk_finish_plug(&plug); -+ -+ return err; -+} -+ -+static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc, -+ int swappiness, unsigned long seq, unsigned long opt) -+{ -+ struct lruvec *lruvec; -+ int err = -EINVAL; -+ struct mem_cgroup *memcg = NULL; -+ -+ if (!mem_cgroup_disabled()) { -+ rcu_read_lock(); -+ memcg = mem_cgroup_from_id(memcg_id); -+#ifdef CONFIG_MEMCG -+ if (memcg && !css_tryget(&memcg->css)) -+ memcg = NULL; -+#endif -+ rcu_read_unlock(); -+ -+ if (!memcg) -+ goto done; -+ } -+ if (memcg_id != mem_cgroup_id(memcg)) -+ goto done; -+ -+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) -+ goto done; -+ -+ lruvec = get_lruvec(nid, memcg); -+ -+ if (swappiness < 0) -+ swappiness = get_swappiness(memcg); -+ else if (swappiness > 200) -+ goto done; -+ -+ switch (cmd) { -+ case '+': -+ err = run_aging(lruvec, sc, swappiness, seq, opt); -+ break; -+ case '-': -+ err = run_eviction(lruvec, sc, swappiness, seq, opt); -+ break; -+ } -+done: -+ mem_cgroup_put(memcg); -+ -+ return err; -+} -+ -+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, -+ size_t len, loff_t *pos) -+{ -+ void *buf; -+ char *cur, *next; -+ unsigned int flags; -+ int err = 0; -+ struct scan_control sc = { -+ .may_writepage = 1, -+ .may_unmap = 1, -+ .may_swap = 1, -+ .reclaim_idx = MAX_NR_ZONES - 1, -+ .gfp_mask = GFP_KERNEL, -+ }; -+ -+ buf = kvmalloc(len + 1, GFP_KERNEL); -+ if (!buf) -+ return -ENOMEM; -+ -+ if (copy_from_user(buf, src, len)) { -+ kvfree(buf); -+ return -EFAULT; -+ } -+ -+ next = buf; -+ next[len] = '\0'; -+ -+ sc.reclaim_state.mm_walk_args = alloc_mm_walk_args(); -+ if (!sc.reclaim_state.mm_walk_args) { -+ kvfree(buf); -+ return -ENOMEM; -+ } -+ -+ flags = memalloc_noreclaim_save(); -+ set_task_reclaim_state(current, &sc.reclaim_state); -+ -+ while ((cur = strsep(&next, ",;\n"))) { -+ int n; -+ int end; -+ char cmd; -+ unsigned int memcg_id; -+ unsigned int nid; -+ unsigned long seq; -+ unsigned int swappiness = -1; -+ unsigned long opt = -1; -+ -+ cur = skip_spaces(cur); -+ if (!*cur) -+ continue; -+ -+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, -+ &seq, &end, &swappiness, &end, &opt, &end); -+ if (n < 4 || cur[end]) { -+ err = -EINVAL; -+ break; -+ } -+ -+ err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt); -+ if (err) -+ break; -+ } -+ -+ set_task_reclaim_state(current, NULL); -+ memalloc_noreclaim_restore(flags); -+ -+ free_mm_walk_args(sc.reclaim_state.mm_walk_args); -+ kvfree(buf); -+ -+ return err ? : len; -+} -+ -+static int lru_gen_seq_open(struct inode *inode, struct file *file) -+{ -+ return seq_open(file, &lru_gen_seq_ops); -+} -+ -+static const struct file_operations lru_gen_rw_fops = { -+ .open = lru_gen_seq_open, -+ .read = seq_read, -+ .write = lru_gen_seq_write, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+static const struct file_operations lru_gen_ro_fops = { -+ .open = lru_gen_seq_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+/****************************************************************************** - * initialization - ******************************************************************************/ - -@@ -4886,6 +5295,12 @@ static int __init init_lru_gen(void) - BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); - BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); - -+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) -+ pr_err("lru_gen: failed to create sysfs group\n"); -+ -+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); -+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); -+ - return 0; - }; - late_initcall(init_lru_gen); diff --git a/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch b/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch new file mode 100644 index 0000000000..e47bfc36d4 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch @@ -0,0 +1,320 @@ +From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 18 Sep 2022 02:00:06 -0600 +Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When multiple memcgs are available, it is possible to use generations as a +frame of reference to make better choices and improve overall performance +under global memory pressure. This patch adds a basic optimization to +select memcgs that can drop single-use unmapped clean pages first. Doing +so reduces the chance of going into the aging path or swapping, which can +be costly. + +A typical example that benefits from this optimization is a server running +mixed types of workloads, e.g., heavy anon workload in one memcg and heavy +buffered I/O workload in the other. + +Though this optimization can be applied to both kswapd and direct reclaim, +it is only added to kswapd to keep the patchset manageable. Later +improvements may cover the direct reclaim path. + +While ensuring certain fairness to all eligible memcgs, proportional scans +of individual memcgs also require proper backoff to avoid overshooting +their aggregate reclaim target by too much. Otherwise it can cause high +direct reclaim latency. The conditions for backoff are: + +1. At low priorities, for direct reclaim, if aging fairness or direct + reclaim latency is at risk, i.e., aging one memcg multiple times or + swapping after the target is met. +2. At high priorities, for global reclaim, if per-zone free pages are + above respective watermarks. + +Server benchmark results: + Mixed workloads: + fio (buffered I/O): +[19, 21]% + IOPS BW + patch1-8: 1880k 7343MiB/s + patch1-9: 2252k 8796MiB/s + + memcached (anon): +[119, 123]% + Ops/sec KB/sec + patch1-8: 862768.65 33514.68 + patch1-9: 1911022.12 74234.54 + + Mixed workloads: + fio (buffered I/O): +[75, 77]% + IOPS BW + 5.19-rc1: 1279k 4996MiB/s + patch1-9: 2252k 8796MiB/s + + memcached (anon): +[13, 15]% + Ops/sec KB/sec + 5.19-rc1: 1673524.04 65008.87 + patch1-9: 1911022.12 74234.54 + + Configurations: + (changes since patch 6) + + cat mixed.sh + modprobe brd rd_nr=2 rd_size=56623104 + + swapoff -a + mkswap /dev/ram0 + swapon /dev/ram0 + + mkfs.ext4 /dev/ram1 + mount -t ext4 /dev/ram1 /mnt + + memtier_benchmark -S /var/run/memcached/memcached.sock \ + -P memcache_binary -n allkeys --key-minimum=1 \ + --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \ + --ratio 1:0 --pipeline 8 -d 2000 + + fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \ + --buffered=1 --ioengine=io_uring --iodepth=128 \ + --iodepth_batch_submit=32 --iodepth_batch_complete=32 \ + --rw=randread --random_distribution=random --norandommap \ + --time_based --ramp_time=10m --runtime=90m --group_reporting & + pid=$! + + sleep 200 + + memtier_benchmark -S /var/run/memcached/memcached.sock \ + -P memcache_binary -n allkeys --key-minimum=1 \ + --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \ + --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed + + kill -INT $pid + wait + +Client benchmark results: + no change (CONFIG_MEMCG=n) + +Link: https://lkml.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Barry Song <baohua@kernel.org> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 96 insertions(+), 9 deletions(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index a7844c689522..b6f6fc2585e1 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -127,6 +127,12 @@ struct scan_control { + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + ++#ifdef CONFIG_LRU_GEN ++ /* help kswapd make better choices among multiple memcgs */ ++ unsigned int memcgs_need_aging:1; ++ unsigned long last_reclaimed; ++#endif ++ + /* Allocation order */ + s8 order; + +@@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + + VM_WARN_ON_ONCE(!current_is_kswapd()); + ++ sc->last_reclaimed = sc->nr_reclaimed; ++ ++ /* ++ * To reduce the chance of going into the aging path, which can be ++ * costly, optimistically skip it if the flag below was cleared in the ++ * eviction path. This improves the overall performance when multiple ++ * memcgs are available. ++ */ ++ if (!sc->memcgs_need_aging) { ++ sc->memcgs_need_aging = true; ++ return; ++ } ++ + set_mm_walk(pgdat); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); +@@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swa + return scanned; + } + +-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ bool *need_swapping) + { + int type; + int scanned; +@@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp + + sc->nr_reclaimed += reclaimed; + ++ if (need_swapping && type == LRU_GEN_ANON) ++ *need_swapping = true; ++ + return scanned; + } + +@@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp + * reclaim. + */ + static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, +- bool can_swap) ++ bool can_swap, bool *need_aging) + { +- bool need_aging; + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); +@@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + return 0; + +- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); +- if (!need_aging) ++ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); ++ if (!*need_aging) + return nr_to_scan; + + /* skip the aging path at the default priority */ +@@ -4715,10 +4737,68 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; + } + ++static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, ++ struct scan_control *sc, bool need_swapping) ++{ ++ int i; ++ DEFINE_MAX_SEQ(lruvec); ++ ++ if (!current_is_kswapd()) { ++ /* age each memcg once to ensure fairness */ ++ if (max_seq - seq > 1) ++ return true; ++ ++ /* over-swapping can increase allocation latency */ ++ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) ++ return true; ++ ++ /* give this thread a chance to exit and free its memory */ ++ if (fatal_signal_pending(current)) { ++ sc->nr_reclaimed += MIN_LRU_BATCH; ++ return true; ++ } ++ ++ if (cgroup_reclaim(sc)) ++ return false; ++ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) ++ return false; ++ ++ /* keep scanning at low priorities to ensure fairness */ ++ if (sc->priority > DEF_PRIORITY - 2) ++ return false; ++ ++ /* ++ * A minimum amount of work was done under global memory pressure. For ++ * kswapd, it may be overshooting. For direct reclaim, the target isn't ++ * met, and yet the allocation may still succeed, since kswapd may have ++ * caught up. In either case, it's better to stop now, and restart if ++ * necessary. ++ */ ++ for (i = 0; i <= sc->reclaim_idx; i++) { ++ unsigned long wmark; ++ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; ++ ++ if (!managed_zone(zone)) ++ continue; ++ ++ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone); ++ if (wmark > zone_page_state(zone, NR_FREE_PAGES)) ++ return false; ++ } ++ ++ sc->nr_reclaimed += MIN_LRU_BATCH; ++ ++ return true; ++} ++ + static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + struct blk_plug plug; ++ bool need_aging = false; ++ bool need_swapping = false; + unsigned long scanned = 0; ++ unsigned long reclaimed = sc->nr_reclaimed; ++ DEFINE_MAX_SEQ(lruvec); + + lru_add_drain(); + +@@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + else + swappiness = 0; + +- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); + if (!nr_to_scan) +- break; ++ goto done; + +- delta = evict_pages(lruvec, sc, swappiness); ++ delta = evict_pages(lruvec, sc, swappiness, &need_swapping); + if (!delta) +- break; ++ goto done; + + scanned += delta; + if (scanned >= nr_to_scan) + break; + ++ if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) ++ break; ++ + cond_resched(); + } + ++ /* see the comment in lru_gen_age_node() */ ++ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) ++ sc->memcgs_need_aging = false; ++done: + clear_mm_walk(); + + blk_finish_plug(&plug); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-09-mm-multigenerational-lru-Kconfig.patch b/target/linux/generic/backport-5.15/020-v6.1-09-mm-multigenerational-lru-Kconfig.patch deleted file mode 100644 index 4462549f99..0000000000 --- a/target/linux/generic/backport-5.15/020-v6.1-09-mm-multigenerational-lru-Kconfig.patch +++ /dev/null @@ -1,80 +0,0 @@ -From 3008095eb835d207dd7e5b60899aad17f32aa9f7 Mon Sep 17 00:00:00 2001 -From: Yu Zhao <yuzhao@google.com> -Date: Mon, 25 Jan 2021 21:47:24 -0700 -Subject: [PATCH 09/10] mm: multigenerational lru: Kconfig - -Add configuration options for the multigenerational lru. - -Signed-off-by: Yu Zhao <yuzhao@google.com> -Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> -Change-Id: Ic74ea07f8fb5f56e6904a1b80c3c286bc2911635 ---- - mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 59 insertions(+) - ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -899,4 +899,63 @@ config SECRETMEM - - source "mm/damon/Kconfig" - -+# the multigenerational lru { -+config LRU_GEN -+ bool "Multigenerational LRU" -+ depends on MMU -+ # the following options may leave not enough spare bits in page->flags -+ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP) -+ help -+ A high performance LRU implementation to heavily overcommit workloads -+ that are not IO bound. See Documentation/vm/multigen_lru.rst for -+ details. -+ -+ Warning: do not enable this option unless you plan to use it because -+ it introduces a small per-process and per-memcg and per-node memory -+ overhead. -+ -+config LRU_GEN_ENABLED -+ bool "Turn on by default" -+ depends on LRU_GEN -+ help -+ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option -+ changes it to 1. -+ -+ Warning: the default value is the fast path. See -+ Documentation/static-keys.txt for details. -+ -+config LRU_GEN_STATS -+ bool "Full stats for debugging" -+ depends on LRU_GEN -+ help -+ This option keeps full stats for each generation, which can be read -+ from /sys/kernel/debug/lru_gen_full. -+ -+ Warning: do not enable this option unless you plan to use it because -+ it introduces an additional small per-process and per-memcg and -+ per-node memory overhead. -+ -+config NR_LRU_GENS -+ int "Max number of generations" -+ depends on LRU_GEN -+ range 4 31 -+ default 7 -+ help -+ This will use order_base_2(N+1) spare bits from page flags. -+ -+ Warning: do not use numbers larger than necessary because each -+ generation introduces a small per-node and per-memcg memory overhead. -+ -+config TIERS_PER_GEN -+ int "Number of tiers per generation" -+ depends on LRU_GEN -+ range 2 5 -+ default 4 -+ help -+ This will use N-2 spare bits from page flags. -+ -+ Larger values generally offer better protection to active pages under -+ heavy buffered I/O workloads. -+# } -+ - endmenu diff --git a/target/linux/generic/backport-5.15/020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch b/target/linux/generic/backport-5.15/020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch new file mode 100644 index 0000000000..0adb15f5e2 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch @@ -0,0 +1,513 @@ +From 640db3a029dca909af47157ca18f52b29d34a1b9 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 18 Sep 2022 02:00:07 -0600 +Subject: [PATCH 10/29] mm: multi-gen LRU: kill switch +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that +can be disabled include: + 0x0001: the multi-gen LRU core + 0x0002: walking page table, when arch_has_hw_pte_young() returns + true + 0x0004: clearing the accessed bit in non-leaf PMD entries, when + CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y + [yYnN]: apply to all the components above +E.g., + echo y >/sys/kernel/mm/lru_gen/enabled + cat /sys/kernel/mm/lru_gen/enabled + 0x0007 + echo 5 >/sys/kernel/mm/lru_gen/enabled + cat /sys/kernel/mm/lru_gen/enabled + 0x0005 + +NB: the page table walks happen on the scale of seconds under heavy memory +pressure, in which case the mmap_lock contention is a lesser concern, +compared with the LRU lock contention and the I/O congestion. So far the +only well-known case of the mmap_lock contention happens on Android, due +to Scudo [1] which allocates several thousand VMAs for merely a few +hundred MBs. The SPF and the Maple Tree also have provided their own +assessments [2][3]. However, if walking page tables does worsen the +mmap_lock contention, the kill switch can be used to disable it. In this +case the multi-gen LRU will suffer a minor performance degradation, as +shown previously. + +Clearing the accessed bit in non-leaf PMD entries can also be disabled, +since this behavior was not tested on x86 varieties other than Intel and +AMD. + +[1] https://source.android.com/devices/tech/debug/scudo +[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/ +[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/ + +Link: https://lkml.kernel.org/r/20220918080010.2920238-11-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Barry Song <baohua@kernel.org> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/cgroup.h | 15 ++- + include/linux/mm_inline.h | 15 ++- + include/linux/mmzone.h | 9 ++ + kernel/cgroup/cgroup-internal.h | 1 - + mm/Kconfig | 6 + + mm/vmscan.c | 228 +++++++++++++++++++++++++++++++- + 6 files changed, 265 insertions(+), 9 deletions(-) + +diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h +index 45cdb12243e3..f9a5d6a81101 100644 +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgroup *cgrp) + css_put(&cgrp->self); + } + ++extern struct mutex cgroup_mutex; ++ ++static inline void cgroup_lock(void) ++{ ++ mutex_lock(&cgroup_mutex); ++} ++ ++static inline void cgroup_unlock(void) ++{ ++ mutex_unlock(&cgroup_mutex); ++} ++ + /** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for +@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgroup *cgrp) + * as locks used during the cgroup_subsys::attach() methods. + */ + #ifdef CONFIG_PROVE_RCU +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + #define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ +@@ -708,6 +719,8 @@ struct cgroup; + static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } + static inline void css_get(struct cgroup_subsys_state *css) {} + static inline void css_put(struct cgroup_subsys_state *css) {} ++static inline void cgroup_lock(void) {} ++static inline void cgroup_unlock(void) {} + static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) { return 0; } + static inline int cgroupstats_build(struct cgroupstats *stats, +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index 58aabb1ba020..e095c1c24311 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -91,10 +91,21 @@ static __always_inline enum lru_list page_lru(struct page *page) + + #ifdef CONFIG_LRU_GEN + ++#ifdef CONFIG_LRU_GEN_ENABLED + static inline bool lru_gen_enabled(void) + { +- return true; ++ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); ++ ++ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); ++} ++#else ++static inline bool lru_gen_enabled(void) ++{ ++ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); ++ ++ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); + } ++#endif + + static inline bool lru_gen_in_fault(void) + { +@@ -207,7 +218,7 @@ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bo + + VM_WARN_ON_ONCE_PAGE(gen != -1, page); + +- if (PageUnevictable(page)) ++ if (PageUnevictable(page) || !lrugen->enabled) + return false; + /* + * There are three common cases for this page: +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 659bab633bdf..edaf035503ed 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -364,6 +364,13 @@ enum { + LRU_GEN_FILE, + }; + ++enum { ++ LRU_GEN_CORE, ++ LRU_GEN_MM_WALK, ++ LRU_GEN_NONLEAF_YOUNG, ++ NR_LRU_GEN_CAPS ++}; ++ + #define MIN_LRU_BATCH BITS_PER_LONG + #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) + +@@ -405,6 +412,8 @@ struct lru_gen_struct { + /* can be modified without holding the LRU lock */ + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ /* whether the multi-gen LRU is enabled */ ++ bool enabled; + }; + + enum { +diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h +index d8fcc139ac05..28c32a01da7d 100644 +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -165,7 +165,6 @@ struct cgroup_mgctx { + #define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + extern struct cgroup_subsys *cgroup_subsys[]; + extern struct list_head cgroup_roots; +diff --git a/mm/Kconfig b/mm/Kconfig +index 62433f3cd7ae..4a7d0af3c39b 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -906,6 +906,12 @@ config LRU_GEN + help + A high performance LRU implementation to overcommit memory. + ++config LRU_GEN_ENABLED ++ bool "Enable by default" ++ depends on LRU_GEN ++ help ++ This option enables the multi-gen LRU by default. ++ + config LRU_GEN_STATS + bool "Full stats for debugging" + depends on LRU_GEN +diff --git a/mm/vmscan.c b/mm/vmscan.c +index b6f6fc2585e1..be37d996bc92 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -52,6 +52,7 @@ + #include <linux/psi.h> + #include <linux/pagewalk.h> + #include <linux/shmem_fs.h> ++#include <linux/ctype.h> + + #include <asm/tlbflush.h> + #include <asm/div64.h> +@@ -2841,6 +2842,14 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, + + #ifdef CONFIG_LRU_GEN + ++#ifdef CONFIG_LRU_GEN_ENABLED ++DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); ++#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) ++#else ++DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); ++#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) ++#endif ++ + /****************************************************************************** + * shorthand helpers + ******************************************************************************/ +@@ -3717,7 +3726,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area + goto next; + + if (!pmd_trans_huge(pmd[i])) { +- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && ++ get_cap(LRU_GEN_NONLEAF_YOUNG)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + goto next; + } +@@ -3815,10 +3825,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + walk->mm_stats[MM_NONLEAF_TOTAL]++; + + #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG +- if (!pmd_young(val)) +- continue; ++ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) { ++ if (!pmd_young(val)) ++ continue; + +- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ } + #endif + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + continue; +@@ -4080,7 +4092,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. + */ +- if (!arch_has_hw_pte_young()) { ++ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } +@@ -4845,6 +4857,208 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + blk_finish_plug(&plug); + } + ++/****************************************************************************** ++ * state change ++ ******************************************************************************/ ++ ++static bool __maybe_unused state_is_valid(struct lruvec *lruvec) ++{ ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ if (lrugen->enabled) { ++ enum lru_list lru; ++ ++ for_each_evictable_lru(lru) { ++ if (!list_empty(&lruvec->lists[lru])) ++ return false; ++ } ++ } else { ++ int gen, type, zone; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ if (!list_empty(&lrugen->lists[gen][type][zone])) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static bool fill_evictable(struct lruvec *lruvec) ++{ ++ enum lru_list lru; ++ int remaining = MAX_LRU_BATCH; ++ ++ for_each_evictable_lru(lru) { ++ int type = is_file_lru(lru); ++ bool active = is_active_lru(lru); ++ struct list_head *head = &lruvec->lists[lru]; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); ++ VM_WARN_ON_ONCE_PAGE(PageActive(page) != active, page); ++ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page); ++ VM_WARN_ON_ONCE_PAGE(page_lru_gen(page) != -1, page); ++ ++ del_page_from_lru_list(page, lruvec); ++ success = lru_gen_add_page(lruvec, page, false); ++ VM_WARN_ON_ONCE(!success); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static bool drain_evictable(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ int remaining = MAX_LRU_BATCH; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); ++ VM_WARN_ON_ONCE_PAGE(PageActive(page), page); ++ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page); ++ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); ++ ++ success = lru_gen_del_page(lruvec, page, false); ++ VM_WARN_ON_ONCE(!success); ++ add_page_to_lru_list(page, lruvec); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static void lru_gen_change_state(bool enabled) ++{ ++ static DEFINE_MUTEX(state_mutex); ++ ++ struct mem_cgroup *memcg; ++ ++ cgroup_lock(); ++ cpus_read_lock(); ++ get_online_mems(); ++ mutex_lock(&state_mutex); ++ ++ if (enabled == lru_gen_enabled()) ++ goto unlock; ++ ++ if (enabled) ++ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); ++ else ++ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ if (!lruvec) ++ continue; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); ++ VM_WARN_ON_ONCE(!state_is_valid(lruvec)); ++ ++ lruvec->lrugen.enabled = enabled; ++ ++ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++unlock: ++ mutex_unlock(&state_mutex); ++ put_online_mems(); ++ cpus_read_unlock(); ++ cgroup_unlock(); ++} ++ ++/****************************************************************************** ++ * sysfs interface ++ ******************************************************************************/ ++ ++static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ unsigned int caps = 0; ++ ++ if (get_cap(LRU_GEN_CORE)) ++ caps |= BIT(LRU_GEN_CORE); ++ ++ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) ++ caps |= BIT(LRU_GEN_MM_WALK); ++ ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG)) ++ caps |= BIT(LRU_GEN_NONLEAF_YOUNG); ++ ++ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); ++} ++ ++static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ int i; ++ unsigned int caps; ++ ++ if (tolower(*buf) == 'n') ++ caps = 0; ++ else if (tolower(*buf) == 'y') ++ caps = -1; ++ else if (kstrtouint(buf, 0, &caps)) ++ return -EINVAL; ++ ++ for (i = 0; i < NR_LRU_GEN_CAPS; i++) { ++ bool enabled = caps & BIT(i); ++ ++ if (i == LRU_GEN_CORE) ++ lru_gen_change_state(enabled); ++ else if (enabled) ++ static_branch_enable(&lru_gen_caps[i]); ++ else ++ static_branch_disable(&lru_gen_caps[i]); ++ } ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_enabled_attr = __ATTR( ++ enabled, 0644, show_enabled, store_enabled ++); ++ ++static struct attribute *lru_gen_attrs[] = { ++ &lru_gen_enabled_attr.attr, ++ NULL ++}; ++ ++static struct attribute_group lru_gen_attr_group = { ++ .name = "lru_gen", ++ .attrs = lru_gen_attrs, ++}; ++ + /****************************************************************************** + * initialization + ******************************************************************************/ +@@ -4855,6 +5069,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; ++ lrugen->enabled = lru_gen_enabled(); + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); +@@ -4894,6 +5109,9 @@ static int __init init_lru_gen(void) + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + ++ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) ++ pr_err("lru_gen: failed to create sysfs group\n"); ++ + return 0; + }; + late_initcall(init_lru_gen); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-10-mm-multigenerational-lru-documentation.patch b/target/linux/generic/backport-5.15/020-v6.1-10-mm-multigenerational-lru-documentation.patch deleted file mode 100644 index f4716fb68d..0000000000 --- a/target/linux/generic/backport-5.15/020-v6.1-10-mm-multigenerational-lru-documentation.patch +++ /dev/null @@ -1,161 +0,0 @@ -From f59c618ed70a1e48accc4cad91a200966f2569c9 Mon Sep 17 00:00:00 2001 -From: Yu Zhao <yuzhao@google.com> -Date: Tue, 2 Feb 2021 01:27:45 -0700 -Subject: [PATCH 10/10] mm: multigenerational lru: documentation - -Add Documentation/vm/multigen_lru.rst. - -Signed-off-by: Yu Zhao <yuzhao@google.com> -Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> -Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e ---- - Documentation/vm/index.rst | 1 + - Documentation/vm/multigen_lru.rst | 132 ++++++++++++++++++++++++++++++ - 2 files changed, 133 insertions(+) - create mode 100644 Documentation/vm/multigen_lru.rst - ---- a/Documentation/vm/index.rst -+++ b/Documentation/vm/index.rst -@@ -17,6 +17,7 @@ various features of the Linux memory man - - swap_numa - zswap -+ multigen_lru - - Kernel developers MM documentation - ================================== ---- /dev/null -+++ b/Documentation/vm/multigen_lru.rst -@@ -0,0 +1,132 @@ -+.. SPDX-License-Identifier: GPL-2.0 -+ -+===================== -+Multigenerational LRU -+===================== -+ -+Quick Start -+=========== -+Build Configurations -+-------------------- -+:Required: Set ``CONFIG_LRU_GEN=y``. -+ -+:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by -+ default. -+ -+Runtime Configurations -+---------------------- -+:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the -+ feature was not turned on by default. -+ -+:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to -+ protect the working set of ``N`` milliseconds. The OOM killer is -+ invoked if this working set cannot be kept in memory. -+ -+:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature -+ is turned on. This file has the following output: -+ -+:: -+ -+ memcg memcg_id memcg_path -+ node node_id -+ min_gen birth_time anon_size file_size -+ ... -+ max_gen birth_time anon_size file_size -+ -+``min_gen`` is the oldest generation number and ``max_gen`` is the -+youngest generation number. ``birth_time`` is in milliseconds. -+``anon_size`` and ``file_size`` are in pages. -+ -+Phones/Laptops/Workstations -+--------------------------- -+No additional configurations required. -+ -+Servers/Data Centers -+-------------------- -+:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a -+ larger number. -+ -+:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger -+ number. -+ -+:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``. -+ -+:Working set estimation: Write ``+ memcg_id node_id max_gen -+ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to -+ invoke the aging, which scans PTEs for accessed pages and then -+ creates the next generation ``max_gen+1``. A swap file and a non-zero -+ ``swappiness``, which overrides ``vm.swappiness``, are required to -+ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to -+ override the default behavior which only scans PTE tables found -+ populated. -+ -+:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness] -+ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the -+ eviction, which evicts generations less than or equal to ``min_gen``. -+ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and -+ ``max_gen-1`` are not fully aged and therefore cannot be evicted. -+ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple -+ command lines are supported, so does concatenation with delimiters -+ ``,`` and ``;``. -+ -+Framework -+========= -+For each ``lruvec``, evictable pages are divided into multiple -+generations. The youngest generation number is stored in -+``lrugen->max_seq`` for both anon and file types as they are aged on -+an equal footing. The oldest generation numbers are stored in -+``lrugen->min_seq[]`` separately for anon and file types as clean -+file pages can be evicted regardless of swap and writeback -+constraints. These three variables are monotonically increasing. -+Generation numbers are truncated into -+``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into -+``page->flags``. The sliding window technique is used to prevent -+truncated generation numbers from overlapping. Each truncated -+generation number is an index to an array of per-type and per-zone -+lists ``lrugen->lists``. -+ -+Each generation is divided into multiple tiers. Tiers represent -+different ranges of numbers of accesses from file descriptors only. -+Pages accessed ``N`` times via file descriptors belong to tier -+``order_base_2(N)``. Each generation contains at most -+``CONFIG_TIERS_PER_GEN`` tiers, and they require additional -+``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to -+moving between generations which requires list operations, moving -+between tiers only involves operations on ``page->flags`` and -+therefore has a negligible cost. A feedback loop modeled after the PID -+controller monitors refaulted % across all tiers and decides when to -+protect pages from which tiers. -+ -+The framework comprises two conceptually independent components: the -+aging and the eviction, which can be invoked separately from user -+space for the purpose of working set estimation and proactive reclaim. -+ -+Aging -+----- -+The aging produces young generations. Given an ``lruvec``, the aging -+traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()`` -+to scan PTEs for accessed pages (a ``mm_struct`` list is maintained -+for each ``memcg``). Upon finding one, the aging updates its -+generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``). -+After each round of traversal, the aging increments ``max_seq``. The -+aging is due when ``min_seq[]`` reaches ``max_seq-1``. -+ -+Eviction -+-------- -+The eviction consumes old generations. Given an ``lruvec``, the -+eviction scans pages on the per-zone lists indexed by anon and file -+``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to -+select a type based on the values of ``min_seq[]``. If they are -+equal, it selects the type that has a lower refaulted %. The eviction -+sorts a page according to its updated generation number if the aging -+has found this page accessed. It also moves a page to the next -+generation if this page is from an upper tier that has a higher -+refaulted % than the base tier. The eviction increments ``min_seq[]`` -+of a selected type when it finds all the per-zone lists indexed by -+``min_seq[]`` of this selected type are empty. -+ -+To-do List -+========== -+KVM Optimization -+---------------- -+Support shadow page table walk. diff --git a/target/linux/generic/backport-5.15/020-v6.1-11-mm-multi-gen-LRU-thrashing-prevention.patch b/target/linux/generic/backport-5.15/020-v6.1-11-mm-multi-gen-LRU-thrashing-prevention.patch new file mode 100644 index 0000000000..5b1ed03ca9 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-11-mm-multi-gen-LRU-thrashing-prevention.patch @@ -0,0 +1,233 @@ +From 73d1ff551760f0c79c47ab70faa4c2ca91413f5c Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 18 Sep 2022 02:00:08 -0600 +Subject: [PATCH 11/29] mm: multi-gen LRU: thrashing prevention +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as +requested by many desktop users [1]. + +When set to value N, it prevents the working set of N milliseconds from +getting evicted. The OOM killer is triggered if this working set cannot +be kept in memory. Based on the average human detectable lag (~100ms), +N=1000 usually eliminates intolerable lags due to thrashing. Larger +values like N=3000 make lags less noticeable at the risk of premature OOM +kills. + +Compared with the size-based approach [2], this time-based approach +has the following advantages: + +1. It is easier to configure because it is agnostic to applications + and memory sizes. +2. It is more reliable because it is directly wired to the OOM killer. + +[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/ +[2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/ + +Link: https://lkml.kernel.org/r/20220918080010.2920238-12-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Barry Song <baohua@kernel.org> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/mmzone.h | 2 ++ + mm/vmscan.c | 74 ++++++++++++++++++++++++++++++++++++++++-- + 2 files changed, 73 insertions(+), 3 deletions(-) + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index edaf035503ed..6b85ba1f4e18 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -399,6 +399,8 @@ struct lru_gen_struct { + unsigned long max_seq; + /* the eviction increments the oldest generation numbers */ + unsigned long min_seq[ANON_AND_FILE]; ++ /* the birth time of each generation in jiffies */ ++ unsigned long timestamps[MAX_NR_GENS]; + /* the multi-gen LRU lists, lazily sorted on eviction */ + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the multi-gen LRU sizes, eventually consistent */ +diff --git a/mm/vmscan.c b/mm/vmscan.c +index be37d996bc92..642ee7bef61d 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -4064,6 +4064,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap) + for (type = 0; type < ANON_AND_FILE; type++) + reset_ctrl_pos(lruvec, type, false); + ++ WRITE_ONCE(lrugen->timestamps[next], jiffies); + /* make sure preceding modifications appear */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); + +@@ -4193,7 +4194,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig + return false; + } + +-static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) + { + bool need_aging; + unsigned long nr_to_scan; +@@ -4207,16 +4208,36 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(memcg)) +- return; ++ return false; + + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); ++ ++ if (min_ttl) { ++ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); ++ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); ++ ++ if (time_is_after_jiffies(birth + min_ttl)) ++ return false; ++ ++ /* the size is likely too small to be helpful */ ++ if (!nr_to_scan && sc->priority != DEF_PRIORITY) ++ return false; ++ } ++ + if (need_aging) + try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); ++ ++ return true; + } + ++/* to protect the working set of the last N jiffies */ ++static unsigned long lru_gen_min_ttl __read_mostly; ++ + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { + struct mem_cgroup *memcg; ++ bool success = false; ++ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + + VM_WARN_ON_ONCE(!current_is_kswapd()); + +@@ -4239,12 +4260,32 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + +- age_lruvec(lruvec, sc); ++ if (age_lruvec(lruvec, sc, min_ttl)) ++ success = true; + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + clear_mm_walk(); ++ ++ /* check the order to exclude compaction-induced reclaim */ ++ if (success || !min_ttl || sc->order) ++ return; ++ ++ /* ++ * The main goal is to OOM kill if every generation from all memcgs is ++ * younger than min_ttl. However, another possibility is all memcgs are ++ * either below min or empty. ++ */ ++ if (mutex_trylock(&oom_lock)) { ++ struct oom_control oc = { ++ .gfp_mask = sc->gfp_mask, ++ }; ++ ++ out_of_memory(&oc); ++ ++ mutex_unlock(&oom_lock); ++ } + } + + /* +@@ -5002,6 +5043,28 @@ static void lru_gen_change_state(bool enabled) + * sysfs interface + ******************************************************************************/ + ++static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); ++} ++ ++static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ unsigned int msecs; ++ ++ if (kstrtouint(buf, 0, &msecs)) ++ return -EINVAL; ++ ++ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( ++ min_ttl_ms, 0644, show_min_ttl, store_min_ttl ++); ++ + static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) + { + unsigned int caps = 0; +@@ -5050,6 +5113,7 @@ static struct kobj_attribute lru_gen_enabled_attr = __ATTR( + ); + + static struct attribute *lru_gen_attrs[] = { ++ &lru_gen_min_ttl_attr.attr, + &lru_gen_enabled_attr.attr, + NULL + }; +@@ -5065,12 +5129,16 @@ static struct attribute_group lru_gen_attr_group = { + + void lru_gen_init_lruvec(struct lruvec *lruvec) + { ++ int i; + int gen, type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled = lru_gen_enabled(); + ++ for (i = 0; i <= MIN_NR_GENS + 1; i++) ++ lrugen->timestamps[i] = jiffies; ++ + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); + +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-12-mm-multi-gen-LRU-debugfs-interface.patch b/target/linux/generic/backport-5.15/020-v6.1-12-mm-multi-gen-LRU-debugfs-interface.patch new file mode 100644 index 0000000000..ae9d76faa4 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-12-mm-multi-gen-LRU-debugfs-interface.patch @@ -0,0 +1,586 @@ +From 530716d008ca26315f246cd70dc1cefc636beaa4 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 18 Sep 2022 02:00:09 -0600 +Subject: [PATCH 12/29] mm: multi-gen LRU: debugfs interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add /sys/kernel/debug/lru_gen for working set estimation and proactive +reclaim. These techniques are commonly used to optimize job scheduling +(bin packing) in data centers [1][2]. + +Compared with the page table-based approach and the PFN-based +approach, this lruvec-based approach has the following advantages: +1. It offers better choices because it is aware of memcgs, NUMA nodes, + shared mappings and unmapped page cache. +2. It is more scalable because it is O(nr_hot_pages), whereas the + PFN-based approach is O(nr_total_pages). + +Add /sys/kernel/debug/lru_gen_full for debugging. + +[1] https://dl.acm.org/doi/10.1145/3297858.3304053 +[2] https://dl.acm.org/doi/10.1145/3503222.3507731 + +Link: https://lkml.kernel.org/r/20220918080010.2920238-13-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Barry Song <baohua@kernel.org> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/nodemask.h | 1 + + mm/vmscan.c | 411 ++++++++++++++++++++++++++++++++++++++- + 2 files changed, 402 insertions(+), 10 deletions(-) + +diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h +index 0f233b76c9ce..292ec0ce0d63 100644 +--- a/include/linux/nodemask.h ++++ b/include/linux/nodemask.h +@@ -485,6 +485,7 @@ static inline int num_node_state(enum node_states state) + #define first_online_node 0 + #define first_memory_node 0 + #define next_online_node(nid) (MAX_NUMNODES) ++#define next_memory_node(nid) (MAX_NUMNODES) + #define nr_node_ids 1U + #define nr_online_nodes 1U + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 642ee7bef61d..b74b334488d8 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -53,6 +53,7 @@ + #include <linux/pagewalk.h> + #include <linux/shmem_fs.h> + #include <linux/ctype.h> ++#include <linux/debugfs.h> + + #include <asm/tlbflush.h> + #include <asm/div64.h> +@@ -3968,12 +3969,40 @@ static void clear_mm_walk(void) + kfree(walk); + } + +-static void inc_min_seq(struct lruvec *lruvec, int type) ++static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) + { ++ int zone; ++ int remaining = MAX_LRU_BATCH; + struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ if (type == LRU_GEN_ANON && !can_swap) ++ goto done; ++ ++ /* prevent cold/hot inversion if force_scan is true */ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ struct list_head *head = &lrugen->lists[old_gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ ++ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); ++ VM_WARN_ON_ONCE_PAGE(PageActive(page), page); ++ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page); ++ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); + ++ new_gen = page_inc_gen(lruvec, page, false); ++ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++done: + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); ++ ++ return true; + } + + static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +@@ -4019,7 +4048,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + return success; + } + +-static void inc_max_seq(struct lruvec *lruvec, bool can_swap) ++static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) + { + int prev, next; + int type, zone; +@@ -4033,9 +4062,13 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap) + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + continue; + +- VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap); ++ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); + +- inc_min_seq(lruvec, type); ++ while (!inc_min_seq(lruvec, type, can_swap)) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } + } + + /* +@@ -4072,7 +4105,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap) + } + + static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, +- struct scan_control *sc, bool can_swap) ++ struct scan_control *sc, bool can_swap, bool force_scan) + { + bool success; + struct lru_gen_mm_walk *walk; +@@ -4093,7 +4126,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. + */ +- if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { ++ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } +@@ -4107,7 +4140,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + walk->lruvec = lruvec; + walk->max_seq = max_seq; + walk->can_swap = can_swap; +- walk->force_scan = false; ++ walk->force_scan = force_scan; + + do { + success = iterate_mm_list(lruvec, walk, &mm); +@@ -4127,7 +4160,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + + VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); + +- inc_max_seq(lruvec, can_swap); ++ inc_max_seq(lruvec, can_swap, force_scan); + /* either this sees any waiters or they will see updated max_seq */ + if (wq_has_sleeper(&lruvec->mm_state.wait)) + wake_up_all(&lruvec->mm_state.wait); +@@ -4225,7 +4258,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned + } + + if (need_aging) +- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); ++ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); + + return true; + } +@@ -4784,7 +4817,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * + if (current_is_kswapd()) + return 0; + +- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap)) ++ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) + return nr_to_scan; + done: + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; +@@ -5123,6 +5156,361 @@ static struct attribute_group lru_gen_attr_group = { + .attrs = lru_gen_attrs, + }; + ++/****************************************************************************** ++ * debugfs interface ++ ******************************************************************************/ ++ ++static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct mem_cgroup *memcg; ++ loff_t nr_to_skip = *pos; ++ ++ m->private = kvmalloc(PATH_MAX, GFP_KERNEL); ++ if (!m->private) ++ return ERR_PTR(-ENOMEM); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node_state(nid, N_MEMORY) { ++ if (!nr_to_skip--) ++ return get_lruvec(memcg, nid); ++ } ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ return NULL; ++} ++ ++static void lru_gen_seq_stop(struct seq_file *m, void *v) ++{ ++ if (!IS_ERR_OR_NULL(v)) ++ mem_cgroup_iter_break(NULL, lruvec_memcg(v)); ++ ++ kvfree(m->private); ++ m->private = NULL; ++} ++ ++static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ int nid = lruvec_pgdat(v)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(v); ++ ++ ++*pos; ++ ++ nid = next_memory_node(nid); ++ if (nid == MAX_NUMNODES) { ++ memcg = mem_cgroup_iter(NULL, memcg, NULL); ++ if (!memcg) ++ return NULL; ++ ++ nid = first_memory_node; ++ } ++ ++ return get_lruvec(memcg, nid); ++} ++ ++static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, ++ unsigned long max_seq, unsigned long *min_seq, ++ unsigned long seq) ++{ ++ int i; ++ int type, tier; ++ int hist = lru_hist_from_seq(seq); ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ seq_printf(m, " %10d", tier); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ const char *s = " "; ++ unsigned long n[3] = {}; ++ ++ if (seq == max_seq) { ++ s = "RT "; ++ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); ++ n[1] = READ_ONCE(lrugen->avg_total[type][tier]); ++ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { ++ s = "rep"; ++ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); ++ } ++ ++ for (i = 0; i < 3; i++) ++ seq_printf(m, " %10lu%c", n[i], s[i]); ++ } ++ seq_putc(m, '\n'); ++ } ++ ++ seq_puts(m, " "); ++ for (i = 0; i < NR_MM_STATS; i++) { ++ const char *s = " "; ++ unsigned long n = 0; ++ ++ if (seq == max_seq && NR_HIST_GENS == 1) { ++ s = "LOYNFA"; ++ n = READ_ONCE(lruvec->mm_state.stats[hist][i]); ++ } else if (seq != max_seq && NR_HIST_GENS > 1) { ++ s = "loynfa"; ++ n = READ_ONCE(lruvec->mm_state.stats[hist][i]); ++ } ++ ++ seq_printf(m, " %10lu%c", n, s[i]); ++ } ++ seq_putc(m, '\n'); ++} ++ ++static int lru_gen_seq_show(struct seq_file *m, void *v) ++{ ++ unsigned long seq; ++ bool full = !debugfs_real_fops(m->file)->write; ++ struct lruvec *lruvec = v; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ int nid = lruvec_pgdat(lruvec)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (nid == first_memory_node) { ++ const char *path = memcg ? m->private : ""; ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) ++ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); ++#endif ++ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); ++ } ++ ++ seq_printf(m, " node %5d\n", nid); ++ ++ if (!full) ++ seq = min_seq[LRU_GEN_ANON]; ++ else if (max_seq >= MAX_NR_GENS) ++ seq = max_seq - MAX_NR_GENS + 1; ++ else ++ seq = 0; ++ ++ for (; seq <= max_seq; seq++) { ++ int type, zone; ++ int gen = lru_gen_from_seq(seq); ++ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); ++ ++ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ unsigned long size = 0; ++ char mark = full && seq < min_seq[type] ? 'x' : ' '; ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); ++ ++ seq_printf(m, " %10lu%c", size, mark); ++ } ++ ++ seq_putc(m, '\n'); ++ ++ if (full) ++ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); ++ } ++ ++ return 0; ++} ++ ++static const struct seq_operations lru_gen_seq_ops = { ++ .start = lru_gen_seq_start, ++ .stop = lru_gen_seq_stop, ++ .next = lru_gen_seq_next, ++ .show = lru_gen_seq_show, ++}; ++ ++static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, ++ bool can_swap, bool force_scan) ++{ ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (seq < max_seq) ++ return 0; ++ ++ if (seq > max_seq) ++ return -EINVAL; ++ ++ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) ++ return -ERANGE; ++ ++ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); ++ ++ return 0; ++} ++ ++static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, ++ int swappiness, unsigned long nr_to_reclaim) ++{ ++ DEFINE_MAX_SEQ(lruvec); ++ ++ if (seq + MIN_NR_GENS > max_seq) ++ return -EINVAL; ++ ++ sc->nr_reclaimed = 0; ++ ++ while (!signal_pending(current)) { ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (seq < min_seq[!swappiness]) ++ return 0; ++ ++ if (sc->nr_reclaimed >= nr_to_reclaim) ++ return 0; ++ ++ if (!evict_pages(lruvec, sc, swappiness, NULL)) ++ return 0; ++ ++ cond_resched(); ++ } ++ ++ return -EINTR; ++} ++ ++static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, ++ struct scan_control *sc, int swappiness, unsigned long opt) ++{ ++ struct lruvec *lruvec; ++ int err = -EINVAL; ++ struct mem_cgroup *memcg = NULL; ++ ++ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) ++ return -EINVAL; ++ ++ if (!mem_cgroup_disabled()) { ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_id(memcg_id); ++#ifdef CONFIG_MEMCG ++ if (memcg && !css_tryget(&memcg->css)) ++ memcg = NULL; ++#endif ++ rcu_read_unlock(); ++ ++ if (!memcg) ++ return -EINVAL; ++ } ++ ++ if (memcg_id != mem_cgroup_id(memcg)) ++ goto done; ++ ++ lruvec = get_lruvec(memcg, nid); ++ ++ if (swappiness < 0) ++ swappiness = get_swappiness(lruvec, sc); ++ else if (swappiness > 200) ++ goto done; ++ ++ switch (cmd) { ++ case '+': ++ err = run_aging(lruvec, seq, sc, swappiness, opt); ++ break; ++ case '-': ++ err = run_eviction(lruvec, seq, sc, swappiness, opt); ++ break; ++ } ++done: ++ mem_cgroup_put(memcg); ++ ++ return err; ++} ++ ++static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, ++ size_t len, loff_t *pos) ++{ ++ void *buf; ++ char *cur, *next; ++ unsigned int flags; ++ struct blk_plug plug; ++ int err = -EINVAL; ++ struct scan_control sc = { ++ .may_writepage = true, ++ .may_unmap = true, ++ .may_swap = true, ++ .reclaim_idx = MAX_NR_ZONES - 1, ++ .gfp_mask = GFP_KERNEL, ++ }; ++ ++ buf = kvmalloc(len + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ ++ if (copy_from_user(buf, src, len)) { ++ kvfree(buf); ++ return -EFAULT; ++ } ++ ++ set_task_reclaim_state(current, &sc.reclaim_state); ++ flags = memalloc_noreclaim_save(); ++ blk_start_plug(&plug); ++ if (!set_mm_walk(NULL)) { ++ err = -ENOMEM; ++ goto done; ++ } ++ ++ next = buf; ++ next[len] = '\0'; ++ ++ while ((cur = strsep(&next, ",;\n"))) { ++ int n; ++ int end; ++ char cmd; ++ unsigned int memcg_id; ++ unsigned int nid; ++ unsigned long seq; ++ unsigned int swappiness = -1; ++ unsigned long opt = -1; ++ ++ cur = skip_spaces(cur); ++ if (!*cur) ++ continue; ++ ++ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, ++ &seq, &end, &swappiness, &end, &opt, &end); ++ if (n < 4 || cur[end]) { ++ err = -EINVAL; ++ break; ++ } ++ ++ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); ++ if (err) ++ break; ++ } ++done: ++ clear_mm_walk(); ++ blk_finish_plug(&plug); ++ memalloc_noreclaim_restore(flags); ++ set_task_reclaim_state(current, NULL); ++ ++ kvfree(buf); ++ ++ return err ? : len; ++} ++ ++static int lru_gen_seq_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &lru_gen_seq_ops); ++} ++ ++static const struct file_operations lru_gen_rw_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .write = lru_gen_seq_write, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static const struct file_operations lru_gen_ro_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ + /****************************************************************************** + * initialization + ******************************************************************************/ +@@ -5180,6 +5568,9 @@ static int __init init_lru_gen(void) + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) + pr_err("lru_gen: failed to create sysfs group\n"); + ++ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); ++ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); ++ + return 0; + }; + late_initcall(init_lru_gen); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/021-v6.1-mm-mglru-don-t-sync-disk-for-each-aging-cycle.patch b/target/linux/generic/backport-5.15/020-v6.1-13-mm-mglru-don-t-sync-disk-for-each-aging-cycle.patch index 6cc4b3368f..a2318499e7 100644 --- a/target/linux/generic/backport-5.15/021-v6.1-mm-mglru-don-t-sync-disk-for-each-aging-cycle.patch +++ b/target/linux/generic/backport-5.15/020-v6.1-13-mm-mglru-don-t-sync-disk-for-each-aging-cycle.patch @@ -1,7 +1,7 @@ -From 14aa8b2d5c2ebead01b542f62d68029023054774 Mon Sep 17 00:00:00 2001 +From 92d430e8955c976eacb7cc91d7ff849c0dd009af Mon Sep 17 00:00:00 2001 From: Yu Zhao <yuzhao@google.com> Date: Wed, 28 Sep 2022 13:36:58 -0600 -Subject: [PATCH 1/1] mm/mglru: don't sync disk for each aging cycle +Subject: [PATCH 13/29] mm/mglru: don't sync disk for each aging cycle wakeup_flusher_threads() was added under the assumption that if a system runs out of clean cold pages, it might want to write back dirty pages more @@ -19,14 +19,19 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org> mm/vmscan.c | 2 -- 1 file changed, 2 deletions(-) +diff --git a/mm/vmscan.c b/mm/vmscan.c +index b74b334488d8..1c0875e6514a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -4007,8 +4007,6 @@ static bool try_to_inc_max_seq(struct lr - if (wq_has_sleeper(&lruvec->mm_walk.wait)) - wake_up_all(&lruvec->mm_walk.wait); +@@ -4165,8 +4165,6 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + if (wq_has_sleeper(&lruvec->mm_state.wait)) + wake_up_all(&lruvec->mm_state.wait); - wakeup_flusher_threads(WB_REASON_VMSCAN); - return true; } +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-14-mm-multi-gen-LRU-retry-pages-written-back-while-isol.patch b/target/linux/generic/backport-5.15/020-v6.1-14-mm-multi-gen-LRU-retry-pages-written-back-while-isol.patch new file mode 100644 index 0000000000..ffdebafa2c --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-14-mm-multi-gen-LRU-retry-pages-written-back-while-isol.patch @@ -0,0 +1,129 @@ +From 6f315879ad750391a0b1fab8c9170bc054a5f5d7 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Tue, 15 Nov 2022 18:38:07 -0700 +Subject: [PATCH 14/29] mm: multi-gen LRU: retry pages written back while + isolated + +The page reclaim isolates a batch of pages from the tail of one of the +LRU lists and works on those pages one by one. For a suitable +swap-backed page, if the swap device is async, it queues that page for +writeback. After the page reclaim finishes an entire batch, it puts back +the pages it queued for writeback to the head of the original LRU list. + +In the meantime, the page writeback flushes the queued pages also by +batches. Its batching logic is independent from that of the page reclaim. +For each of the pages it writes back, the page writeback calls +rotate_reclaimable_page() which tries to rotate a page to the tail. + +rotate_reclaimable_page() only works for a page after the page reclaim +has put it back. If an async swap device is fast enough, the page +writeback can finish with that page while the page reclaim is still +working on the rest of the batch containing it. In this case, that page +will remain at the head and the page reclaim will not retry it before +reaching there. + +This patch adds a retry to evict_pages(). After evict_pages() has +finished an entire batch and before it puts back pages it cannot free +immediately, it retries those that may have missed the rotation. + +Before this patch, ~60% of pages swapped to an Intel Optane missed +rotate_reclaimable_page(). After this patch, ~99% of missed pages were +reclaimed upon retry. + +This problem affects relatively slow async swap devices like Samsung 980 +Pro much less and does not affect sync swap devices like zram or zswap at +all. + +Link: https://lkml.kernel.org/r/20221116013808.3995280-1-yuzhao@google.com +Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation") +Signed-off-by: Yu Zhao <yuzhao@google.com> +Cc: "Yin, Fengwei" <fengwei.yin@intel.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + mm/vmscan.c | 48 +++++++++++++++++++++++++++++++++++++----------- + 1 file changed, 37 insertions(+), 11 deletions(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 1c0875e6514a..27bc525380f9 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -4723,10 +4723,13 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp + int scanned; + int reclaimed; + LIST_HEAD(list); ++ LIST_HEAD(clean); + struct page *page; ++ struct page *next; + enum vm_event_item item; + struct reclaim_stat stat; + struct lru_gen_mm_walk *walk; ++ bool skip_retry = false; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + +@@ -4743,20 +4746,37 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp + + if (list_empty(&list)) + return scanned; +- ++retry: + reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); ++ sc->nr_reclaimed += reclaimed; + +- list_for_each_entry(page, &list, lru) { +- /* restore LRU_REFS_FLAGS cleared by isolate_page() */ +- if (PageWorkingset(page)) +- SetPageReferenced(page); ++ list_for_each_entry_safe_reverse(page, next, &list, lru) { ++ if (!page_evictable(page)) { ++ list_del(&page->lru); ++ putback_lru_page(page); ++ continue; ++ } + +- /* don't add rejected pages to the oldest generation */ + if (PageReclaim(page) && +- (PageDirty(page) || PageWriteback(page))) +- ClearPageActive(page); +- else +- SetPageActive(page); ++ (PageDirty(page) || PageWriteback(page))) { ++ /* restore LRU_REFS_FLAGS cleared by isolate_page() */ ++ if (PageWorkingset(page)) ++ SetPageReferenced(page); ++ continue; ++ } ++ ++ if (skip_retry || PageActive(page) || PageReferenced(page) || ++ page_mapped(page) || PageLocked(page) || ++ PageDirty(page) || PageWriteback(page)) { ++ /* don't add rejected pages to the oldest generation */ ++ set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, ++ BIT(PG_active)); ++ continue; ++ } ++ ++ /* retry pages that may have missed rotate_reclaimable_page() */ ++ list_move(&page->lru, &clean); ++ sc->nr_scanned -= thp_nr_pages(page); + } + + spin_lock_irq(&lruvec->lru_lock); +@@ -4778,7 +4798,13 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp + mem_cgroup_uncharge_list(&list); + free_unref_page_list(&list); + +- sc->nr_reclaimed += reclaimed; ++ INIT_LIST_HEAD(&list); ++ list_splice_init(&clean, &list); ++ ++ if (!list_empty(&list)) { ++ skip_retry = true; ++ goto retry; ++ } + + if (need_swapping && type == LRU_GEN_ANON) + *need_swapping = true; +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-15-mm-multi-gen-LRU-move-lru_gen_add_mm-out-of-IRQ-off-.patch b/target/linux/generic/backport-5.15/020-v6.1-15-mm-multi-gen-LRU-move-lru_gen_add_mm-out-of-IRQ-off-.patch new file mode 100644 index 0000000000..7dc296b5b3 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-15-mm-multi-gen-LRU-move-lru_gen_add_mm-out-of-IRQ-off-.patch @@ -0,0 +1,54 @@ +From 255bb0ac393f1c2818cd75af45a9226300ab3daf Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 26 Oct 2022 15:48:30 +0200 +Subject: [PATCH 15/29] mm: multi-gen LRU: move lru_gen_add_mm() out of IRQ-off + region + +lru_gen_add_mm() has been added within an IRQ-off region in the commit +mentioned below. The other invocations of lru_gen_add_mm() are not within +an IRQ-off region. + +The invocation within IRQ-off region is problematic on PREEMPT_RT because +the function is using a spin_lock_t which must not be used within +IRQ-disabled regions. + +The other invocations of lru_gen_add_mm() occur while +task_struct::alloc_lock is acquired. Move lru_gen_add_mm() after +interrupts are enabled and before task_unlock(). + +Link: https://lkml.kernel.org/r/20221026134830.711887-1-bigeasy@linutronix.de +Fixes: bd74fdaea1460 ("mm: multi-gen LRU: support page table walks") +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Acked-by: Yu Zhao <yuzhao@google.com> +Cc: Al Viro <viro@zeniv.linux.org.uk> +Cc: "Eric W . Biederman" <ebiederm@xmission.com> +Cc: Kees Cook <keescook@chromium.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + fs/exec.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/exec.c b/fs/exec.c +index 1afa15a07d26..718c58947be1 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1013,7 +1013,6 @@ static int exec_mmap(struct mm_struct *mm) + active_mm = tsk->active_mm; + tsk->active_mm = mm; + tsk->mm = mm; +- lru_gen_add_mm(mm); + /* + * This prevents preemption while active_mm is being loaded and + * it and mm are being updated, which could cause problems for +@@ -1028,6 +1027,7 @@ static int exec_mmap(struct mm_struct *mm) + local_irq_enable(); + tsk->mm->vmacache_seqnum = 0; + vmacache_flush(tsk); ++ lru_gen_add_mm(mm); + task_unlock(tsk); + lru_gen_use_mm(mm); + if (old_mm) { +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-17-mm-add-dummy-pmd_young-for-architectures-not-having-.patch b/target/linux/generic/backport-5.15/020-v6.1-17-mm-add-dummy-pmd_young-for-architectures-not-having-.patch new file mode 100644 index 0000000000..4e11e82115 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-17-mm-add-dummy-pmd_young-for-architectures-not-having-.patch @@ -0,0 +1,111 @@ +From c5ec455ebd2b488d91de9d8915a0c8036a2a04dd Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Wed, 30 Nov 2022 14:49:41 -0800 +Subject: [PATCH 17/29] mm: add dummy pmd_young() for architectures not having + it + +In order to avoid #ifdeffery add a dummy pmd_young() implementation as a +fallback. This is required for the later patch "mm: introduce +arch_has_hw_nonleaf_pmd_young()". + +Link: https://lkml.kernel.org/r/fd3ac3cd-7349-6bbd-890a-71a9454ca0b3@suse.com +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Yu Zhao <yuzhao@google.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Geert Uytterhoeven <geert@linux-m68k.org> +Cc: "H. Peter Anvin" <hpa@zytor.com> +Cc: Ingo Molnar <mingo@redhat.com> +Cc: Sander Eikelenboom <linux@eikelenboom.it> +Cc: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + arch/mips/include/asm/pgtable.h | 1 + + arch/riscv/include/asm/pgtable.h | 1 + + arch/s390/include/asm/pgtable.h | 1 + + arch/sparc/include/asm/pgtable_64.h | 1 + + arch/x86/include/asm/pgtable.h | 1 + + include/linux/pgtable.h | 7 +++++++ + 6 files changed, 12 insertions(+) + +diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h +index 804889b70965..89ab8b4cf971 100644 +--- a/arch/mips/include/asm/pgtable.h ++++ b/arch/mips/include/asm/pgtable.h +@@ -632,6 +632,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) + return pmd; + } + ++#define pmd_young pmd_young + static inline int pmd_young(pmd_t pmd) + { + return !!(pmd_val(pmd) & _PAGE_ACCESSED); +diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h +index 39b550310ec6..4a64e03dcdd4 100644 +--- a/arch/riscv/include/asm/pgtable.h ++++ b/arch/riscv/include/asm/pgtable.h +@@ -531,6 +531,7 @@ static inline int pmd_dirty(pmd_t pmd) + return pte_dirty(pmd_pte(pmd)); + } + ++#define pmd_young pmd_young + static inline int pmd_young(pmd_t pmd) + { + return pte_young(pmd_pte(pmd)); +diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h +index b61426c9ef17..55ff4f22da2d 100644 +--- a/arch/s390/include/asm/pgtable.h ++++ b/arch/s390/include/asm/pgtable.h +@@ -748,6 +748,7 @@ static inline int pmd_dirty(pmd_t pmd) + return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0; + } + ++#define pmd_young pmd_young + static inline int pmd_young(pmd_t pmd) + { + return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0; +diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h +index 4679e45c8348..bfd10179c137 100644 +--- a/arch/sparc/include/asm/pgtable_64.h ++++ b/arch/sparc/include/asm/pgtable_64.h +@@ -712,6 +712,7 @@ static inline unsigned long pmd_dirty(pmd_t pmd) + return pte_dirty(pte); + } + ++#define pmd_young pmd_young + static inline unsigned long pmd_young(pmd_t pmd) + { + pte_t pte = __pte(pmd_val(pmd)); +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 01a1763123ff..c4b64ee357fd 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -136,6 +136,7 @@ static inline int pmd_dirty(pmd_t pmd) + return pmd_flags(pmd) & _PAGE_DIRTY; + } + ++#define pmd_young pmd_young + static inline int pmd_young(pmd_t pmd) + { + return pmd_flags(pmd) & _PAGE_ACCESSED; +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index e6889556e0bf..dec3d890e814 100644 +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -164,6 +164,13 @@ static inline pte_t *virt_to_kpte(unsigned long vaddr) + return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); + } + ++#ifndef pmd_young ++static inline int pmd_young(pmd_t pmd) ++{ ++ return 0; ++} ++#endif ++ + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS + extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.1-18-mm-introduce-arch_has_hw_nonleaf_pmd_young.patch b/target/linux/generic/backport-5.15/020-v6.1-18-mm-introduce-arch_has_hw_nonleaf_pmd_young.patch new file mode 100644 index 0000000000..4c4a2d11f2 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-18-mm-introduce-arch_has_hw_nonleaf_pmd_young.patch @@ -0,0 +1,122 @@ +From 46cbda7b65998a5af4493f745d94417af697bd68 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Wed, 23 Nov 2022 07:45:10 +0100 +Subject: [PATCH 18/29] mm: introduce arch_has_hw_nonleaf_pmd_young() + +When running as a Xen PV guests commit eed9a328aa1a ("mm: x86: add +CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation in +pmdp_test_and_clear_young(): + + BUG: unable to handle page fault for address: ffff8880083374d0 + #PF: supervisor write access in kernel mode + #PF: error_code(0x0003) - permissions violation + PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065 + Oops: 0003 [#1] PREEMPT SMP NOPTI + CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1 + RIP: e030:pmdp_test_and_clear_young+0x25/0x40 + +This happens because the Xen hypervisor can't emulate direct writes to +page table entries other than PTEs. + +This can easily be fixed by introducing arch_has_hw_nonleaf_pmd_young() +similar to arch_has_hw_pte_young() and test that instead of +CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG. + +Link: https://lkml.kernel.org/r/20221123064510.16225-1-jgross@suse.com +Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") +Signed-off-by: Juergen Gross <jgross@suse.com> +Reported-by: Sander Eikelenboom <linux@eikelenboom.it> +Acked-by: Yu Zhao <yuzhao@google.com> +Tested-by: Sander Eikelenboom <linux@eikelenboom.it> +Acked-by: David Hildenbrand <david@redhat.com> [core changes] +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + arch/x86/include/asm/pgtable.h | 8 ++++++++ + include/linux/pgtable.h | 11 +++++++++++ + mm/vmscan.c | 10 +++++----- + 3 files changed, 24 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index c4b64ee357fd..d8363c676496 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -1405,6 +1405,14 @@ static inline bool arch_has_hw_pte_young(void) + return true; + } + ++#ifdef CONFIG_XEN_PV ++#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young ++static inline bool arch_has_hw_nonleaf_pmd_young(void) ++{ ++ return !cpu_feature_enabled(X86_FEATURE_XENPV); ++} ++#endif ++ + #endif /* __ASSEMBLY__ */ + + #endif /* _ASM_X86_PGTABLE_H */ +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index dec3d890e814..562b4cc82b33 100644 +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -266,6 +266,17 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #endif + ++#ifndef arch_has_hw_nonleaf_pmd_young ++/* ++ * Return whether the accessed bit in non-leaf PMD entries is supported on the ++ * local CPU. ++ */ ++static inline bool arch_has_hw_nonleaf_pmd_young(void) ++{ ++ return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); ++} ++#endif ++ + #ifndef arch_has_hw_pte_young + /* + * Return whether the accessed bit is supported on the local CPU. +diff --git a/mm/vmscan.c b/mm/vmscan.c +index d310e0b9e520..96f1af44bb77 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -3730,7 +3730,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area + goto next; + + if (!pmd_trans_huge(pmd[i])) { +- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && ++ if (arch_has_hw_nonleaf_pmd_young() && + get_cap(LRU_GEN_NONLEAF_YOUNG)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + goto next; +@@ -3828,14 +3828,14 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + #endif + walk->mm_stats[MM_NONLEAF_TOTAL]++; + +-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG +- if (get_cap(LRU_GEN_NONLEAF_YOUNG)) { ++ if (arch_has_hw_nonleaf_pmd_young() && ++ get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (!pmd_young(val)) + continue; + + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + } +-#endif ++ + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + continue; + +@@ -5135,7 +5135,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c + if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) + caps |= BIT(LRU_GEN_MM_WALK); + +- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG)) ++ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) + caps |= BIT(LRU_GEN_NONLEAF_YOUNG); + + return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.2-16-mm-multi-gen-LRU-fix-crash-during-cgroup-migration.patch b/target/linux/generic/backport-5.15/020-v6.2-16-mm-multi-gen-LRU-fix-crash-during-cgroup-migration.patch new file mode 100644 index 0000000000..a8cd1fca1b --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.2-16-mm-multi-gen-LRU-fix-crash-during-cgroup-migration.patch @@ -0,0 +1,61 @@ +From c7dfefd4bdfba3d5171038d1cc2d4160288e6ee4 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 15 Jan 2023 20:44:05 -0700 +Subject: [PATCH 16/29] mm: multi-gen LRU: fix crash during cgroup migration + +lru_gen_migrate_mm() assumes lru_gen_add_mm() runs prior to itself. This +isn't true for the following scenario: + + CPU 1 CPU 2 + + clone() + cgroup_can_fork() + cgroup_procs_write() + cgroup_post_fork() + task_lock() + lru_gen_migrate_mm() + task_unlock() + task_lock() + lru_gen_add_mm() + task_unlock() + +And when the above happens, kernel crashes because of linked list +corruption (mm_struct->lru_gen.list). + +Link: https://lore.kernel.org/r/20230115134651.30028-1-msizanoen@qtmlabs.xyz/ +Link: https://lkml.kernel.org/r/20230116034405.2960276-1-yuzhao@google.com +Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") +Signed-off-by: Yu Zhao <yuzhao@google.com> +Reported-by: msizanoen <msizanoen@qtmlabs.xyz> +Tested-by: msizanoen <msizanoen@qtmlabs.xyz> +Cc: <stable@vger.kernel.org> [6.1+] +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + mm/vmscan.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 27bc525380f9..d310e0b9e520 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -3024,13 +3024,16 @@ void lru_gen_migrate_mm(struct mm_struct *mm) + if (mem_cgroup_disabled()) + return; + ++ /* migration can happen before addition */ ++ if (!mm->lru_gen.memcg) ++ return; ++ + rcu_read_lock(); + memcg = mem_cgroup_from_task(task); + rcu_read_unlock(); + if (memcg == mm->lru_gen.memcg) + return; + +- VM_WARN_ON_ONCE(!mm->lru_gen.memcg); + VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); + + lru_gen_del_mm(mm); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.3-19-mm-add-vma_has_recency.patch b/target/linux/generic/backport-5.15/020-v6.3-19-mm-add-vma_has_recency.patch new file mode 100644 index 0000000000..542bb0c3a8 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.3-19-mm-add-vma_has_recency.patch @@ -0,0 +1,207 @@ +From 6c7f552a48b49a8612786a28a2239fbc24fac289 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Fri, 30 Dec 2022 14:52:51 -0700 +Subject: [PATCH 19/29] mm: add vma_has_recency() + +Add vma_has_recency() to indicate whether a VMA may exhibit temporal +locality that the LRU algorithm relies on. + +This function returns false for VMAs marked by VM_SEQ_READ or +VM_RAND_READ. While the former flag indicates linear access, i.e., a +special case of spatial locality, both flags indicate a lack of temporal +locality, i.e., the reuse of an area within a relatively small duration. + +"Recency" is chosen over "locality" to avoid confusion between temporal +and spatial localities. + +Before this patch, the active/inactive LRU only ignored the accessed bit +from VMAs marked by VM_SEQ_READ. After this patch, the active/inactive +LRU and MGLRU share the same logic: they both ignore the accessed bit if +vma_has_recency() returns false. + +For the active/inactive LRU, the following fio test showed a [6, 8]% +increase in IOPS when randomly accessing mapped files under memory +pressure. + + kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo) + kb=$((kb - 8*1024*1024)) + + modprobe brd rd_nr=1 rd_size=$kb + dd if=/dev/zero of=/dev/ram0 bs=1M + + mkfs.ext4 /dev/ram0 + mount /dev/ram0 /mnt/ + swapoff -a + + fio --name=test --directory=/mnt/ --ioengine=mmap --numjobs=8 \ + --size=8G --rw=randrw --time_based --runtime=10m \ + --group_reporting + +The discussion that led to this patch is here [1]. Additional test +results are available in that thread. + +[1] https://lore.kernel.org/r/Y31s%2FK8T85jh05wH@google.com/ + +Link: https://lkml.kernel.org/r/20221230215252.2628425-1-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Cc: Alexander Viro <viro@zeniv.linux.org.uk> +Cc: Andrea Righi <andrea.righi@canonical.com> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/mm_inline.h | 9 +++++++++ + mm/memory.c | 8 ++++---- + mm/rmap.c | 42 +++++++++++++++++---------------------- + mm/vmscan.c | 5 ++++- + 4 files changed, 35 insertions(+), 29 deletions(-) + +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index e095c1c24311..e8c723053a52 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -333,4 +333,13 @@ static __always_inline void del_page_from_lru_list(struct page *page, + update_lru_size(lruvec, page_lru(page), page_zonenum(page), + -thp_nr_pages(page)); + } ++ ++static inline bool vma_has_recency(struct vm_area_struct *vma) ++{ ++ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) ++ return false; ++ ++ return true; ++} ++ + #endif +diff --git a/mm/memory.c b/mm/memory.c +index 7d5be951de9e..1306b1ff0c10 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -41,6 +41,7 @@ + + #include <linux/kernel_stat.h> + #include <linux/mm.h> ++#include <linux/mm_inline.h> + #include <linux/sched/mm.h> + #include <linux/sched/coredump.h> + #include <linux/sched/numa_balancing.h> +@@ -1353,8 +1354,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, + force_flush = 1; + set_page_dirty(page); + } +- if (pte_young(ptent) && +- likely(!(vma->vm_flags & VM_SEQ_READ))) ++ if (pte_young(ptent) && likely(vma_has_recency(vma))) + mark_page_accessed(page); + } + rss[mm_counter(page)]--; +@@ -4781,8 +4781,8 @@ static inline void mm_account_fault(struct pt_regs *regs, + #ifdef CONFIG_LRU_GEN + static void lru_gen_enter_fault(struct vm_area_struct *vma) + { +- /* the LRU algorithm doesn't apply to sequential or random reads */ +- current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); ++ /* the LRU algorithm only applies to accesses with recency */ ++ current->in_lru_fault = vma_has_recency(vma); + } + + static void lru_gen_exit_fault(void) +diff --git a/mm/rmap.c b/mm/rmap.c +index 22a86122732e..53df47753f3c 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -794,25 +794,14 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, + } + + if (pvmw.pte) { +- if (lru_gen_enabled() && pte_young(*pvmw.pte) && +- !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { ++ if (lru_gen_enabled() && pte_young(*pvmw.pte)) { + lru_gen_look_around(&pvmw); + referenced++; + } + + if (ptep_clear_flush_young_notify(vma, address, +- pvmw.pte)) { +- /* +- * Don't treat a reference through +- * a sequentially read mapping as such. +- * If the page has been used in another mapping, +- * we will catch it; if this other mapping is +- * already gone, the unmap path will have set +- * PG_referenced or activated the page. +- */ +- if (likely(!(vma->vm_flags & VM_SEQ_READ))) +- referenced++; +- } ++ pvmw.pte)) ++ referenced++; + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pmdp_clear_flush_young_notify(vma, address, + pvmw.pmd)) +@@ -846,7 +835,20 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) + struct page_referenced_arg *pra = arg; + struct mem_cgroup *memcg = pra->memcg; + +- if (!mm_match_cgroup(vma->vm_mm, memcg)) ++ /* ++ * Ignore references from this mapping if it has no recency. If the ++ * page has been used in another mapping, we will catch it; if this ++ * other mapping is already gone, the unmap path will have set the ++ * referenced flag or activated the page in zap_pte_range(). ++ */ ++ if (!vma_has_recency(vma)) ++ return true; ++ ++ /* ++ * If we are reclaiming on behalf of a cgroup, skip counting on behalf ++ * of references from different cgroups. ++ */ ++ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) + return true; + + return false; +@@ -876,6 +878,7 @@ int page_referenced(struct page *page, + .rmap_one = page_referenced_one, + .arg = (void *)&pra, + .anon_lock = page_lock_anon_vma_read, ++ .invalid_vma = invalid_page_referenced_vma, + }; + + *vm_flags = 0; +@@ -891,15 +894,6 @@ int page_referenced(struct page *page, + return 1; + } + +- /* +- * If we are reclaiming on behalf of a cgroup, skip +- * counting on behalf of references from different +- * cgroups +- */ +- if (memcg) { +- rwc.invalid_vma = invalid_page_referenced_vma; +- } +- + rmap_walk(page, &rwc); + *vm_flags = pra.vm_flags; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 96f1af44bb77..4ab376abeaae 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -3486,7 +3486,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal + if (is_vm_hugetlb_page(vma)) + return true; + +- if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) ++ if (!vma_has_recency(vma)) ++ return true; ++ ++ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) + return true; + + if (vma == get_gate_vma(vma->vm_mm)) +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.3-20-mm-support-POSIX_FADV_NOREUSE.patch b/target/linux/generic/backport-5.15/020-v6.3-20-mm-support-POSIX_FADV_NOREUSE.patch new file mode 100644 index 0000000000..75f74114c6 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.3-20-mm-support-POSIX_FADV_NOREUSE.patch @@ -0,0 +1,134 @@ +From 686c3d4f71de9e0e7a27f03a5617a712385f90cd Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Fri, 30 Dec 2022 14:52:52 -0700 +Subject: [PATCH 20/29] mm: support POSIX_FADV_NOREUSE + +This patch adds POSIX_FADV_NOREUSE to vma_has_recency() so that the LRU +algorithm can ignore access to mapped files marked by this flag. + +The advantages of POSIX_FADV_NOREUSE are: +1. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not alter the + default readahead behavior. +2. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not split VMAs and + therefore does not take mmap_lock. +3. Unlike MADV_COLD, setting it has a negligible cost, regardless of + how many pages it affects. + +Its limitations are: +1. Like POSIX_FADV_RANDOM and POSIX_FADV_SEQUENTIAL, it currently does + not support range. IOW, its scope is the entire file. +2. It currently does not ignore access through file descriptors. + Specifically, for the active/inactive LRU, given a file page shared + by two users and one of them having set POSIX_FADV_NOREUSE on the + file, this page will be activated upon the second user accessing + it. This corner case can be covered by checking POSIX_FADV_NOREUSE + before calling mark_page_accessed() on the read path. But it is + considered not worth the effort. + +There have been a few attempts to support POSIX_FADV_NOREUSE, e.g., [1]. +This time the goal is to fill a niche: a few desktop applications, e.g., +large file transferring and video encoding/decoding, want fast file +streaming with mmap() rather than direct IO. Among those applications, an +SVT-AV1 regression was reported when running with MGLRU [2]. The +following test can reproduce that regression. + + kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo) + kb=$((kb - 8*1024*1024)) + + modprobe brd rd_nr=1 rd_size=$kb + dd if=/dev/zero of=/dev/ram0 bs=1M + + mkfs.ext4 /dev/ram0 + mount /dev/ram0 /mnt/ + swapoff -a + + fallocate -l 8G /mnt/swapfile + mkswap /mnt/swapfile + swapon /mnt/swapfile + + wget http://ultravideo.cs.tut.fi/video/Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z + 7z e -o/mnt/ Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z + SvtAv1EncApp --preset 12 -w 3840 -h 2160 \ + -i /mnt/Bosphorus_3840x2160.y4m + +For MGLRU, the following change showed a [9-11]% increase in FPS, +which makes it on par with the active/inactive LRU. + + patch Source/App/EncApp/EbAppMain.c <<EOF + 31a32 + > #include <fcntl.h> + 35d35 + < #include <fcntl.h> /* _O_BINARY */ + 117a118 + > posix_fadvise(config->mmap.fd, 0, 0, POSIX_FADV_NOREUSE); + EOF + +[1] https://lore.kernel.org/r/1308923350-7932-1-git-send-email-andrea@betterlinux.com/ +[2] https://openbenchmarking.org/result/2209259-PTS-MGLRU8GB57 + +Link: https://lkml.kernel.org/r/20221230215252.2628425-2-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Cc: Alexander Viro <viro@zeniv.linux.org.uk> +Cc: Andrea Righi <andrea.righi@canonical.com> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/fs.h | 2 ++ + include/linux/mm_inline.h | 3 +++ + mm/fadvise.c | 5 ++++- + 3 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 23ecfecdc450..601e52991f4a 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -167,6 +167,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, + /* File is stream-like */ + #define FMODE_STREAM ((__force fmode_t)0x200000) + ++#define FMODE_NOREUSE ((__force fmode_t)0x400000) ++ + /* File was opened by fanotify and shouldn't generate fanotify events */ + #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) + +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index e8c723053a52..8a6a2a23f9b6 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -339,6 +339,9 @@ static inline bool vma_has_recency(struct vm_area_struct *vma) + if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) + return false; + ++ if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) ++ return false; ++ + return true; + } + +diff --git a/mm/fadvise.c b/mm/fadvise.c +index d6baa4f451c5..e8023c69f219 100644 +--- a/mm/fadvise.c ++++ b/mm/fadvise.c +@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) + case POSIX_FADV_NORMAL: + file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); +- file->f_mode &= ~FMODE_RANDOM; ++ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE); + spin_unlock(&file->f_lock); + break; + case POSIX_FADV_RANDOM: +@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) + force_page_cache_readahead(mapping, file, start_index, nrpages); + break; + case POSIX_FADV_NOREUSE: ++ spin_lock(&file->f_lock); ++ file->f_mode |= FMODE_NOREUSE; ++ spin_unlock(&file->f_lock); + break; + case POSIX_FADV_DONTNEED: + if (!inode_write_congested(mapping->host)) +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.3-21-mm-multi-gen-LRU-rename-lru_gen_struct-to-lru_gen_pa.patch b/target/linux/generic/backport-5.15/020-v6.3-21-mm-multi-gen-LRU-rename-lru_gen_struct-to-lru_gen_pa.patch new file mode 100644 index 0000000000..836a16b8c7 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.3-21-mm-multi-gen-LRU-rename-lru_gen_struct-to-lru_gen_pa.patch @@ -0,0 +1,359 @@ +From 348fdbada9fb3f0bf1a53651be46319105af187f Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Wed, 21 Dec 2022 21:18:59 -0700 +Subject: [PATCH 21/29] mm: multi-gen LRU: rename lru_gen_struct to + lru_gen_page + +Patch series "mm: multi-gen LRU: memcg LRU", v3. + +Overview +======== + +An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, +since each node and memcg combination has an LRU of pages (see +mem_cgroup_lruvec()). + +Its goal is to improve the scalability of global reclaim, which is +critical to system-wide memory overcommit in data centers. Note that +memcg reclaim is currently out of scope. + +Its memory bloat is a pointer to each lruvec and negligible to each +pglist_data. In terms of traversing memcgs during global reclaim, it +improves the best-case complexity from O(n) to O(1) and does not affect +the worst-case complexity O(n). Therefore, on average, it has a sublinear +complexity in contrast to the current linear complexity. + +The basic structure of an memcg LRU can be understood by an analogy to +the active/inactive LRU (of pages): +1. It has the young and the old (generations), i.e., the counterparts + to the active and the inactive; +2. The increment of max_seq triggers promotion, i.e., the counterpart + to activation; +3. Other events trigger similar operations, e.g., offlining an memcg + triggers demotion, i.e., the counterpart to deactivation. + +In terms of global reclaim, it has two distinct features: +1. Sharding, which allows each thread to start at a random memcg (in + the old generation) and improves parallelism; +2. Eventual fairness, which allows direct reclaim to bail out at will + and reduces latency without affecting fairness over some time. + +The commit message in patch 6 details the workflow: +https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/ + +The following is a simple test to quickly verify its effectiveness. + + Test design: + 1. Create multiple memcgs. + 2. Each memcg contains a job (fio). + 3. All jobs access the same amount of memory randomly. + 4. The system does not experience global memory pressure. + 5. Periodically write to the root memory.reclaim. + + Desired outcome: + 1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal) + over mean(pgsteal) is close to 0%. + 2. The total pgsteal is close to the total requested through + memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close + to 100%. + + Actual outcome [1]: + MGLRU off MGLRU on + stddev(pgsteal) / mean(pgsteal) 75% 20% + sum(pgsteal) / sum(requested) 425% 95% + + #################################################################### + MEMCGS=128 + + for ((memcg = 0; memcg < $MEMCGS; memcg++)); do + mkdir /sys/fs/cgroup/memcg$memcg + done + + start() { + echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs + + fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \ + --filename=/dev/zero --size=1920M --rw=randrw \ + --rate=64m,64m --random_distribution=random \ + --fadvise_hint=0 --time_based --runtime=10h \ + --group_reporting --minimal + } + + for ((memcg = 0; memcg < $MEMCGS; memcg++)); do + start & + done + + sleep 600 + + for ((i = 0; i < 600; i++)); do + echo 256m >/sys/fs/cgroup/memory.reclaim + sleep 6 + done + + for ((memcg = 0; memcg < $MEMCGS; memcg++)); do + grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat + done + #################################################################### + +[1]: This was obtained from running the above script (touches less + than 256GB memory) on an EPYC 7B13 with 512GB DRAM for over an + hour. + +This patch (of 8): + +The new name lru_gen_page will be more distinct from the coming +lru_gen_memcg. + +Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com +Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Roman Gushchin <roman.gushchin@linux.dev> +Cc: Suren Baghdasaryan <surenb@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/mm_inline.h | 4 ++-- + include/linux/mmzone.h | 6 +++--- + mm/vmscan.c | 34 +++++++++++++++++----------------- + mm/workingset.c | 4 ++-- + 4 files changed, 24 insertions(+), 24 deletions(-) + +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index 8a6a2a23f9b6..27c4890503c5 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -168,7 +168,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page, + int zone = page_zonenum(page); + int delta = thp_nr_pages(page); + enum lru_list lru = type * LRU_INACTIVE_FILE; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); +@@ -214,7 +214,7 @@ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bo + int gen = page_lru_gen(page); + int type = page_is_file_lru(page); + int zone = page_zonenum(page); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_PAGE(gen != -1, page); + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 6b85ba1f4e18..5856b026c089 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -394,7 +394,7 @@ enum { + * The number of pages in each generation is eventually consistent and therefore + * can be transiently negative when reset_batch_size() is pending. + */ +-struct lru_gen_struct { ++struct lru_gen_page { + /* the aging increments the youngest generation number */ + unsigned long max_seq; + /* the eviction increments the oldest generation numbers */ +@@ -451,7 +451,7 @@ struct lru_gen_mm_state { + struct lru_gen_mm_walk { + /* the lruvec under reclaim */ + struct lruvec *lruvec; +- /* unstable max_seq from lru_gen_struct */ ++ /* unstable max_seq from lru_gen_page */ + unsigned long max_seq; + /* the next address within an mm to scan */ + unsigned long next_addr; +@@ -514,7 +514,7 @@ struct lruvec { + unsigned long flags; + #ifdef CONFIG_LRU_GEN + /* evictable pages divided into generations */ +- struct lru_gen_struct lrugen; ++ struct lru_gen_page lrugen; + /* to concurrently iterate lru_gen_mm_list */ + struct lru_gen_mm_state mm_state; + #endif +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 4ab376abeaae..3b1b5bd9736a 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2910,7 +2910,7 @@ static int get_nr_gens(struct lruvec *lruvec, int type) + + static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) + { +- /* see the comment on lru_gen_struct */ ++ /* see the comment on lru_gen_page */ + return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && + get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; +@@ -3316,7 +3316,7 @@ struct ctrl_pos { + static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + struct ctrl_pos *pos) + { +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->refaulted = lrugen->avg_refaulted[type][tier] + +@@ -3331,7 +3331,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) + { + int hist, tier; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; + unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; + +@@ -3408,7 +3408,7 @@ static int page_update_gen(struct page *page, int gen) + static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming) + { + int type = page_is_file_lru(page); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + unsigned long new_flags, old_flags = READ_ONCE(page->flags); + +@@ -3453,7 +3453,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page, + static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) + { + int gen, type, zone; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + + walk->batched = 0; + +@@ -3979,7 +3979,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) + { + int zone; + int remaining = MAX_LRU_BATCH; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + + if (type == LRU_GEN_ANON && !can_swap) +@@ -4015,7 +4015,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + { + int gen, type, zone; + bool success = false; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); +@@ -4036,7 +4036,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + ; + } + +- /* see the comment on lru_gen_struct */ ++ /* see the comment on lru_gen_page */ + if (can_swap) { + min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); + min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); +@@ -4058,7 +4058,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) + { + int prev, next; + int type, zone; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + + spin_lock_irq(&lruvec->lru_lock); + +@@ -4116,7 +4116,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + bool success; + struct lru_gen_mm_walk *walk; + struct mm_struct *mm = NULL; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + +@@ -4181,7 +4181,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig + unsigned long old = 0; + unsigned long young = 0; + unsigned long total = 0; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + for (type = !can_swap; type < ANON_AND_FILE; type++) { +@@ -4466,7 +4466,7 @@ static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx) + int delta = thp_nr_pages(page); + int refs = page_lru_refs(page); + int tier = lru_tier_from_refs(refs); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page); + +@@ -4566,7 +4566,7 @@ static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, + int scanned = 0; + int isolated = 0; + int remaining = MAX_LRU_BATCH; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + VM_WARN_ON_ONCE(!list_empty(list)); +@@ -4967,7 +4967,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + + static bool __maybe_unused state_is_valid(struct lruvec *lruvec) + { +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + + if (lrugen->enabled) { + enum lru_list lru; +@@ -5247,7 +5247,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, + int i; + int type, tier; + int hist = lru_hist_from_seq(seq); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + seq_printf(m, " %10d", tier); +@@ -5296,7 +5296,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) + unsigned long seq; + bool full = !debugfs_real_fops(m->file)->write; + struct lruvec *lruvec = v; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + int nid = lruvec_pgdat(lruvec)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); +@@ -5549,7 +5549,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + { + int i; + int gen, type, zone; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled = lru_gen_enabled(); +diff --git a/mm/workingset.c b/mm/workingset.c +index aeba62cebf8c..a5e1798c6d60 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct page *page) + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; +- struct lru_gen_struct *lrugen; ++ struct lru_gen_page *lrugen; + int type = page_is_file_lru(page); + int delta = thp_nr_pages(page); + int refs = page_lru_refs(page); +@@ -252,7 +252,7 @@ static void lru_gen_refault(struct page *page, void *shadow) + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; +- struct lru_gen_struct *lrugen; ++ struct lru_gen_page *lrugen; + struct mem_cgroup *memcg; + struct pglist_data *pgdat; + int type = page_is_file_lru(page); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.3-22-mm-multi-gen-LRU-rename-lrugen-lists-to-lrugen-pages.patch b/target/linux/generic/backport-5.15/020-v6.3-22-mm-multi-gen-LRU-rename-lrugen-lists-to-lrugen-pages.patch new file mode 100644 index 0000000000..2e1783661d --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.3-22-mm-multi-gen-LRU-rename-lrugen-lists-to-lrugen-pages.patch @@ -0,0 +1,171 @@ +From afd37e73db04c7e6b47411120ac5f6a7eca51fec Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Wed, 21 Dec 2022 21:19:00 -0700 +Subject: [PATCH 22/29] mm: multi-gen LRU: rename lrugen->lists[] to + lrugen->pages[] + +lru_gen_page will be chained into per-node lists by the coming +lrugen->list. + +Link: https://lkml.kernel.org/r/20221222041905.2431096-3-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Roman Gushchin <roman.gushchin@linux.dev> +Cc: Suren Baghdasaryan <surenb@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/mm_inline.h | 4 ++-- + include/linux/mmzone.h | 8 ++++---- + mm/vmscan.c | 20 ++++++++++---------- + 3 files changed, 16 insertions(+), 16 deletions(-) + +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index 27c4890503c5..4adc9ba59569 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -246,9 +246,9 @@ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bo + lru_gen_update_size(lruvec, page, -1, gen); + /* for rotate_reclaimable_page() */ + if (reclaiming) +- list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); ++ list_add_tail(&page->lru, &lrugen->pages[gen][type][zone]); + else +- list_add(&page->lru, &lrugen->lists[gen][type][zone]); ++ list_add(&page->lru, &lrugen->pages[gen][type][zone]); + + return true; + } +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 5856b026c089..7b8a26aaf381 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -302,7 +302,7 @@ enum lruvec_flags { + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An + * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the + * corresponding generation. The gen counter in page->flags stores gen+1 while +- * a page is on one of lrugen->lists[]. Otherwise it stores 0. ++ * a page is on one of lrugen->pages[]. Otherwise it stores 0. + * + * A page is added to the youngest generation on faulting. The aging needs to + * check the accessed bit at least twice before handing this page over to the +@@ -314,8 +314,8 @@ enum lruvec_flags { + * rest of generations, if they exist, are considered inactive. See + * lru_gen_is_active(). + * +- * PG_active is always cleared while a page is on one of lrugen->lists[] so that +- * the aging needs not to worry about it. And it's set again when a page ++ * PG_active is always cleared while a page is on one of lrugen->pages[] so ++ * that the aging needs not to worry about it. And it's set again when a page + * considered active is isolated for non-reclaiming purposes, e.g., migration. + * See lru_gen_add_page() and lru_gen_del_page(). + * +@@ -402,7 +402,7 @@ struct lru_gen_page { + /* the birth time of each generation in jiffies */ + unsigned long timestamps[MAX_NR_GENS]; + /* the multi-gen LRU lists, lazily sorted on eviction */ +- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ struct list_head pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the multi-gen LRU sizes, eventually consistent */ + long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the exponential moving average of refaulted */ +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 3b1b5bd9736a..2322c913aa64 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -3987,7 +3987,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) + + /* prevent cold/hot inversion if force_scan is true */ + for (zone = 0; zone < MAX_NR_ZONES; zone++) { +- struct list_head *head = &lrugen->lists[old_gen][type][zone]; ++ struct list_head *head = &lrugen->pages[old_gen][type][zone]; + + while (!list_empty(head)) { + struct page *page = lru_to_page(head); +@@ -3998,7 +3998,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) + VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); + + new_gen = page_inc_gen(lruvec, page, false); +- list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); ++ list_move_tail(&page->lru, &lrugen->pages[new_gen][type][zone]); + + if (!--remaining) + return false; +@@ -4026,7 +4026,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + gen = lru_gen_from_seq(min_seq[type]); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { +- if (!list_empty(&lrugen->lists[gen][type][zone])) ++ if (!list_empty(&lrugen->pages[gen][type][zone])) + goto next; + } + +@@ -4491,7 +4491,7 @@ static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx) + + /* promoted */ + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { +- list_move(&page->lru, &lrugen->lists[gen][type][zone]); ++ list_move(&page->lru, &lrugen->pages[gen][type][zone]); + return true; + } + +@@ -4500,7 +4500,7 @@ static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx) + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + gen = page_inc_gen(lruvec, page, false); +- list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]); ++ list_move_tail(&page->lru, &lrugen->pages[gen][type][zone]); + + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], + lrugen->protected[hist][type][tier - 1] + delta); +@@ -4512,7 +4512,7 @@ static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx) + if (PageLocked(page) || PageWriteback(page) || + (type == LRU_GEN_FILE && PageDirty(page))) { + gen = page_inc_gen(lruvec, page, true); +- list_move(&page->lru, &lrugen->lists[gen][type][zone]); ++ list_move(&page->lru, &lrugen->pages[gen][type][zone]); + return true; + } + +@@ -4579,7 +4579,7 @@ static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, + for (zone = sc->reclaim_idx; zone >= 0; zone--) { + LIST_HEAD(moved); + int skipped = 0; +- struct list_head *head = &lrugen->lists[gen][type][zone]; ++ struct list_head *head = &lrugen->pages[gen][type][zone]; + + while (!list_empty(head)) { + struct page *page = lru_to_page(head); +@@ -4980,7 +4980,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) + int gen, type, zone; + + for_each_gen_type_zone(gen, type, zone) { +- if (!list_empty(&lrugen->lists[gen][type][zone])) ++ if (!list_empty(&lrugen->pages[gen][type][zone])) + return false; + } + } +@@ -5025,7 +5025,7 @@ static bool drain_evictable(struct lruvec *lruvec) + int remaining = MAX_LRU_BATCH; + + for_each_gen_type_zone(gen, type, zone) { +- struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; ++ struct list_head *head = &lruvec->lrugen.pages[gen][type][zone]; + + while (!list_empty(head)) { + bool success; +@@ -5558,7 +5558,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + lrugen->timestamps[i] = jiffies; + + for_each_gen_type_zone(gen, type, zone) +- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++ INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]); + + lruvec->mm_state.seq = MIN_NR_GENS; + init_waitqueue_head(&lruvec->mm_state.wait); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.3-23-mm-multi-gen-LRU-remove-eviction-fairness-safeguard.patch b/target/linux/generic/backport-5.15/020-v6.3-23-mm-multi-gen-LRU-remove-eviction-fairness-safeguard.patch new file mode 100644 index 0000000000..1490f9d4bc --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.3-23-mm-multi-gen-LRU-remove-eviction-fairness-safeguard.patch @@ -0,0 +1,193 @@ +From ce45f1c4b32cf69b166f56ef5bc6c761e06ed4e5 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Wed, 21 Dec 2022 21:19:01 -0700 +Subject: [PATCH 23/29] mm: multi-gen LRU: remove eviction fairness safeguard + +Recall that the eviction consumes the oldest generation: first it +bucket-sorts pages whose gen counters were updated by the aging and +reclaims the rest; then it increments lrugen->min_seq. + +The current eviction fairness safeguard for global reclaim has a +dilemma: when there are multiple eligible memcgs, should it continue +or stop upon meeting the reclaim goal? If it continues, it overshoots +and increases direct reclaim latency; if it stops, it loses fairness +between memcgs it has taken memory away from and those it has yet to. + +With memcg LRU, the eviction, while ensuring eventual fairness, will +stop upon meeting its goal. Therefore the current eviction fairness +safeguard for global reclaim will not be needed. + +Note that memcg LRU only applies to global reclaim. For memcg reclaim, +the eviction will continue, even if it is overshooting. This becomes +unconditional due to code simplification. + +Link: https://lkml.kernel.org/r/20221222041905.2431096-4-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Roman Gushchin <roman.gushchin@linux.dev> +Cc: Suren Baghdasaryan <surenb@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + mm/vmscan.c | 82 +++++++++++++++-------------------------------------- + 1 file changed, 23 insertions(+), 59 deletions(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 2322c913aa64..40e7a947c5c7 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -443,6 +443,11 @@ static bool cgroup_reclaim(struct scan_control *sc) + return sc->target_mem_cgroup; + } + ++static bool global_reclaim(struct scan_control *sc) ++{ ++ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); ++} ++ + /** + * writeback_throttling_sane - is the usual dirty throttling mechanism available? + * @sc: scan_control in question +@@ -493,6 +498,11 @@ static bool cgroup_reclaim(struct scan_control *sc) + return false; + } + ++static bool global_reclaim(struct scan_control *sc) ++{ ++ return true; ++} ++ + static bool writeback_throttling_sane(struct scan_control *sc) + { + return true; +@@ -4722,8 +4732,7 @@ static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swa + return scanned; + } + +-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, +- bool *need_swapping) ++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) + { + int type; + int scanned; +@@ -4812,9 +4821,6 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp + goto retry; + } + +- if (need_swapping && type == LRU_GEN_ANON) +- *need_swapping = true; +- + return scanned; + } + +@@ -4853,68 +4859,26 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; + } + +-static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, +- struct scan_control *sc, bool need_swapping) ++static unsigned long get_nr_to_reclaim(struct scan_control *sc) + { +- int i; +- DEFINE_MAX_SEQ(lruvec); +- +- if (!current_is_kswapd()) { +- /* age each memcg once to ensure fairness */ +- if (max_seq - seq > 1) +- return true; +- +- /* over-swapping can increase allocation latency */ +- if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) +- return true; +- +- /* give this thread a chance to exit and free its memory */ +- if (fatal_signal_pending(current)) { +- sc->nr_reclaimed += MIN_LRU_BATCH; +- return true; +- } +- +- if (cgroup_reclaim(sc)) +- return false; +- } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) +- return false; +- +- /* keep scanning at low priorities to ensure fairness */ +- if (sc->priority > DEF_PRIORITY - 2) +- return false; +- +- /* +- * A minimum amount of work was done under global memory pressure. For +- * kswapd, it may be overshooting. For direct reclaim, the target isn't +- * met, and yet the allocation may still succeed, since kswapd may have +- * caught up. In either case, it's better to stop now, and restart if +- * necessary. +- */ +- for (i = 0; i <= sc->reclaim_idx; i++) { +- unsigned long wmark; +- struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; +- +- if (!managed_zone(zone)) +- continue; +- +- wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone); +- if (wmark > zone_page_state(zone, NR_FREE_PAGES)) +- return false; +- } ++ /* don't abort memcg reclaim to ensure fairness */ ++ if (!global_reclaim(sc)) ++ return -1; + +- sc->nr_reclaimed += MIN_LRU_BATCH; ++ /* discount the previous progress for kswapd */ ++ if (current_is_kswapd()) ++ return sc->nr_to_reclaim + sc->last_reclaimed; + +- return true; ++ return max(sc->nr_to_reclaim, compact_gap(sc->order)); + } + + static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + struct blk_plug plug; + bool need_aging = false; +- bool need_swapping = false; + unsigned long scanned = 0; + unsigned long reclaimed = sc->nr_reclaimed; +- DEFINE_MAX_SEQ(lruvec); ++ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + + lru_add_drain(); + +@@ -4938,7 +4902,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + if (!nr_to_scan) + goto done; + +- delta = evict_pages(lruvec, sc, swappiness, &need_swapping); ++ delta = evict_pages(lruvec, sc, swappiness); + if (!delta) + goto done; + +@@ -4946,7 +4910,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + if (scanned >= nr_to_scan) + break; + +- if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) ++ if (sc->nr_reclaimed >= nr_to_reclaim) + break; + + cond_resched(); +@@ -5393,7 +5357,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co + if (sc->nr_reclaimed >= nr_to_reclaim) + return 0; + +- if (!evict_pages(lruvec, sc, swappiness, NULL)) ++ if (!evict_pages(lruvec, sc, swappiness)) + return 0; + + cond_resched(); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.3-24-mm-multi-gen-LRU-remove-aging-fairness-safeguard.patch b/target/linux/generic/backport-5.15/020-v6.3-24-mm-multi-gen-LRU-remove-aging-fairness-safeguard.patch new file mode 100644 index 0000000000..82ba77dec2 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.3-24-mm-multi-gen-LRU-remove-aging-fairness-safeguard.patch @@ -0,0 +1,292 @@ +From e20b7386fccc18c791796eb1dc1a91eee3ccf801 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Wed, 21 Dec 2022 21:19:02 -0700 +Subject: [PATCH 24/29] mm: multi-gen LRU: remove aging fairness safeguard + +Recall that the aging produces the youngest generation: first it scans +for accessed pages and updates their gen counters; then it increments +lrugen->max_seq. + +The current aging fairness safeguard for kswapd uses two passes to +ensure the fairness to multiple eligible memcgs. On the first pass, +which is shared with the eviction, it checks whether all eligible +memcgs are low on cold pages. If so, it requires a second pass, on +which it ages all those memcgs at the same time. + +With memcg LRU, the aging, while ensuring eventual fairness, will run +when necessary. Therefore the current aging fairness safeguard for +kswapd will not be needed. + +Note that memcg LRU only applies to global reclaim. For memcg reclaim, +the aging can be unfair to different memcgs, i.e., their +lrugen->max_seq can be incremented at different paces. + +Link: https://lkml.kernel.org/r/20221222041905.2431096-5-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Roman Gushchin <roman.gushchin@linux.dev> +Cc: Suren Baghdasaryan <surenb@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + mm/vmscan.c | 126 ++++++++++++++++++++++++---------------------------- + 1 file changed, 59 insertions(+), 67 deletions(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 40e7a947c5c7..7159436872ba 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -131,7 +131,6 @@ struct scan_control { + + #ifdef CONFIG_LRU_GEN + /* help kswapd make better choices among multiple memcgs */ +- unsigned int memcgs_need_aging:1; + unsigned long last_reclaimed; + #endif + +@@ -4184,7 +4183,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + return true; + } + +-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, ++static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) + { + int gen, type, zone; +@@ -4193,6 +4192,13 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig + unsigned long total = 0; + struct lru_gen_page *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ /* whether this lruvec is completely out of cold pages */ ++ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { ++ *nr_to_scan = 0; ++ return true; ++ } + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; +@@ -4221,8 +4227,6 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the + * ideal number of generations is MIN_NR_GENS+1. + */ +- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) +- return true; + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) + return false; + +@@ -4241,40 +4245,54 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig + return false; + } + +-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) ++static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) + { +- bool need_aging; +- unsigned long nr_to_scan; +- int swappiness = get_swappiness(lruvec, sc); ++ int gen, type, zone; ++ unsigned long total = 0; ++ bool can_swap = get_swappiness(lruvec, sc); ++ struct lru_gen_page *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + +- VM_WARN_ON_ONCE(sc->memcg_low_reclaim); ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ unsigned long seq; + +- mem_cgroup_calculate_protection(NULL, memcg); ++ for (seq = min_seq[type]; seq <= max_seq; seq++) { ++ gen = lru_gen_from_seq(seq); + +- if (mem_cgroup_below_min(memcg)) +- return false; ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); ++ } ++ } + +- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); ++ /* whether the size is big enough to be helpful */ ++ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total; ++} + +- if (min_ttl) { +- int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); +- unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); ++static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, ++ unsigned long min_ttl) ++{ ++ int gen; ++ unsigned long birth; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MIN_SEQ(lruvec); + +- if (time_is_after_jiffies(birth + min_ttl)) +- return false; ++ VM_WARN_ON_ONCE(sc->memcg_low_reclaim); + +- /* the size is likely too small to be helpful */ +- if (!nr_to_scan && sc->priority != DEF_PRIORITY) +- return false; +- } ++ /* see the comment on lru_gen_page */ ++ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); ++ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + +- if (need_aging) +- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); ++ if (time_is_after_jiffies(birth + min_ttl)) ++ return false; + +- return true; ++ if (!lruvec_is_sizable(lruvec, sc)) ++ return false; ++ ++ mem_cgroup_calculate_protection(NULL, memcg); ++ ++ return !mem_cgroup_below_min(memcg); + } + + /* to protect the working set of the last N jiffies */ +@@ -4283,46 +4301,32 @@ static unsigned long lru_gen_min_ttl __read_mostly; + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { + struct mem_cgroup *memcg; +- bool success = false; + unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + + VM_WARN_ON_ONCE(!current_is_kswapd()); + + sc->last_reclaimed = sc->nr_reclaimed; + +- /* +- * To reduce the chance of going into the aging path, which can be +- * costly, optimistically skip it if the flag below was cleared in the +- * eviction path. This improves the overall performance when multiple +- * memcgs are available. +- */ +- if (!sc->memcgs_need_aging) { +- sc->memcgs_need_aging = true; ++ /* check the order to exclude compaction-induced reclaim */ ++ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) + return; +- } +- +- set_mm_walk(pgdat); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + +- if (age_lruvec(lruvec, sc, min_ttl)) +- success = true; ++ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { ++ mem_cgroup_iter_break(NULL, memcg); ++ return; ++ } + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + +- clear_mm_walk(); +- +- /* check the order to exclude compaction-induced reclaim */ +- if (success || !min_ttl || sc->order) +- return; +- + /* + * The main goal is to OOM kill if every generation from all memcgs is + * younger than min_ttl. However, another possibility is all memcgs are +- * either below min or empty. ++ * either too small or below min. + */ + if (mutex_trylock(&oom_lock)) { + struct oom_control oc = { +@@ -4830,33 +4834,27 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp + * reclaim. + */ + static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, +- bool can_swap, bool *need_aging) ++ bool can_swap) + { + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); +- DEFINE_MIN_SEQ(lruvec); + + if (mem_cgroup_below_min(memcg) || + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + return 0; + +- *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); +- if (!*need_aging) ++ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) + return nr_to_scan; + + /* skip the aging path at the default priority */ + if (sc->priority == DEF_PRIORITY) +- goto done; ++ return nr_to_scan; + +- /* leave the work to lru_gen_age_node() */ +- if (current_is_kswapd()) +- return 0; ++ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false); + +- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) +- return nr_to_scan; +-done: +- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; ++ /* skip this lruvec as it's low on cold pages */ ++ return 0; + } + + static unsigned long get_nr_to_reclaim(struct scan_control *sc) +@@ -4875,9 +4873,7 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc) + static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + struct blk_plug plug; +- bool need_aging = false; + unsigned long scanned = 0; +- unsigned long reclaimed = sc->nr_reclaimed; + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + + lru_add_drain(); +@@ -4898,13 +4894,13 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + else + swappiness = 0; + +- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); + if (!nr_to_scan) +- goto done; ++ break; + + delta = evict_pages(lruvec, sc, swappiness); + if (!delta) +- goto done; ++ break; + + scanned += delta; + if (scanned >= nr_to_scan) +@@ -4916,10 +4912,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + cond_resched(); + } + +- /* see the comment in lru_gen_age_node() */ +- if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) +- sc->memcgs_need_aging = false; +-done: + clear_mm_walk(); + + blk_finish_plug(&plug); +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.3-25-mm-multi-gen-LRU-shuffle-should_run_aging.patch b/target/linux/generic/backport-5.15/020-v6.3-25-mm-multi-gen-LRU-shuffle-should_run_aging.patch new file mode 100644 index 0000000000..bb5402a3f2 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.3-25-mm-multi-gen-LRU-shuffle-should_run_aging.patch @@ -0,0 +1,166 @@ +From 107d54931df3c28d81648122e219bf0034ef4e99 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Wed, 21 Dec 2022 21:19:03 -0700 +Subject: [PATCH 25/29] mm: multi-gen LRU: shuffle should_run_aging() + +Move should_run_aging() next to its only caller left. + +Link: https://lkml.kernel.org/r/20221222041905.2431096-6-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Roman Gushchin <roman.gushchin@linux.dev> +Cc: Suren Baghdasaryan <surenb@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + mm/vmscan.c | 124 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 62 insertions(+), 62 deletions(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 7159436872ba..cb026e2714d7 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -4183,68 +4183,6 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + return true; + } + +-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, +- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +-{ +- int gen, type, zone; +- unsigned long old = 0; +- unsigned long young = 0; +- unsigned long total = 0; +- struct lru_gen_page *lrugen = &lruvec->lrugen; +- struct mem_cgroup *memcg = lruvec_memcg(lruvec); +- DEFINE_MIN_SEQ(lruvec); +- +- /* whether this lruvec is completely out of cold pages */ +- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { +- *nr_to_scan = 0; +- return true; +- } +- +- for (type = !can_swap; type < ANON_AND_FILE; type++) { +- unsigned long seq; +- +- for (seq = min_seq[type]; seq <= max_seq; seq++) { +- unsigned long size = 0; +- +- gen = lru_gen_from_seq(seq); +- +- for (zone = 0; zone < MAX_NR_ZONES; zone++) +- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); +- +- total += size; +- if (seq == max_seq) +- young += size; +- else if (seq + MIN_NR_GENS == max_seq) +- old += size; +- } +- } +- +- /* try to scrape all its memory if this memcg was deleted */ +- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; +- +- /* +- * The aging tries to be lazy to reduce the overhead, while the eviction +- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the +- * ideal number of generations is MIN_NR_GENS+1. +- */ +- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) +- return false; +- +- /* +- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) +- * of the total number of pages for each generation. A reasonable range +- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The +- * aging cares about the upper bound of hot pages, while the eviction +- * cares about the lower bound of cold pages. +- */ +- if (young * MIN_NR_GENS > total) +- return true; +- if (old * (MIN_NR_GENS + 2) < total) +- return true; +- +- return false; +-} +- + static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) + { + int gen, type, zone; +@@ -4828,6 +4766,68 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp + return scanned; + } + ++static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, ++ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) ++{ ++ int gen, type, zone; ++ unsigned long old = 0; ++ unsigned long young = 0; ++ unsigned long total = 0; ++ struct lru_gen_page *lrugen = &lruvec->lrugen; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ /* whether this lruvec is completely out of cold pages */ ++ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { ++ *nr_to_scan = 0; ++ return true; ++ } ++ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ unsigned long seq; ++ ++ for (seq = min_seq[type]; seq <= max_seq; seq++) { ++ unsigned long size = 0; ++ ++ gen = lru_gen_from_seq(seq); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); ++ ++ total += size; ++ if (seq == max_seq) ++ young += size; ++ else if (seq + MIN_NR_GENS == max_seq) ++ old += size; ++ } ++ } ++ ++ /* try to scrape all its memory if this memcg was deleted */ ++ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; ++ ++ /* ++ * The aging tries to be lazy to reduce the overhead, while the eviction ++ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the ++ * ideal number of generations is MIN_NR_GENS+1. ++ */ ++ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) ++ return false; ++ ++ /* ++ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) ++ * of the total number of pages for each generation. A reasonable range ++ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The ++ * aging cares about the upper bound of hot pages, while the eviction ++ * cares about the lower bound of cold pages. ++ */ ++ if (young * MIN_NR_GENS > total) ++ return true; ++ if (old * (MIN_NR_GENS + 2) < total) ++ return true; ++ ++ return false; ++} ++ + /* + * For future optimizations: + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.3-26-mm-multi-gen-LRU-per-node-lru_gen_page-lists.patch b/target/linux/generic/backport-5.15/020-v6.3-26-mm-multi-gen-LRU-per-node-lru_gen_page-lists.patch new file mode 100644 index 0000000000..2a62570346 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.3-26-mm-multi-gen-LRU-per-node-lru_gen_page-lists.patch @@ -0,0 +1,884 @@ +From fa6363828d314e837c5f79e97ea5e8c0d2f7f062 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Wed, 21 Dec 2022 21:19:04 -0700 +Subject: [PATCH 26/29] mm: multi-gen LRU: per-node lru_gen_page lists + +For each node, memcgs are divided into two generations: the old and +the young. For each generation, memcgs are randomly sharded into +multiple bins to improve scalability. For each bin, an RCU hlist_nulls +is virtually divided into three segments: the head, the tail and the +default. + +An onlining memcg is added to the tail of a random bin in the old +generation. The eviction starts at the head of a random bin in the old +generation. The per-node memcg generation counter, whose reminder (mod +2) indexes the old generation, is incremented when all its bins become +empty. + +There are four operations: +1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in + its current generation (old or young) and updates its "seg" to + "head"; +2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in + its current generation (old or young) and updates its "seg" to + "tail"; +3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in + the old generation, updates its "gen" to "old" and resets its "seg" + to "default"; +4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin + in the young generation, updates its "gen" to "young" and resets + its "seg" to "default". + +The events that trigger the above operations are: +1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; +2. The first attempt to reclaim an memcg below low, which triggers + MEMCG_LRU_TAIL; +3. The first attempt to reclaim an memcg below reclaimable size + threshold, which triggers MEMCG_LRU_TAIL; +4. The second attempt to reclaim an memcg below reclaimable size + threshold, which triggers MEMCG_LRU_YOUNG; +5. Attempting to reclaim an memcg below min, which triggers + MEMCG_LRU_YOUNG; +6. Finishing the aging on the eviction path, which triggers + MEMCG_LRU_YOUNG; +7. Offlining an memcg, which triggers MEMCG_LRU_OLD. + +Note that memcg LRU only applies to global reclaim, and the +round-robin incrementing of their max_seq counters ensures the +eventual fairness to all eligible memcgs. For memcg reclaim, it still +relies on mem_cgroup_iter(). + +Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Roman Gushchin <roman.gushchin@linux.dev> +Cc: Suren Baghdasaryan <surenb@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/memcontrol.h | 10 + + include/linux/mm_inline.h | 17 ++ + include/linux/mmzone.h | 117 +++++++++++- + mm/memcontrol.c | 16 ++ + mm/page_alloc.c | 1 + + mm/vmscan.c | 373 +++++++++++++++++++++++++++++++++---- + 6 files changed, 499 insertions(+), 35 deletions(-) + +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index 3736405cbcf6..2e405fd88846 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -818,6 +818,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) + percpu_ref_put(&objcg->refcnt); + } + ++static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) ++{ ++ return !memcg || css_tryget(&memcg->css); ++} ++ + static inline void mem_cgroup_put(struct mem_cgroup *memcg) + { + if (memcg) +@@ -1283,6 +1288,11 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) + return NULL; + } + ++static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) ++{ ++ return true; ++} ++ + static inline void mem_cgroup_put(struct mem_cgroup *memcg) + { + } +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index 4adc9ba59569..9138c2e638ce 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -112,6 +112,18 @@ static inline bool lru_gen_in_fault(void) + return current->in_lru_fault; + } + ++#ifdef CONFIG_MEMCG ++static inline int lru_gen_memcg_seg(struct lruvec *lruvec) ++{ ++ return READ_ONCE(lruvec->lrugen.seg); ++} ++#else ++static inline int lru_gen_memcg_seg(struct lruvec *lruvec) ++{ ++ return 0; ++} ++#endif ++ + static inline int lru_gen_from_seq(unsigned long seq) + { + return seq % MAX_NR_GENS; +@@ -287,6 +299,11 @@ static inline bool lru_gen_in_fault(void) + return false; + } + ++static inline int lru_gen_memcg_seg(struct lruvec *lruvec) ++{ ++ return 0; ++} ++ + static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming) + { + return false; +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 7b8a26aaf381..4bbf191517e2 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -7,6 +7,7 @@ + + #include <linux/spinlock.h> + #include <linux/list.h> ++#include <linux/list_nulls.h> + #include <linux/wait.h> + #include <linux/bitops.h> + #include <linux/cache.h> +@@ -357,6 +358,15 @@ struct page_vma_mapped_walk; + #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) + #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) + ++/* see the comment on MEMCG_NR_GENS */ ++enum { ++ MEMCG_LRU_NOP, ++ MEMCG_LRU_HEAD, ++ MEMCG_LRU_TAIL, ++ MEMCG_LRU_OLD, ++ MEMCG_LRU_YOUNG, ++}; ++ + #ifdef CONFIG_LRU_GEN + + enum { +@@ -416,6 +426,14 @@ struct lru_gen_page { + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + /* whether the multi-gen LRU is enabled */ + bool enabled; ++#ifdef CONFIG_MEMCG ++ /* the memcg generation this lru_gen_page belongs to */ ++ u8 gen; ++ /* the list segment this lru_gen_page belongs to */ ++ u8 seg; ++ /* per-node lru_gen_page list for global reclaim */ ++ struct hlist_nulls_node list; ++#endif + }; + + enum { +@@ -469,12 +487,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec); + void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + + #ifdef CONFIG_MEMCG ++ ++/* ++ * For each node, memcgs are divided into two generations: the old and the ++ * young. For each generation, memcgs are randomly sharded into multiple bins ++ * to improve scalability. For each bin, the hlist_nulls is virtually divided ++ * into three segments: the head, the tail and the default. ++ * ++ * An onlining memcg is added to the tail of a random bin in the old generation. ++ * The eviction starts at the head of a random bin in the old generation. The ++ * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes ++ * the old generation, is incremented when all its bins become empty. ++ * ++ * There are four operations: ++ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its ++ * current generation (old or young) and updates its "seg" to "head"; ++ * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its ++ * current generation (old or young) and updates its "seg" to "tail"; ++ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old ++ * generation, updates its "gen" to "old" and resets its "seg" to "default"; ++ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the ++ * young generation, updates its "gen" to "young" and resets its "seg" to ++ * "default". ++ * ++ * The events that trigger the above operations are: ++ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; ++ * 2. The first attempt to reclaim an memcg below low, which triggers ++ * MEMCG_LRU_TAIL; ++ * 3. The first attempt to reclaim an memcg below reclaimable size threshold, ++ * which triggers MEMCG_LRU_TAIL; ++ * 4. The second attempt to reclaim an memcg below reclaimable size threshold, ++ * which triggers MEMCG_LRU_YOUNG; ++ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; ++ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; ++ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. ++ * ++ * Note that memcg LRU only applies to global reclaim, and the round-robin ++ * incrementing of their max_seq counters ensures the eventual fairness to all ++ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). ++ */ ++#define MEMCG_NR_GENS 2 ++#define MEMCG_NR_BINS 8 ++ ++struct lru_gen_memcg { ++ /* the per-node memcg generation counter */ ++ unsigned long seq; ++ /* each memcg has one lru_gen_page per node */ ++ unsigned long nr_memcgs[MEMCG_NR_GENS]; ++ /* per-node lru_gen_page list for global reclaim */ ++ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; ++ /* protects the above */ ++ spinlock_t lock; ++}; ++ ++void lru_gen_init_pgdat(struct pglist_data *pgdat); ++ + void lru_gen_init_memcg(struct mem_cgroup *memcg); + void lru_gen_exit_memcg(struct mem_cgroup *memcg); +-#endif ++void lru_gen_online_memcg(struct mem_cgroup *memcg); ++void lru_gen_offline_memcg(struct mem_cgroup *memcg); ++void lru_gen_release_memcg(struct mem_cgroup *memcg); ++void lru_gen_rotate_memcg(struct lruvec *lruvec, int op); ++ ++#else /* !CONFIG_MEMCG */ ++ ++#define MEMCG_NR_GENS 1 ++ ++struct lru_gen_memcg { ++}; ++ ++static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) ++{ ++} ++ ++#endif /* CONFIG_MEMCG */ + + #else /* !CONFIG_LRU_GEN */ + ++static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) ++{ ++} ++ + static inline void lru_gen_init_lruvec(struct lruvec *lruvec) + { + } +@@ -484,6 +577,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + } + + #ifdef CONFIG_MEMCG ++ + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { + } +@@ -491,7 +585,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) + { + } +-#endif ++ ++static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) ++{ ++} ++ ++#endif /* CONFIG_MEMCG */ + + #endif /* CONFIG_LRU_GEN */ + +@@ -1105,6 +1216,8 @@ typedef struct pglist_data { + #ifdef CONFIG_LRU_GEN + /* kswap mm walk data */ + struct lru_gen_mm_walk mm_walk; ++ /* lru_gen_page list */ ++ struct lru_gen_memcg memcg_lru; + #endif + + ZONE_PADDING(_pad2_) +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index ed87d1256f0e..172adfbee06e 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -549,6 +549,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) + struct mem_cgroup_per_node *mz; + struct mem_cgroup_tree_per_node *mctz; + ++ if (lru_gen_enabled()) { ++ struct lruvec *lruvec = &mem_cgroup_page_nodeinfo(memcg, page)->lruvec; ++ ++ /* see the comment on MEMCG_NR_GENS */ ++ if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) ++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); ++ ++ return; ++ } ++ + mctz = soft_limit_tree_from_page(page); + if (!mctz) + return; +@@ -3433,6 +3443,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, + unsigned long excess; + unsigned long nr_scanned; + ++ if (lru_gen_enabled()) ++ return 0; ++ + if (order > 0) + return 0; + +@@ -5321,6 +5334,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) + if (unlikely(mem_cgroup_is_root(memcg))) + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + 2UL*HZ); ++ lru_gen_online_memcg(memcg); + return 0; + } + +@@ -5347,6 +5361,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) + memcg_offline_kmem(memcg); + reparent_shrinker_deferred(memcg); + wb_memcg_offline(memcg); ++ lru_gen_offline_memcg(memcg); + + drain_all_stock(memcg); + +@@ -5358,6 +5373,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + invalidate_reclaim_iterators(memcg); ++ lru_gen_release_memcg(memcg); + } + + static void mem_cgroup_css_free(struct cgroup_subsys_state *css) +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index c929357fbefe..6459d9c018be 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -7645,6 +7645,7 @@ static void __init free_area_init_node(int nid) + pgdat_set_deferred_range(pgdat); + + free_area_init_core(pgdat); ++ lru_gen_init_pgdat(pgdat); + } + + void __init free_area_init_memoryless_node(int nid) +diff --git a/mm/vmscan.c b/mm/vmscan.c +index cb026e2714d7..3d8e0665186c 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -54,6 +54,8 @@ + #include <linux/shmem_fs.h> + #include <linux/ctype.h> + #include <linux/debugfs.h> ++#include <linux/rculist_nulls.h> ++#include <linux/random.h> + + #include <asm/tlbflush.h> + #include <asm/div64.h> +@@ -129,11 +131,6 @@ struct scan_control { + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + +-#ifdef CONFIG_LRU_GEN +- /* help kswapd make better choices among multiple memcgs */ +- unsigned long last_reclaimed; +-#endif +- + /* Allocation order */ + s8 order; + +@@ -2880,6 +2877,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) + ++#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) ++#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) ++ + static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) + { + struct pglist_data *pgdat = NODE_DATA(nid); +@@ -4169,8 +4169,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + if (sc->priority <= DEF_PRIORITY - 2) + wait_event_killable(lruvec->mm_state.wait, + max_seq < READ_ONCE(lrugen->max_seq)); +- +- return max_seq < READ_ONCE(lrugen->max_seq); ++ return false; + } + + VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); +@@ -4243,8 +4242,6 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + + VM_WARN_ON_ONCE(!current_is_kswapd()); + +- sc->last_reclaimed = sc->nr_reclaimed; +- + /* check the order to exclude compaction-induced reclaim */ + if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) + return; +@@ -4833,8 +4830,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg + * reclaim. + */ +-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, +- bool can_swap) ++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) + { + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); +@@ -4851,10 +4847,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * + if (sc->priority == DEF_PRIORITY) + return nr_to_scan; + +- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false); +- + /* skip this lruvec as it's low on cold pages */ +- return 0; ++ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; + } + + static unsigned long get_nr_to_reclaim(struct scan_control *sc) +@@ -4863,29 +4857,18 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc) + if (!global_reclaim(sc)) + return -1; + +- /* discount the previous progress for kswapd */ +- if (current_is_kswapd()) +- return sc->nr_to_reclaim + sc->last_reclaimed; +- + return max(sc->nr_to_reclaim, compact_gap(sc->order)); + } + +-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { +- struct blk_plug plug; ++ long nr_to_scan; + unsigned long scanned = 0; + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + +- lru_add_drain(); +- +- blk_start_plug(&plug); +- +- set_mm_walk(lruvec_pgdat(lruvec)); +- + while (true) { + int delta; + int swappiness; +- unsigned long nr_to_scan; + + if (sc->may_swap) + swappiness = get_swappiness(lruvec, sc); +@@ -4895,7 +4878,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + swappiness = 0; + + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); +- if (!nr_to_scan) ++ if (nr_to_scan <= 0) + break; + + delta = evict_pages(lruvec, sc, swappiness); +@@ -4912,11 +4895,251 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + cond_resched(); + } + ++ /* whether try_to_inc_max_seq() was successful */ ++ return nr_to_scan < 0; ++} ++ ++static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ bool success; ++ unsigned long scanned = sc->nr_scanned; ++ unsigned long reclaimed = sc->nr_reclaimed; ++ int seg = lru_gen_memcg_seg(lruvec); ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ /* see the comment on MEMCG_NR_GENS */ ++ if (!lruvec_is_sizable(lruvec, sc)) ++ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; ++ ++ mem_cgroup_calculate_protection(NULL, memcg); ++ ++ if (mem_cgroup_below_min(memcg)) ++ return MEMCG_LRU_YOUNG; ++ ++ if (mem_cgroup_below_low(memcg)) { ++ /* see the comment on MEMCG_NR_GENS */ ++ if (seg != MEMCG_LRU_TAIL) ++ return MEMCG_LRU_TAIL; ++ ++ memcg_memory_event(memcg, MEMCG_LOW); ++ } ++ ++ success = try_to_shrink_lruvec(lruvec, sc); ++ ++ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); ++ ++ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, ++ sc->nr_reclaimed - reclaimed); ++ ++ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; ++ current->reclaim_state->reclaimed_slab = 0; ++ ++ return success ? MEMCG_LRU_YOUNG : 0; ++} ++ ++#ifdef CONFIG_MEMCG ++ ++static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ int gen; ++ int bin; ++ int first_bin; ++ struct lruvec *lruvec; ++ struct lru_gen_page *lrugen; ++ const struct hlist_nulls_node *pos; ++ int op = 0; ++ struct mem_cgroup *memcg = NULL; ++ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); ++ ++ bin = first_bin = prandom_u32_max(MEMCG_NR_BINS); ++restart: ++ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); ++ ++ rcu_read_lock(); ++ ++ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { ++ if (op) ++ lru_gen_rotate_memcg(lruvec, op); ++ ++ mem_cgroup_put(memcg); ++ ++ lruvec = container_of(lrugen, struct lruvec, lrugen); ++ memcg = lruvec_memcg(lruvec); ++ ++ if (!mem_cgroup_tryget(memcg)) { ++ op = 0; ++ memcg = NULL; ++ continue; ++ } ++ ++ rcu_read_unlock(); ++ ++ op = shrink_one(lruvec, sc); ++ ++ if (sc->nr_reclaimed >= nr_to_reclaim) ++ goto success; ++ ++ rcu_read_lock(); ++ } ++ ++ rcu_read_unlock(); ++ ++ /* restart if raced with lru_gen_rotate_memcg() */ ++ if (gen != get_nulls_value(pos)) ++ goto restart; ++ ++ /* try the rest of the bins of the current generation */ ++ bin = get_memcg_bin(bin + 1); ++ if (bin != first_bin) ++ goto restart; ++success: ++ if (op) ++ lru_gen_rotate_memcg(lruvec, op); ++ ++ mem_cgroup_put(memcg); ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ ++ VM_WARN_ON_ONCE(global_reclaim(sc)); ++ ++ lru_add_drain(); ++ ++ blk_start_plug(&plug); ++ ++ set_mm_walk(lruvec_pgdat(lruvec)); ++ ++ if (try_to_shrink_lruvec(lruvec, sc)) ++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); ++ ++ clear_mm_walk(); ++ ++ blk_finish_plug(&plug); ++} ++ ++#else /* !CONFIG_MEMCG */ ++ ++static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ BUILD_BUG(); ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ BUILD_BUG(); ++} ++ ++#endif ++ ++static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ int priority; ++ unsigned long reclaimable; ++ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); ++ ++ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) ++ return; ++ /* ++ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> ++ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the ++ * estimated reclaimed_to_scanned_ratio = inactive / total. ++ */ ++ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); ++ if (get_swappiness(lruvec, sc)) ++ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); ++ ++ reclaimable /= MEMCG_NR_GENS; ++ ++ /* round down reclaimable and round up sc->nr_to_reclaim */ ++ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); ++ ++ sc->priority = clamp(priority, 0, DEF_PRIORITY); ++} ++ ++static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ unsigned long reclaimed = sc->nr_reclaimed; ++ ++ VM_WARN_ON_ONCE(!global_reclaim(sc)); ++ ++ lru_add_drain(); ++ ++ blk_start_plug(&plug); ++ ++ set_mm_walk(pgdat); ++ ++ set_initial_priority(pgdat, sc); ++ ++ if (current_is_kswapd()) ++ sc->nr_reclaimed = 0; ++ ++ if (mem_cgroup_disabled()) ++ shrink_one(&pgdat->__lruvec, sc); ++ else ++ shrink_many(pgdat, sc); ++ ++ if (current_is_kswapd()) ++ sc->nr_reclaimed += reclaimed; ++ + clear_mm_walk(); + + blk_finish_plug(&plug); ++ ++ /* kswapd should never fail */ ++ pgdat->kswapd_failures = 0; + } + ++#ifdef CONFIG_MEMCG ++void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) ++{ ++ int seg; ++ int old, new; ++ int bin = prandom_u32_max(MEMCG_NR_BINS); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ spin_lock(&pgdat->memcg_lru.lock); ++ ++ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); ++ ++ seg = 0; ++ new = old = lruvec->lrugen.gen; ++ ++ /* see the comment on MEMCG_NR_GENS */ ++ if (op == MEMCG_LRU_HEAD) ++ seg = MEMCG_LRU_HEAD; ++ else if (op == MEMCG_LRU_TAIL) ++ seg = MEMCG_LRU_TAIL; ++ else if (op == MEMCG_LRU_OLD) ++ new = get_memcg_gen(pgdat->memcg_lru.seq); ++ else if (op == MEMCG_LRU_YOUNG) ++ new = get_memcg_gen(pgdat->memcg_lru.seq + 1); ++ else ++ VM_WARN_ON_ONCE(true); ++ ++ hlist_nulls_del_rcu(&lruvec->lrugen.list); ++ ++ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) ++ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); ++ else ++ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); ++ ++ pgdat->memcg_lru.nr_memcgs[old]--; ++ pgdat->memcg_lru.nr_memcgs[new]++; ++ ++ lruvec->lrugen.gen = new; ++ WRITE_ONCE(lruvec->lrugen.seg, seg); ++ ++ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) ++ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); ++ ++ spin_unlock(&pgdat->memcg_lru.lock); ++} ++#endif ++ + /****************************************************************************** + * state change + ******************************************************************************/ +@@ -5370,11 +5593,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, + + if (!mem_cgroup_disabled()) { + rcu_read_lock(); ++ + memcg = mem_cgroup_from_id(memcg_id); +-#ifdef CONFIG_MEMCG +- if (memcg && !css_tryget(&memcg->css)) ++ if (!mem_cgroup_tryget(memcg)) + memcg = NULL; +-#endif ++ + rcu_read_unlock(); + + if (!memcg) +@@ -5521,6 +5744,19 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + } + + #ifdef CONFIG_MEMCG ++ ++void lru_gen_init_pgdat(struct pglist_data *pgdat) ++{ ++ int i, j; ++ ++ spin_lock_init(&pgdat->memcg_lru.lock); ++ ++ for (i = 0; i < MEMCG_NR_GENS; i++) { ++ for (j = 0; j < MEMCG_NR_BINS; j++) ++ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); ++ } ++} ++ + void lru_gen_init_memcg(struct mem_cgroup *memcg) + { + INIT_LIST_HEAD(&memcg->mm_list.fifo); +@@ -5544,7 +5780,69 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) + } + } + } +-#endif ++ ++void lru_gen_online_memcg(struct mem_cgroup *memcg) ++{ ++ int gen; ++ int nid; ++ int bin = prandom_u32_max(MEMCG_NR_BINS); ++ ++ for_each_node(nid) { ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ spin_lock(&pgdat->memcg_lru.lock); ++ ++ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); ++ ++ gen = get_memcg_gen(pgdat->memcg_lru.seq); ++ ++ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); ++ pgdat->memcg_lru.nr_memcgs[gen]++; ++ ++ lruvec->lrugen.gen = gen; ++ ++ spin_unlock(&pgdat->memcg_lru.lock); ++ } ++} ++ ++void lru_gen_offline_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); ++ } ++} ++ ++void lru_gen_release_memcg(struct mem_cgroup *memcg) ++{ ++ int gen; ++ int nid; ++ ++ for_each_node(nid) { ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ spin_lock(&pgdat->memcg_lru.lock); ++ ++ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); ++ ++ gen = lruvec->lrugen.gen; ++ ++ hlist_nulls_del_rcu(&lruvec->lrugen.list); ++ pgdat->memcg_lru.nr_memcgs[gen]--; ++ ++ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) ++ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); ++ ++ spin_unlock(&pgdat->memcg_lru.lock); ++ } ++} ++ ++#endif /* CONFIG_MEMCG */ + + static int __init init_lru_gen(void) + { +@@ -5571,6 +5869,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + { + } + ++static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -5584,7 +5886,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + bool proportional_reclaim; + struct blk_plug plug; + +- if (lru_gen_enabled()) { ++ if (lru_gen_enabled() && !global_reclaim(sc)) { + lru_gen_shrink_lruvec(lruvec, sc); + return; + } +@@ -5826,6 +6128,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + struct lruvec *target_lruvec; + bool reclaimable = false; + ++ if (lru_gen_enabled() && global_reclaim(sc)) { ++ lru_gen_shrink_node(pgdat, sc); ++ return; ++ } ++ + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + again: +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.3-27-mm-multi-gen-LRU-clarify-scan_control-flags.patch b/target/linux/generic/backport-5.15/020-v6.3-27-mm-multi-gen-LRU-clarify-scan_control-flags.patch new file mode 100644 index 0000000000..59d2c82b56 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.3-27-mm-multi-gen-LRU-clarify-scan_control-flags.patch @@ -0,0 +1,201 @@ +From 93147736b5b3a21bea24313bfc7a696829932009 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Wed, 21 Dec 2022 21:19:05 -0700 +Subject: [PATCH 27/29] mm: multi-gen LRU: clarify scan_control flags + +Among the flags in scan_control: +1. sc->may_swap, which indicates swap constraint due to memsw.max, is + supported as usual. +2. sc->proactive, which indicates reclaim by memory.reclaim, may not + opportunistically skip the aging path, since it is considered less + latency sensitive. +3. !(sc->gfp_mask & __GFP_IO), which indicates IO constraint, lowers + swappiness to prioritize file LRU, since clean file pages are more + likely to exist. +4. sc->may_writepage and sc->may_unmap, which indicates opportunistic + reclaim, are rejected, since unmapped clean pages are already + prioritized. Scanning for more of them is likely futile and can + cause high reclaim latency when there is a large number of memcgs. + +The rest are handled by the existing code. + +Link: https://lkml.kernel.org/r/20221222041905.2431096-8-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Roman Gushchin <roman.gushchin@linux.dev> +Cc: Suren Baghdasaryan <surenb@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + mm/vmscan.c | 55 +++++++++++++++++++++++++++-------------------------- + 1 file changed, 28 insertions(+), 27 deletions(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 3d8e0665186c..4bcb93df316c 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2905,6 +2905,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + ++ if (!sc->may_swap) ++ return 0; ++ + if (!can_demote(pgdat->node_id, sc) && + mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) + return 0; +@@ -3952,7 +3955,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ + } while (err == -EAGAIN); + } + +-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) ++static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) + { + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + +@@ -3960,7 +3963,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) + VM_WARN_ON_ONCE(walk); + + walk = &pgdat->mm_walk; +- } else if (!pgdat && !walk) { ++ } else if (!walk && force_alloc) { + VM_WARN_ON_ONCE(current_is_kswapd()); + + walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); +@@ -4146,7 +4149,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + goto done; + } + +- walk = set_mm_walk(NULL); ++ walk = set_mm_walk(NULL, true); + if (!walk) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; +@@ -4215,8 +4218,6 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MIN_SEQ(lruvec); + +- VM_WARN_ON_ONCE(sc->memcg_low_reclaim); +- + /* see the comment on lru_gen_page */ + gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); +@@ -4472,12 +4473,8 @@ static bool isolate_page(struct lruvec *lruvec, struct page *page, struct scan_c + { + bool success; + +- /* unmapping inhibited */ +- if (!sc->may_unmap && page_mapped(page)) +- return false; +- + /* swapping inhibited */ +- if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && ++ if (!(sc->gfp_mask & __GFP_IO) && + (PageDirty(page) || + (PageAnon(page) && !PageSwapCache(page)))) + return false; +@@ -4574,9 +4571,8 @@ static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, + __count_vm_events(PGSCAN_ANON + type, isolated); + + /* +- * There might not be eligible pages due to reclaim_idx, may_unmap and +- * may_writepage. Check the remaining to prevent livelock if it's not +- * making progress. ++ * There might not be eligible pages due to reclaim_idx. Check the ++ * remaining to prevent livelock if it's not making progress. + */ + return isolated || !remaining ? scanned : 0; + } +@@ -4836,8 +4832,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + +- if (mem_cgroup_below_min(memcg) || +- (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) ++ if (mem_cgroup_below_min(memcg)) + return 0; + + if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) +@@ -4865,17 +4860,14 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + long nr_to_scan; + unsigned long scanned = 0; + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); ++ int swappiness = get_swappiness(lruvec, sc); ++ ++ /* clean file pages are more likely to exist */ ++ if (swappiness && !(sc->gfp_mask & __GFP_IO)) ++ swappiness = 1; + + while (true) { + int delta; +- int swappiness; +- +- if (sc->may_swap) +- swappiness = get_swappiness(lruvec, sc); +- else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) +- swappiness = 1; +- else +- swappiness = 0; + + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); + if (nr_to_scan <= 0) +@@ -5005,12 +4997,13 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + struct blk_plug plug; + + VM_WARN_ON_ONCE(global_reclaim(sc)); ++ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); + + lru_add_drain(); + + blk_start_plug(&plug); + +- set_mm_walk(lruvec_pgdat(lruvec)); ++ set_mm_walk(NULL, false); + + if (try_to_shrink_lruvec(lruvec, sc)) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); +@@ -5066,11 +5059,19 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * + + VM_WARN_ON_ONCE(!global_reclaim(sc)); + ++ /* ++ * Unmapped clean pages are already prioritized. Scanning for more of ++ * them is likely futile and can cause high reclaim latency when there ++ * is a large number of memcgs. ++ */ ++ if (!sc->may_writepage || !sc->may_unmap) ++ goto done; ++ + lru_add_drain(); + + blk_start_plug(&plug); + +- set_mm_walk(pgdat); ++ set_mm_walk(pgdat, false); + + set_initial_priority(pgdat, sc); + +@@ -5088,7 +5089,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * + clear_mm_walk(); + + blk_finish_plug(&plug); +- ++done: + /* kswapd should never fail */ + pgdat->kswapd_failures = 0; + } +@@ -5656,7 +5657,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + blk_start_plug(&plug); +- if (!set_mm_walk(NULL)) { ++ if (!set_mm_walk(NULL, true)) { + err = -ENOMEM; + goto done; + } +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.3-28-mm-multi-gen-LRU-simplify-arch_has_hw_pte_young-chec.patch b/target/linux/generic/backport-5.15/020-v6.3-28-mm-multi-gen-LRU-simplify-arch_has_hw_pte_young-chec.patch new file mode 100644 index 0000000000..a6c7b019ce --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.3-28-mm-multi-gen-LRU-simplify-arch_has_hw_pte_young-chec.patch @@ -0,0 +1,39 @@ +From cf3297e4c7a928da8b2b2f0baff2f9c69ea57952 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Wed, 21 Dec 2022 21:19:06 -0700 +Subject: [PATCH 28/29] mm: multi-gen LRU: simplify arch_has_hw_pte_young() + check + +Scanning page tables when hardware does not set the accessed bit has +no real use cases. + +Link: https://lkml.kernel.org/r/20221222041905.2431096-9-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Roman Gushchin <roman.gushchin@linux.dev> +Cc: Suren Baghdasaryan <surenb@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + mm/vmscan.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 4bcb93df316c..3f6874a69886 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -4144,7 +4144,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. + */ +- if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { ++ if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } +-- +2.40.0 + diff --git a/target/linux/generic/backport-5.15/020-v6.3-29-mm-multi-gen-LRU-avoid-futile-retries.patch b/target/linux/generic/backport-5.15/020-v6.3-29-mm-multi-gen-LRU-avoid-futile-retries.patch new file mode 100644 index 0000000000..14e1aff177 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.3-29-mm-multi-gen-LRU-avoid-futile-retries.patch @@ -0,0 +1,93 @@ +From cc67f962cc53f6e1dfa92eb85b7b26fe83a3c66f Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Mon, 13 Feb 2023 00:53:22 -0700 +Subject: [PATCH 29/29] mm: multi-gen LRU: avoid futile retries + +Recall that the per-node memcg LRU has two generations and they alternate +when the last memcg (of a given node) is moved from one to the other. +Each generation is also sharded into multiple bins to improve scalability. +A reclaimer starts with a random bin (in the old generation) and, if it +fails, it will retry, i.e., to try the rest of the bins. + +If a reclaimer fails with the last memcg, it should move this memcg to the +young generation first, which causes the generations to alternate, and +then retry. Otherwise, the retries will be futile because all other bins +are empty. + +Link: https://lkml.kernel.org/r/20230213075322.1416966-1-yuzhao@google.com +Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists") +Signed-off-by: Yu Zhao <yuzhao@google.com> +Reported-by: T.J. Mercier <tjmercier@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + mm/vmscan.c | 25 +++++++++++++++---------- + 1 file changed, 15 insertions(+), 10 deletions(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 3f6874a69886..0b76774963ff 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -4934,18 +4934,20 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) + + static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) + { ++ int op; + int gen; + int bin; + int first_bin; + struct lruvec *lruvec; + struct lru_gen_page *lrugen; ++ struct mem_cgroup *memcg; + const struct hlist_nulls_node *pos; +- int op = 0; +- struct mem_cgroup *memcg = NULL; + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + + bin = first_bin = prandom_u32_max(MEMCG_NR_BINS); + restart: ++ op = 0; ++ memcg = NULL; + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); + + rcu_read_lock(); +@@ -4969,14 +4971,22 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) + + op = shrink_one(lruvec, sc); + +- if (sc->nr_reclaimed >= nr_to_reclaim) +- goto success; +- + rcu_read_lock(); ++ ++ if (sc->nr_reclaimed >= nr_to_reclaim) ++ break; + } + + rcu_read_unlock(); + ++ if (op) ++ lru_gen_rotate_memcg(lruvec, op); ++ ++ mem_cgroup_put(memcg); ++ ++ if (sc->nr_reclaimed >= nr_to_reclaim) ++ return; ++ + /* restart if raced with lru_gen_rotate_memcg() */ + if (gen != get_nulls_value(pos)) + goto restart; +@@ -4985,11 +4995,6 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) + bin = get_memcg_bin(bin + 1); + if (bin != first_bin) + goto restart; +-success: +- if (op) +- lru_gen_rotate_memcg(lruvec, op); +- +- mem_cgroup_put(memcg); + } + + static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +-- +2.40.0 + diff --git a/target/linux/generic/config-5.15 b/target/linux/generic/config-5.15 index b41de9cc28..c9581ba5d4 100644 --- a/target/linux/generic/config-5.15 +++ b/target/linux/generic/config-5.15 @@ -4425,7 +4425,6 @@ CONFIG_NMI_LOG_BUF_SHIFT=13 # CONFIG_NO_HZ is not set # CONFIG_NO_HZ_FULL is not set # CONFIG_NO_HZ_IDLE is not set -CONFIG_NR_LRU_GENS=7 # CONFIG_NS83820 is not set # CONFIG_NTB is not set # CONFIG_NTFS3_64BIT_CLUSTER is not set @@ -6529,7 +6528,6 @@ CONFIG_THIN_ARCHIVES=y # CONFIG_THUNDER_NIC_VF is not set # CONFIG_TICK_CPU_ACCOUNTING is not set CONFIG_TICK_ONESHOT=y -CONFIG_TIERS_PER_GEN=4 # CONFIG_TIFM_CORE is not set # CONFIG_TIGON3 is not set # CONFIG_TIMB_DMA is not set |