From 0d0928f58795e336646ad31ea96d2919b5328f39 Mon Sep 17 00:00:00 2001 From: Kazuki H Date: Tue, 21 Mar 2023 06:51:03 +0900 Subject: kernel: Update MGLRU patchset The current patches are old, update them from mainline. Backports taken from https://github.com/yuzhaogoogle/linux/commits/mglru-5.15 Tested-by: Kazuki H #mt7622/Linksys E8450 UBI Signed-off-by: Kazuki H --- .../020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch | 513 +++++++++++++++++++++ 1 file changed, 513 insertions(+) create mode 100644 target/linux/generic/backport-5.15/020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch (limited to 'target/linux/generic/backport-5.15/020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch') diff --git a/target/linux/generic/backport-5.15/020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch b/target/linux/generic/backport-5.15/020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch new file mode 100644 index 0000000000..0adb15f5e2 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch @@ -0,0 +1,513 @@ +From 640db3a029dca909af47157ca18f52b29d34a1b9 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Sun, 18 Sep 2022 02:00:07 -0600 +Subject: [PATCH 10/29] mm: multi-gen LRU: kill switch +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that +can be disabled include: + 0x0001: the multi-gen LRU core + 0x0002: walking page table, when arch_has_hw_pte_young() returns + true + 0x0004: clearing the accessed bit in non-leaf PMD entries, when + CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y + [yYnN]: apply to all the components above +E.g., + echo y >/sys/kernel/mm/lru_gen/enabled + cat /sys/kernel/mm/lru_gen/enabled + 0x0007 + echo 5 >/sys/kernel/mm/lru_gen/enabled + cat /sys/kernel/mm/lru_gen/enabled + 0x0005 + +NB: the page table walks happen on the scale of seconds under heavy memory +pressure, in which case the mmap_lock contention is a lesser concern, +compared with the LRU lock contention and the I/O congestion. So far the +only well-known case of the mmap_lock contention happens on Android, due +to Scudo [1] which allocates several thousand VMAs for merely a few +hundred MBs. The SPF and the Maple Tree also have provided their own +assessments [2][3]. However, if walking page tables does worsen the +mmap_lock contention, the kill switch can be used to disable it. In this +case the multi-gen LRU will suffer a minor performance degradation, as +shown previously. + +Clearing the accessed bit in non-leaf PMD entries can also be disabled, +since this behavior was not tested on x86 varieties other than Intel and +AMD. + +[1] https://source.android.com/devices/tech/debug/scudo +[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/ +[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/ + +Link: https://lkml.kernel.org/r/20220918080010.2920238-11-yuzhao@google.com +Signed-off-by: Yu Zhao +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +Cc: Andi Kleen +Cc: Aneesh Kumar K.V +Cc: Barry Song +Cc: Catalin Marinas +Cc: Dave Hansen +Cc: Hillf Danton +Cc: Jens Axboe +Cc: Johannes Weiner +Cc: Jonathan Corbet +Cc: Linus Torvalds +Cc: Matthew Wilcox +Cc: Mel Gorman +Cc: Miaohe Lin +Cc: Michael Larabel +Cc: Michal Hocko +Cc: Mike Rapoport +Cc: Mike Rapoport +Cc: Peter Zijlstra +Cc: Qi Zheng +Cc: Tejun Heo +Cc: Vlastimil Babka +Cc: Will Deacon +Signed-off-by: Andrew Morton +--- + include/linux/cgroup.h | 15 ++- + include/linux/mm_inline.h | 15 ++- + include/linux/mmzone.h | 9 ++ + kernel/cgroup/cgroup-internal.h | 1 - + mm/Kconfig | 6 + + mm/vmscan.c | 228 +++++++++++++++++++++++++++++++- + 6 files changed, 265 insertions(+), 9 deletions(-) + +diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h +index 45cdb12243e3..f9a5d6a81101 100644 +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgroup *cgrp) + css_put(&cgrp->self); + } + ++extern struct mutex cgroup_mutex; ++ ++static inline void cgroup_lock(void) ++{ ++ mutex_lock(&cgroup_mutex); ++} ++ ++static inline void cgroup_unlock(void) ++{ ++ mutex_unlock(&cgroup_mutex); ++} ++ + /** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for +@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgroup *cgrp) + * as locks used during the cgroup_subsys::attach() methods. + */ + #ifdef CONFIG_PROVE_RCU +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + #define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ +@@ -708,6 +719,8 @@ struct cgroup; + static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } + static inline void css_get(struct cgroup_subsys_state *css) {} + static inline void css_put(struct cgroup_subsys_state *css) {} ++static inline void cgroup_lock(void) {} ++static inline void cgroup_unlock(void) {} + static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) { return 0; } + static inline int cgroupstats_build(struct cgroupstats *stats, +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index 58aabb1ba020..e095c1c24311 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -91,10 +91,21 @@ static __always_inline enum lru_list page_lru(struct page *page) + + #ifdef CONFIG_LRU_GEN + ++#ifdef CONFIG_LRU_GEN_ENABLED + static inline bool lru_gen_enabled(void) + { +- return true; ++ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); ++ ++ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); ++} ++#else ++static inline bool lru_gen_enabled(void) ++{ ++ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); ++ ++ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); + } ++#endif + + static inline bool lru_gen_in_fault(void) + { +@@ -207,7 +218,7 @@ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bo + + VM_WARN_ON_ONCE_PAGE(gen != -1, page); + +- if (PageUnevictable(page)) ++ if (PageUnevictable(page) || !lrugen->enabled) + return false; + /* + * There are three common cases for this page: +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 659bab633bdf..edaf035503ed 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -364,6 +364,13 @@ enum { + LRU_GEN_FILE, + }; + ++enum { ++ LRU_GEN_CORE, ++ LRU_GEN_MM_WALK, ++ LRU_GEN_NONLEAF_YOUNG, ++ NR_LRU_GEN_CAPS ++}; ++ + #define MIN_LRU_BATCH BITS_PER_LONG + #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) + +@@ -405,6 +412,8 @@ struct lru_gen_struct { + /* can be modified without holding the LRU lock */ + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ /* whether the multi-gen LRU is enabled */ ++ bool enabled; + }; + + enum { +diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h +index d8fcc139ac05..28c32a01da7d 100644 +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -165,7 +165,6 @@ struct cgroup_mgctx { + #define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + extern struct cgroup_subsys *cgroup_subsys[]; + extern struct list_head cgroup_roots; +diff --git a/mm/Kconfig b/mm/Kconfig +index 62433f3cd7ae..4a7d0af3c39b 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -906,6 +906,12 @@ config LRU_GEN + help + A high performance LRU implementation to overcommit memory. + ++config LRU_GEN_ENABLED ++ bool "Enable by default" ++ depends on LRU_GEN ++ help ++ This option enables the multi-gen LRU by default. ++ + config LRU_GEN_STATS + bool "Full stats for debugging" + depends on LRU_GEN +diff --git a/mm/vmscan.c b/mm/vmscan.c +index b6f6fc2585e1..be37d996bc92 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -52,6 +52,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -2841,6 +2842,14 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, + + #ifdef CONFIG_LRU_GEN + ++#ifdef CONFIG_LRU_GEN_ENABLED ++DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); ++#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) ++#else ++DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); ++#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) ++#endif ++ + /****************************************************************************** + * shorthand helpers + ******************************************************************************/ +@@ -3717,7 +3726,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area + goto next; + + if (!pmd_trans_huge(pmd[i])) { +- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && ++ get_cap(LRU_GEN_NONLEAF_YOUNG)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + goto next; + } +@@ -3815,10 +3825,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + walk->mm_stats[MM_NONLEAF_TOTAL]++; + + #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG +- if (!pmd_young(val)) +- continue; ++ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) { ++ if (!pmd_young(val)) ++ continue; + +- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ } + #endif + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + continue; +@@ -4080,7 +4092,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. + */ +- if (!arch_has_hw_pte_young()) { ++ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } +@@ -4845,6 +4857,208 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + blk_finish_plug(&plug); + } + ++/****************************************************************************** ++ * state change ++ ******************************************************************************/ ++ ++static bool __maybe_unused state_is_valid(struct lruvec *lruvec) ++{ ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ if (lrugen->enabled) { ++ enum lru_list lru; ++ ++ for_each_evictable_lru(lru) { ++ if (!list_empty(&lruvec->lists[lru])) ++ return false; ++ } ++ } else { ++ int gen, type, zone; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ if (!list_empty(&lrugen->lists[gen][type][zone])) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static bool fill_evictable(struct lruvec *lruvec) ++{ ++ enum lru_list lru; ++ int remaining = MAX_LRU_BATCH; ++ ++ for_each_evictable_lru(lru) { ++ int type = is_file_lru(lru); ++ bool active = is_active_lru(lru); ++ struct list_head *head = &lruvec->lists[lru]; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); ++ VM_WARN_ON_ONCE_PAGE(PageActive(page) != active, page); ++ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page); ++ VM_WARN_ON_ONCE_PAGE(page_lru_gen(page) != -1, page); ++ ++ del_page_from_lru_list(page, lruvec); ++ success = lru_gen_add_page(lruvec, page, false); ++ VM_WARN_ON_ONCE(!success); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static bool drain_evictable(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ int remaining = MAX_LRU_BATCH; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); ++ VM_WARN_ON_ONCE_PAGE(PageActive(page), page); ++ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page); ++ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); ++ ++ success = lru_gen_del_page(lruvec, page, false); ++ VM_WARN_ON_ONCE(!success); ++ add_page_to_lru_list(page, lruvec); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static void lru_gen_change_state(bool enabled) ++{ ++ static DEFINE_MUTEX(state_mutex); ++ ++ struct mem_cgroup *memcg; ++ ++ cgroup_lock(); ++ cpus_read_lock(); ++ get_online_mems(); ++ mutex_lock(&state_mutex); ++ ++ if (enabled == lru_gen_enabled()) ++ goto unlock; ++ ++ if (enabled) ++ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); ++ else ++ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ if (!lruvec) ++ continue; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); ++ VM_WARN_ON_ONCE(!state_is_valid(lruvec)); ++ ++ lruvec->lrugen.enabled = enabled; ++ ++ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++unlock: ++ mutex_unlock(&state_mutex); ++ put_online_mems(); ++ cpus_read_unlock(); ++ cgroup_unlock(); ++} ++ ++/****************************************************************************** ++ * sysfs interface ++ ******************************************************************************/ ++ ++static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ unsigned int caps = 0; ++ ++ if (get_cap(LRU_GEN_CORE)) ++ caps |= BIT(LRU_GEN_CORE); ++ ++ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) ++ caps |= BIT(LRU_GEN_MM_WALK); ++ ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG)) ++ caps |= BIT(LRU_GEN_NONLEAF_YOUNG); ++ ++ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); ++} ++ ++static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ int i; ++ unsigned int caps; ++ ++ if (tolower(*buf) == 'n') ++ caps = 0; ++ else if (tolower(*buf) == 'y') ++ caps = -1; ++ else if (kstrtouint(buf, 0, &caps)) ++ return -EINVAL; ++ ++ for (i = 0; i < NR_LRU_GEN_CAPS; i++) { ++ bool enabled = caps & BIT(i); ++ ++ if (i == LRU_GEN_CORE) ++ lru_gen_change_state(enabled); ++ else if (enabled) ++ static_branch_enable(&lru_gen_caps[i]); ++ else ++ static_branch_disable(&lru_gen_caps[i]); ++ } ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_enabled_attr = __ATTR( ++ enabled, 0644, show_enabled, store_enabled ++); ++ ++static struct attribute *lru_gen_attrs[] = { ++ &lru_gen_enabled_attr.attr, ++ NULL ++}; ++ ++static struct attribute_group lru_gen_attr_group = { ++ .name = "lru_gen", ++ .attrs = lru_gen_attrs, ++}; ++ + /****************************************************************************** + * initialization + ******************************************************************************/ +@@ -4855,6 +5069,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; ++ lrugen->enabled = lru_gen_enabled(); + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); +@@ -4894,6 +5109,9 @@ static int __init init_lru_gen(void) + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + ++ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) ++ pr_err("lru_gen: failed to create sysfs group\n"); ++ + return 0; + }; + late_initcall(init_lru_gen); +-- +2.40.0 + -- cgit v1.2.3