diff options
Diffstat (limited to 'target/linux/generic/backport-5.15/020-v6.1-07-mm-multi-gen-LRU-exploit-locality-in-rmap.patch')
-rw-r--r-- | target/linux/generic/backport-5.15/020-v6.1-07-mm-multi-gen-LRU-exploit-locality-in-rmap.patch | 508 |
1 files changed, 508 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.15/020-v6.1-07-mm-multi-gen-LRU-exploit-locality-in-rmap.patch b/target/linux/generic/backport-5.15/020-v6.1-07-mm-multi-gen-LRU-exploit-locality-in-rmap.patch new file mode 100644 index 0000000000..e0c6380b5f --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-07-mm-multi-gen-LRU-exploit-locality-in-rmap.patch @@ -0,0 +1,508 @@ +From e4277535f6d6708bb19b88c4bad155832671d69b Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 18 Sep 2022 02:00:04 -0600 +Subject: [PATCH 07/29] mm: multi-gen LRU: exploit locality in rmap +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Searching the rmap for PTEs mapping each page on an LRU list (to test and +clear the accessed bit) can be expensive because pages from different VMAs +(PA space) are not cache friendly to the rmap (VA space). For workloads +mostly using mapped pages, searching the rmap can incur the highest CPU +cost in the reclaim path. + +This patch exploits spatial locality to reduce the trips into the rmap. +When shrink_page_list() walks the rmap and finds a young PTE, a new +function lru_gen_look_around() scans at most BITS_PER_LONG-1 adjacent +PTEs. On finding another young PTE, it clears the accessed bit and +updates the gen counter of the page mapped by this PTE to +(max_seq%MAX_NR_GENS)+1. + +Server benchmark results: + Single workload: + fio (buffered I/O): no change + + Single workload: + memcached (anon): +[3, 5]% + Ops/sec KB/sec + patch1-6: 1106168.46 43025.04 + patch1-7: 1147696.57 44640.29 + + Configurations: + no change + +Client benchmark results: + kswapd profiles: + patch1-6 + 39.03% lzo1x_1_do_compress (real work) + 18.47% page_vma_mapped_walk (overhead) + 6.74% _raw_spin_unlock_irq + 3.97% do_raw_spin_lock + 2.49% ptep_clear_flush + 2.48% anon_vma_interval_tree_iter_first + 1.92% page_referenced_one + 1.88% __zram_bvec_write + 1.48% memmove + 1.31% vma_interval_tree_iter_next + + patch1-7 + 48.16% lzo1x_1_do_compress (real work) + 8.20% page_vma_mapped_walk (overhead) + 7.06% _raw_spin_unlock_irq + 2.92% ptep_clear_flush + 2.53% __zram_bvec_write + 2.11% do_raw_spin_lock + 2.02% memmove + 1.93% lru_gen_look_around + 1.56% free_unref_page_list + 1.40% memset + + Configurations: + no change + +Link: https://lkml.kernel.org/r/20220918080010.2920238-8-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Acked-by: Barry Song <baohua@kernel.org> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> +Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/memcontrol.h | 31 +++++++ + include/linux/mmzone.h | 6 ++ + mm/internal.h | 1 + + mm/memcontrol.c | 1 + + mm/rmap.c | 7 ++ + mm/swap.c | 4 +- + mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++ + 7 files changed, 232 insertions(+), 2 deletions(-) + +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index 4f189b17dafc..8d6a0329bc59 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -442,6 +442,7 @@ static inline struct obj_cgroup *__page_objcg(struct page *page) + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference ++ * - mem_cgroup_trylock_pages() + * + * For a kmem page a caller should hold an rcu read lock to protect memcg + * associated with a kmem page from being released. +@@ -497,6 +498,7 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference ++ * - mem_cgroup_trylock_pages() + * + * For a kmem page a caller should hold an rcu read lock to protect memcg + * associated with a kmem page from being released. +@@ -953,6 +955,23 @@ void unlock_page_memcg(struct page *page); + + void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); + ++/* try to stablize page_memcg() for all the pages in a memcg */ ++static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) ++{ ++ rcu_read_lock(); ++ ++ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) ++ return true; ++ ++ rcu_read_unlock(); ++ return false; ++} ++ ++static inline void mem_cgroup_unlock_pages(void) ++{ ++ rcu_read_unlock(); ++} ++ + /* idx can be of type enum memcg_stat_item or node_stat_item */ + static inline void mod_memcg_state(struct mem_cgroup *memcg, + int idx, int val) +@@ -1369,6 +1388,18 @@ static inline void unlock_page_memcg(struct page *page) + { + } + ++static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) ++{ ++ /* to match page_memcg_rcu() */ ++ rcu_read_lock(); ++ return true; ++} ++ ++static inline void mem_cgroup_unlock_pages(void) ++{ ++ rcu_read_unlock(); ++} ++ + static inline void mem_cgroup_handle_over_high(void) + { + } +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index fce8945c507c..4db2b877fcf9 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -352,6 +352,7 @@ enum lruvec_flags { + #ifndef __GENERATING_BOUNDS_H + + struct lruvec; ++struct page_vma_mapped_walk; + + #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) + #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +@@ -407,6 +408,7 @@ struct lru_gen_struct { + }; + + void lru_gen_init_lruvec(struct lruvec *lruvec); ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + + #ifdef CONFIG_MEMCG + void lru_gen_init_memcg(struct mem_cgroup *memcg); +@@ -419,6 +421,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) + { + } + ++static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++} ++ + #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { +diff --git a/mm/internal.h b/mm/internal.h +index cf3cb933eba3..5c73246a092e 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -35,6 +35,7 @@ + void page_writeback_init(void); + + vm_fault_t do_swap_page(struct vm_fault *vmf); ++void activate_page(struct page *page); + + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 8b634dc72e7f..cc3431c5d9ba 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2798,6 +2798,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference ++ * - mem_cgroup_trylock_pages() + */ + page->memcg_data = (unsigned long)memcg; + } +diff --git a/mm/rmap.c b/mm/rmap.c +index 330b361a460e..22a86122732e 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -73,6 +73,7 @@ + #include <linux/page_idle.h> + #include <linux/memremap.h> + #include <linux/userfaultfd_k.h> ++#include <linux/mm_inline.h> + + #include <asm/tlbflush.h> + +@@ -793,6 +794,12 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, + } + + if (pvmw.pte) { ++ if (lru_gen_enabled() && pte_young(*pvmw.pte) && ++ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { ++ lru_gen_look_around(&pvmw); ++ referenced++; ++ } ++ + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + /* +diff --git a/mm/swap.c b/mm/swap.c +index 5d227577b609..966ff2d83343 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -325,7 +325,7 @@ static bool need_activate_page_drain(int cpu) + return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; + } + +-static void activate_page(struct page *page) ++void activate_page(struct page *page) + { + page = compound_head(page); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { +@@ -345,7 +345,7 @@ static inline void activate_page_drain(int cpu) + { + } + +-static void activate_page(struct page *page) ++void activate_page(struct page *page) + { + struct lruvec *lruvec; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 932abd24c1b3..1d0b25ae378c 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1409,6 +1409,11 @@ static unsigned int shrink_page_list(struct list_head *page_list, + if (!sc->may_unmap && page_mapped(page)) + goto keep_locked; + ++ /* page_update_gen() tried to promote this page? */ ++ if (lru_gen_enabled() && !ignore_references && ++ page_mapped(page) && PageReferenced(page)) ++ goto keep_locked; ++ + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + +@@ -2990,6 +2995,29 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) + * the aging + ******************************************************************************/ + ++/* promote pages accessed through page tables */ ++static int page_update_gen(struct page *page, int gen) ++{ ++ unsigned long new_flags, old_flags = READ_ONCE(page->flags); ++ ++ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); ++ VM_WARN_ON_ONCE(!rcu_read_lock_held()); ++ ++ do { ++ /* lru_gen_del_page() has isolated this page? */ ++ if (!(old_flags & LRU_GEN_MASK)) { ++ /* for shrink_page_list() */ ++ new_flags = old_flags | BIT(PG_referenced); ++ continue; ++ } ++ ++ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); ++ ++ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ + /* protect pages accessed multiple times through file descriptors */ + static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming) + { +@@ -3001,6 +3029,11 @@ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaimin + VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page); + + do { ++ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ /* page_update_gen() has promoted this page? */ ++ if (new_gen >= 0 && new_gen != old_gen) ++ return new_gen; ++ + new_gen = (old_gen + 1) % MAX_NR_GENS; + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); +@@ -3015,6 +3048,43 @@ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaimin + return new_gen; + } + ++static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) ++{ ++ unsigned long pfn = pte_pfn(pte); ++ ++ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); ++ ++ if (!pte_present(pte) || is_zero_pfn(pfn)) ++ return -1; ++ ++ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) ++ return -1; ++ ++ if (WARN_ON_ONCE(!pfn_valid(pfn))) ++ return -1; ++ ++ return pfn; ++} ++ ++static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg, ++ struct pglist_data *pgdat) ++{ ++ struct page *page; ++ ++ /* try to avoid unnecessary memory loads */ ++ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) ++ return NULL; ++ ++ page = compound_head(pfn_to_page(pfn)); ++ if (page_to_nid(page) != pgdat->node_id) ++ return NULL; ++ ++ if (page_memcg_rcu(page) != memcg) ++ return NULL; ++ ++ return page; ++} ++ + static void inc_min_seq(struct lruvec *lruvec, int type) + { + struct lru_gen_struct *lrugen = &lruvec->lrugen; +@@ -3214,6 +3284,114 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + } + ++/* ++ * This function exploits spatial locality when shrink_page_list() walks the ++ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. ++ */ ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++ int i; ++ pte_t *pte; ++ unsigned long start; ++ unsigned long end; ++ unsigned long addr; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; ++ struct page *page = pvmw->page; ++ struct mem_cgroup *memcg = page_memcg(page); ++ struct pglist_data *pgdat = page_pgdat(page); ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ DEFINE_MAX_SEQ(lruvec); ++ int old_gen, new_gen = lru_gen_from_seq(max_seq); ++ ++ lockdep_assert_held(pvmw->ptl); ++ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); ++ ++ if (spin_is_contended(pvmw->ptl)) ++ return; ++ ++ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); ++ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; ++ ++ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { ++ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) ++ end = start + MIN_LRU_BATCH * PAGE_SIZE; ++ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) ++ start = end - MIN_LRU_BATCH * PAGE_SIZE; ++ else { ++ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; ++ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; ++ } ++ } ++ ++ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; ++ ++ rcu_read_lock(); ++ arch_enter_lazy_mmu_mode(); ++ ++ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ unsigned long pfn; ++ ++ pfn = get_pte_pfn(pte[i], pvmw->vma, addr); ++ if (pfn == -1) ++ continue; ++ ++ if (!pte_young(pte[i])) ++ continue; ++ ++ page = get_pfn_page(pfn, memcg, pgdat); ++ if (!page) ++ continue; ++ ++ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) ++ VM_WARN_ON_ONCE(true); ++ ++ if (pte_dirty(pte[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && ++ !PageSwapCache(page))) ++ set_page_dirty(page); ++ ++ old_gen = page_lru_gen(page); ++ if (old_gen < 0) ++ SetPageReferenced(page); ++ else if (old_gen != new_gen) ++ __set_bit(i, bitmap); ++ } ++ ++ arch_leave_lazy_mmu_mode(); ++ rcu_read_unlock(); ++ ++ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { ++ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { ++ page = pte_page(pte[i]); ++ activate_page(page); ++ } ++ return; ++ } ++ ++ /* page_update_gen() requires stable page_memcg() */ ++ if (!mem_cgroup_trylock_pages(memcg)) ++ return; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); ++ ++ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { ++ page = compound_head(pte_page(pte[i])); ++ if (page_memcg_rcu(page) != memcg) ++ continue; ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen < 0 || old_gen == new_gen) ++ continue; ++ ++ lru_gen_update_size(lruvec, page, old_gen, new_gen); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ mem_cgroup_unlock_pages(); ++} ++ + /****************************************************************************** + * the eviction + ******************************************************************************/ +@@ -3250,6 +3428,12 @@ static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx) + return true; + } + ++ /* promoted */ ++ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { ++ list_move(&page->lru, &lrugen->lists[gen][type][zone]); ++ return true; ++ } ++ + /* protected */ + if (tier > tier_idx) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); +-- +2.40.0 + |