diff options
Diffstat (limited to 'target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch')
-rw-r--r-- | target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch | 996 |
1 files changed, 996 insertions, 0 deletions
diff --git a/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch b/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch new file mode 100644 index 0000000000..146f510d28 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch @@ -0,0 +1,996 @@ +From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Mon, 25 Jan 2021 21:12:33 -0700 +Subject: [PATCH 04/10] mm: multigenerational lru: groundwork + +For each lruvec, evictable pages are divided into multiple +generations. The youngest generation number is stored in +lrugen->max_seq for both anon and file types as they are aged on an +equal footing. The oldest generation numbers are stored in +lrugen->min_seq[] separately for anon and file types as clean file +pages can be evicted regardless of swap constraints. These three +variables are monotonically increasing. Generation numbers are +truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into +page->flags. The sliding window technique is used to prevent truncated +generation numbers from overlapping. Each truncated generation number +is an index to +lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. + +The framework comprises two conceptually independent components: the +aging, which produces young generations, and the eviction, which +consumes old generations. Both can be invoked independently from user +space for the purpose of working set estimation and proactive reclaim. + +The protection of hot pages and the selection of cold pages are based +on page access types and patterns. There are two access types: one via +page tables and the other via file descriptors. The protection of the +former type is by design stronger because: + 1) The uncertainty in determining the access patterns of the former + type is higher due to the coalesced nature of the accessed bit. + 2) The cost of evicting the former type is higher due to the TLB + flushes required and the likelihood of involving I/O. + 3) The penalty of under-protecting the former type is higher because + applications usually do not prepare themselves for major faults like + they do for blocked I/O. For example, client applications commonly + dedicate blocked I/O to separate threads to avoid UI janks that + negatively affect user experience. + +There are also two access patterns: one with temporal locality and the +other without. The latter pattern, e.g., random and sequential, needs +to be explicitly excluded to avoid weakening the protection of the +former pattern. Generally the former type follows the former pattern +unless MADV_SEQUENTIAL is specified and the latter type follows the +latter pattern unless outlying refaults have been observed. + +Upon faulting, a page is added to the youngest generation, which +provides the strongest protection as the eviction will not consider +this page before the aging has scanned it at least twice. The first +scan clears the accessed bit set during the initial fault. And the +second scan makes sure this page has not been used since the first +scan. A page from any other generations is brought back to the +youngest generation whenever the aging finds the accessed bit set on +any of the PTEs mapping this page. + +Unmapped pages are initially added to the oldest generation and then +conditionally protected by tiers. This is done later [PATCH 07/10]. + +Signed-off-by: Yu Zhao <yuzhao@google.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7 +--- + fs/fuse/dev.c | 3 +- + include/linux/cgroup.h | 15 +- + include/linux/mm.h | 36 ++++ + include/linux/mm_inline.h | 182 ++++++++++++++++++++ + include/linux/mmzone.h | 70 ++++++++ + include/linux/page-flags-layout.h | 19 ++- + include/linux/page-flags.h | 4 +- + include/linux/sched.h | 3 + + kernel/bounds.c | 3 + + kernel/cgroup/cgroup-internal.h | 1 - + mm/huge_memory.c | 3 +- + mm/memcontrol.c | 1 + + mm/memory.c | 7 + + mm/mm_init.c | 6 +- + mm/page_alloc.c | 1 + + mm/swap.c | 9 +- + mm/swapfile.c | 2 + + mm/vmscan.c | 268 ++++++++++++++++++++++++++++++ + 18 files changed, 618 insertions(+), 15 deletions(-) + +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -785,7 +785,8 @@ static int fuse_check_page(struct page * + 1 << PG_active | + 1 << PG_workingset | + 1 << PG_reclaim | +- 1 << PG_waiters))) { ++ 1 << PG_waiters | ++ LRU_GEN_MASK | LRU_REFS_MASK))) { + dump_page(page, "fuse: trying to steal weird page"); + return 1; + } +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr + css_put(&cgrp->self); + } + ++extern struct mutex cgroup_mutex; ++ ++static inline void cgroup_lock(void) ++{ ++ mutex_lock(&cgroup_mutex); ++} ++ ++static inline void cgroup_unlock(void) ++{ ++ mutex_unlock(&cgroup_mutex); ++} ++ + /** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for +@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr + * as locks used during the cgroup_subsys::attach() methods. + */ + #ifdef CONFIG_PROVE_RCU +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + #define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ +@@ -707,6 +718,8 @@ struct cgroup; + static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } + static inline void css_get(struct cgroup_subsys_state *css) {} + static inline void css_put(struct cgroup_subsys_state *css) {} ++static inline void cgroup_lock(void) {} ++static inline void cgroup_unlock(void) {} + static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) { return 0; } + static inline int cgroupstats_build(struct cgroupstats *stats, +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v + #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) + #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) ++#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) ++#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) + + /* + * Define the bit shifts to access each section. For non-existent +@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s + loff_t const holebegin, loff_t const holelen, int even_cows) { } + #endif + ++#ifdef CONFIG_LRU_GEN ++static inline void task_enter_nonseq_fault(void) ++{ ++ WARN_ON(current->in_nonseq_fault); ++ ++ current->in_nonseq_fault = 1; ++} ++ ++static inline void task_exit_nonseq_fault(void) ++{ ++ WARN_ON(!current->in_nonseq_fault); ++ ++ current->in_nonseq_fault = 0; ++} ++ ++static inline bool task_in_nonseq_fault(void) ++{ ++ return current->in_nonseq_fault; ++} ++#else ++static inline void task_enter_nonseq_fault(void) ++{ ++} ++ ++static inline void task_exit_nonseq_fault(void) ++{ ++} ++ ++static inline bool task_in_nonseq_fault(void) ++{ ++ return false; ++} ++#endif /* CONFIG_LRU_GEN */ ++ + static inline void unmap_shared_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) + { +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag + return lru; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static inline bool lru_gen_enabled(void) ++{ ++#ifdef CONFIG_LRU_GEN_ENABLED ++ DECLARE_STATIC_KEY_TRUE(lru_gen_static_key); ++ ++ return static_branch_likely(&lru_gen_static_key); ++#else ++ DECLARE_STATIC_KEY_FALSE(lru_gen_static_key); ++ ++ return static_branch_unlikely(&lru_gen_static_key); ++#endif ++} ++ ++/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */ ++static inline int lru_gen_from_seq(unsigned long seq) ++{ ++ return seq % MAX_NR_GENS; ++} ++ ++/* The youngest and the second youngest generations are counted as active. */ ++static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) ++{ ++ unsigned long max_seq = lruvec->evictable.max_seq; ++ ++ VM_BUG_ON(gen >= MAX_NR_GENS); ++ ++ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); ++} ++ ++/* Update the sizes of the multigenerational lru lists. */ ++static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec, ++ int old_gen, int new_gen) ++{ ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ enum lru_list lru = type * LRU_FILE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ lockdep_assert_held(&lruvec->lru_lock); ++ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); ++ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); ++ VM_BUG_ON(old_gen == -1 && new_gen == -1); ++ ++ if (old_gen >= 0) ++ WRITE_ONCE(lrugen->sizes[old_gen][type][zone], ++ lrugen->sizes[old_gen][type][zone] - delta); ++ if (new_gen >= 0) ++ WRITE_ONCE(lrugen->sizes[new_gen][type][zone], ++ lrugen->sizes[new_gen][type][zone] + delta); ++ ++ if (old_gen < 0) { ++ if (lru_gen_is_active(lruvec, new_gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, delta); ++ return; ++ } ++ ++ if (new_gen < 0) { ++ if (lru_gen_is_active(lruvec, old_gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, -delta); ++ return; ++ } ++ ++ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { ++ update_lru_size(lruvec, lru, zone, -delta); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); ++ } ++ ++ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); ++} ++ ++/* Add a page to one of the multigenerational lru lists. Return true on success. */ ++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int gen; ++ unsigned long old_flags, new_flags; ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ if (PageUnevictable(page) || !lrugen->enabled[type]) ++ return false; ++ /* ++ * If a page shouldn't be considered for eviction, i.e., a page mapped ++ * upon fault during which the accessed bit is set, add it to the ++ * youngest generation. ++ * ++ * If a page can't be evicted immediately, i.e., an anon page not in ++ * swap cache or a dirty page pending writeback, add it to the second ++ * oldest generation. ++ * ++ * If a page could be evicted immediately, e.g., a clean page, add it to ++ * the oldest generation. ++ */ ++ if (PageActive(page)) ++ gen = lru_gen_from_seq(lrugen->max_seq); ++ else if ((!type && !PageSwapCache(page)) || ++ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page)))) ++ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); ++ else ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page); ++ ++ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active)); ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, -1, gen); ++ /* for rotate_reclaimable_page() */ ++ if (reclaiming) ++ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); ++ else ++ list_add(&page->lru, &lrugen->lists[gen][type][zone]); ++ ++ return true; ++} ++ ++/* Delete a page from one of the multigenerational lru lists. Return true on success. */ ++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int gen; ++ unsigned long old_flags, new_flags; ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ if (!(new_flags & LRU_GEN_MASK)) ++ return false; ++ ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ ++ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ ++ new_flags &= ~LRU_GEN_MASK; ++ /* for shrink_page_list() */ ++ if (reclaiming) ++ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); ++ else if (lru_gen_is_active(lruvec, gen)) ++ new_flags |= BIT(PG_active); ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, gen, -1); ++ list_del(&page->lru); ++ ++ return true; ++} ++ ++#else ++ ++static inline bool lru_gen_enabled(void) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static __always_inline void add_page_to_lru_list(struct page *page, + struct lruvec *lruvec) + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(page, lruvec, false)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add(&page->lru, &lruvec->lists[lru]); + } +@@ -93,6 +269,9 @@ static __always_inline void add_page_to_ + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(page, lruvec, true)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add_tail(&page->lru, &lruvec->lists[lru]); + } +@@ -100,6 +279,9 @@ static __always_inline void add_page_to_ + static __always_inline void del_page_from_lru_list(struct page *page, + struct lruvec *lruvec) + { ++ if (lru_gen_del_page(page, lruvec, false)) ++ return; ++ + list_del(&page->lru); + update_lru_size(lruvec, page_lru(page), page_zonenum(page), + -thp_nr_pages(page)); +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -294,6 +294,72 @@ enum lruvec_flags { + */ + }; + ++struct lruvec; ++ ++#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) ++#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) ++ ++#ifdef CONFIG_LRU_GEN ++ ++/* ++ * For each lruvec, evictable pages are divided into multiple generations. The ++ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are ++ * monotonically increasing. The sliding window technique is used to track at ++ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the ++ * window, AKA gen, indexes an array of per-type and per-zone lists for the ++ * corresponding generation. The counter in page->flags stores gen+1 while a ++ * page is on one of the multigenerational lru lists. Otherwise, it stores 0. ++ * ++ * After a page is faulted in, the aging must check the accessed bit at least ++ * twice before the eviction would consider it. The first check clears the ++ * accessed bit set during the initial fault. The second check makes sure this ++ * page hasn't been used since then. ++ */ ++#define MIN_NR_GENS 2 ++#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) ++ ++struct lrugen { ++ /* the aging increments the max generation number */ ++ unsigned long max_seq; ++ /* the eviction increments the min generation numbers */ ++ unsigned long min_seq[ANON_AND_FILE]; ++ /* the birth time of each generation in jiffies */ ++ unsigned long timestamps[MAX_NR_GENS]; ++ /* the multigenerational lru lists */ ++ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the sizes of the multigenerational lru lists in pages */ ++ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* whether the multigenerational lru is enabled */ ++ bool enabled[ANON_AND_FILE]; ++}; ++ ++#define MAX_BATCH_SIZE 8192 ++ ++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); ++void lru_gen_change_state(bool enable, bool main, bool swap); ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg); ++#endif ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) ++{ ++} ++ ++static inline void lru_gen_change_state(bool enable, bool main, bool swap) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++} ++#endif ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct lruvec { + struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ +@@ -311,6 +377,10 @@ struct lruvec { + unsigned long refaults[ANON_AND_FILE]; + /* Various lruvec state flags (enum lruvec_flags) */ + unsigned long flags; ++#ifdef CONFIG_LRU_GEN ++ /* unevictable pages are on LRU_UNEVICTABLE */ ++ struct lrugen evictable; ++#endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; + #endif +--- a/include/linux/page-flags-layout.h ++++ b/include/linux/page-flags-layout.h +@@ -26,6 +26,14 @@ + + #define ZONES_WIDTH ZONES_SHIFT + ++#ifdef CONFIG_LRU_GEN ++/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */ ++#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2) ++#else ++#define LRU_GEN_WIDTH 0 ++#define LRU_REFS_WIDTH 0 ++#endif /* CONFIG_LRU_GEN */ ++ + #ifdef CONFIG_SPARSEMEM + #include <asm/sparsemem.h> + #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) +@@ -55,7 +63,8 @@ + #define SECTIONS_WIDTH 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ ++ <= BITS_PER_LONG - NR_PAGEFLAGS + #define NODES_WIDTH NODES_SHIFT + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) + #error "Vmemmap: No space for nodes field in page flags" +@@ -89,8 +98,8 @@ + #define LAST_CPUPID_SHIFT 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ +- <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS + #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT + #else + #define LAST_CPUPID_WIDTH 0 +@@ -100,8 +109,8 @@ + #define LAST_CPUPID_NOT_IN_PAGE_FLAGS + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ +- > BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS + #error "Not enough bits in page flags" + #endif + +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall + 1UL << PG_private | 1UL << PG_private_2 | \ + 1UL << PG_writeback | 1UL << PG_reserved | \ + 1UL << PG_slab | 1UL << PG_active | \ +- 1UL << PG_unevictable | __PG_MLOCKED) ++ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) + + /* + * Flags checked when a page is prepped for return by the page allocator. +@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall + * alloc-free cycle to prevent from reusing the page. + */ + #define PAGE_FLAGS_CHECK_AT_PREP \ +- (PAGEFLAGS_MASK & ~__PG_HWPOISON) ++ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) + + #define PAGE_FLAGS_PRIVATE \ + (1UL << PG_private | 1UL << PG_private_2) +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -911,6 +911,9 @@ struct task_struct { + #ifdef CONFIG_MEMCG + unsigned in_user_fault:1; + #endif ++#ifdef CONFIG_LRU_GEN ++ unsigned in_nonseq_fault:1; ++#endif + #ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; + #endif +--- a/kernel/bounds.c ++++ b/kernel/bounds.c +@@ -22,6 +22,9 @@ int main(void) + DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + #endif + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); ++#ifdef CONFIG_LRU_GEN ++ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1)); ++#endif + /* End of constants */ + + return 0; +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -165,7 +165,6 @@ struct cgroup_mgctx { + #define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + extern struct cgroup_subsys *cgroup_subsys[]; + extern struct list_head cgroup_roots; +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc + #ifdef CONFIG_64BIT + (1L << PG_arch_2) | + #endif +- (1L << PG_dirty))); ++ (1L << PG_dirty) | ++ LRU_GEN_MASK | LRU_REFS_MASK)); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5226,6 +5226,7 @@ static struct mem_cgroup *mem_cgroup_all + memcg->deferred_split_queue.split_queue_len = 0; + #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); ++ lru_gen_init_memcg(memcg); + return memcg; + fail: + mem_cgroup_id_remove(memcg); +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are + unsigned int flags, struct pt_regs *regs) + { + vm_fault_t ret; ++ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ); + + __set_current_state(TASK_RUNNING); + +@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are + if (flags & FAULT_FLAG_USER) + mem_cgroup_enter_user_fault(); + ++ if (nonseq_fault) ++ task_enter_nonseq_fault(); ++ + if (unlikely(is_vm_hugetlb_page(vma))) + ret = hugetlb_fault(vma->vm_mm, vma, address, flags); + else + ret = __handle_mm_fault(vma, address, flags); + ++ if (nonseq_fault) ++ task_exit_nonseq_fault(); ++ + if (flags & FAULT_FLAG_USER) { + mem_cgroup_exit_user_fault(); + /* +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH +- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; ++ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", +- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", ++ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, ++ LRU_GEN_WIDTH, ++ LRU_REFS_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -7411,6 +7411,7 @@ static void __meminit pgdat_init_interna + + pgdat_page_ext_init(pgdat); + lruvec_init(&pgdat->__lruvec); ++ lru_gen_init_state(NULL, &pgdat->__lruvec); + } + + static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page) + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + ++ /* see the comment in lru_gen_add_page() */ ++ if (lru_gen_enabled() && !PageUnevictable(page) && ++ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC)) ++ SetPageActive(page); ++ + get_page(page); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); +@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc + + static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) + { +- if (PageActive(page) && !PageUnevictable(page)) { ++ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + int nr_pages = thp_nr_pages(page); + + del_page_from_lru_list(page, lruvec); +@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p + */ + void deactivate_page(struct page *page) + { +- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { ++ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + struct pagevec *pvec; + + local_lock(&lru_pvecs.lock); +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ lru_gen_change_state(false, false, true); + + out_dput: + filp_close(victim, NULL); +@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __use + mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ lru_gen_change_state(true, false, true); + + error = 0; + goto out; +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -50,6 +50,7 @@ + #include <linux/printk.h> + #include <linux/dax.h> + #include <linux/psi.h> ++#include <linux/memory.h> + + #include <asm/tlbflush.h> + #include <asm/div64.h> +@@ -2880,6 +2881,273 @@ static bool can_age_anon_pages(struct pg + return can_demote(pgdat->node_id, sc); + } + ++#ifdef CONFIG_LRU_GEN ++ ++/****************************************************************************** ++ * shorthand helpers ++ ******************************************************************************/ ++ ++#define for_each_gen_type_zone(gen, type, zone) \ ++ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ ++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ ++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) ++ ++static int page_lru_gen(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) ++{ ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) { ++ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; ++ ++ if (lruvec->pgdat != pgdat) ++ lruvec->pgdat = pgdat; ++ ++ return lruvec; ++ } ++#endif ++ return pgdat ? &pgdat->__lruvec : NULL; ++} ++ ++static int get_nr_gens(struct lruvec *lruvec, int type) ++{ ++ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; ++} ++ ++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) ++{ ++ return get_nr_gens(lruvec, 1) >= MIN_NR_GENS && ++ get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) && ++ get_nr_gens(lruvec, 0) <= MAX_NR_GENS; ++} ++ ++/****************************************************************************** ++ * state change ++ ******************************************************************************/ ++ ++#ifdef CONFIG_LRU_GEN_ENABLED ++DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); ++#else ++DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); ++#endif ++ ++static int lru_gen_nr_swapfiles; ++ ++static bool __maybe_unused state_is_valid(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ enum lru_list lru; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ for_each_evictable_lru(lru) { ++ type = is_file_lru(lru); ++ ++ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) ++ return false; ++ } ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) ++ return false; ++ ++ /* unlikely but not a bug when reset_batch_size() is pending */ ++ VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); ++ } ++ ++ return true; ++} ++ ++static bool fill_lists(struct lruvec *lruvec) ++{ ++ enum lru_list lru; ++ int remaining = MAX_BATCH_SIZE; ++ ++ for_each_evictable_lru(lru) { ++ int type = is_file_lru(lru); ++ bool active = is_active_lru(lru); ++ struct list_head *head = &lruvec->lists[lru]; ++ ++ if (!lruvec->evictable.enabled[type]) ++ continue; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page) != active, page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ del_page_from_lru_list(page, lruvec); ++ success = lru_gen_add_page(page, lruvec, false); ++ VM_BUG_ON(!success); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static bool drain_lists(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ int remaining = MAX_BATCH_SIZE; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; ++ ++ if (lruvec->evictable.enabled[type]) ++ continue; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ success = lru_gen_del_page(page, lruvec, false); ++ VM_BUG_ON(!success); ++ add_page_to_lru_list(page, lruvec); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++/* ++ * For file page tracking, we enable/disable it according to the main switch. ++ * For anon page tracking, we only enabled it when the main switch is on and ++ * there is at least one swapfile; we disable it when there are no swapfiles ++ * regardless of the value of the main switch. Otherwise, we will eventually ++ * reach the max size of the sliding window and have to call inc_min_seq(). ++ */ ++void lru_gen_change_state(bool enable, bool main, bool swap) ++{ ++ static DEFINE_MUTEX(state_mutex); ++ ++ struct mem_cgroup *memcg; ++ ++ mem_hotplug_begin(); ++ cgroup_lock(); ++ mutex_lock(&state_mutex); ++ ++ if (swap) { ++ if (enable) ++ swap = !lru_gen_nr_swapfiles++; ++ else ++ swap = !--lru_gen_nr_swapfiles; ++ } ++ ++ if (main && enable != lru_gen_enabled()) { ++ if (enable) ++ static_branch_enable(&lru_gen_static_key); ++ else ++ static_branch_disable(&lru_gen_static_key); ++ } else if (!swap || !lru_gen_enabled()) ++ goto unlock; ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ if (!lruvec) ++ continue; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ VM_BUG_ON(!state_is_valid(lruvec)); ++ ++ lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lruvec->evictable.enabled[1] = lru_gen_enabled(); ++ ++ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++unlock: ++ mutex_unlock(&state_mutex); ++ cgroup_unlock(); ++ mem_hotplug_done(); ++} ++ ++/****************************************************************************** ++ * initialization ++ ******************************************************************************/ ++ ++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) ++{ ++ int i; ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ lrugen->max_seq = MIN_NR_GENS + 1; ++ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lrugen->enabled[1] = lru_gen_enabled(); ++ ++ for (i = 0; i <= MIN_NR_GENS + 1; i++) ++ lrugen->timestamps[i] = jiffies; ++ ++ for_each_gen_type_zone(gen, type, zone) ++ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++} ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ lru_gen_init_state(memcg, lruvec); ++ } ++} ++#endif ++ ++static int __init init_lru_gen(void) ++{ ++ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); ++ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ ++ return 0; ++}; ++late_initcall(init_lru_gen); ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + unsigned long nr[NR_LRU_LISTS]; |