Diffstat (limited to 'target/linux/generic/backport-5.15/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch')
-rw-r--r-- | target/linux/generic/backport-5.15/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch | 1466 |
1 files changed, 1466 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.15/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch b/target/linux/generic/backport-5.15/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch new file mode 100644 index 0000000000..1e310ae211 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch @@ -0,0 +1,1466 @@ +From b564b9471cd60ef1ee3961a224898ce4a9620d84 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Sun, 18 Sep 2022 02:00:03 -0600 +Subject: [PATCH 06/29] mm: multi-gen LRU: minimal implementation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +To avoid confusion, the terms "promotion" and "demotion" will be applied +to the multi-gen LRU, as a new convention; the terms "activation" and +"deactivation" will be applied to the active/inactive LRU, as usual. + +The aging produces young generations. Given an lruvec, it increments +max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS. The aging promotes +hot pages to the youngest generation when it finds them accessed through +page tables; the demotion of cold pages happens consequently when it +increments max_seq. Promotion in the aging path does not involve any LRU +list operations, only the updates of the gen counter and +lrugen->nr_pages[]; demotion, unless as the result of the increment of +max_seq, requires LRU list operations, e.g., lru_deactivate_fn(). The +aging has the complexity O(nr_hot_pages), since it is only interested in +hot pages. + +The eviction consumes old generations. Given an lruvec, it increments +min_seq when lrugen->lists[] indexed by min_seq%MAX_NR_GENS becomes empty. +A feedback loop modeled after the PID controller monitors refaults over +anon and file types and decides which type to evict when both types are +available from the same generation. + +The protection of pages accessed multiple times through file descriptors +takes place in the eviction path. Each generation is divided into +multiple tiers. A page accessed N times through file descriptors is in +tier order_base_2(N). Tiers do not have dedicated lrugen->lists[], only +bits in page->flags. The aforementioned feedback loop also monitors +refaults over all tiers and decides when to protect pages in which tiers +(N>1), using the first tier (N=0,1) as a baseline. The first tier +contains single-use unmapped clean pages, which are most likely the best +choices. In contrast to promotion in the aging path, the protection of a +page in the eviction path is achieved by moving this page to the next +generation, i.e., min_seq+1, if the feedback loop decides so. This +approach has the following advantages: + +1. It removes the cost of activation in the buffered access path by + inferring whether pages accessed multiple times through file + descriptors are statistically hot and thus worth protecting in the + eviction path. +2. It takes pages accessed through page tables into account and avoids + overprotecting pages accessed multiple times through file + descriptors. (Pages accessed through page tables are in the first + tier, since N=0.) +3. More tiers provide better protection for pages accessed more than + twice through file descriptors, when under heavy buffered I/O + workloads. 
+ +Server benchmark results: + Single workload: + fio (buffered I/O): +[30, 32]% + IOPS BW + 5.19-rc1: 2673k 10.2GiB/s + patch1-6: 3491k 13.3GiB/s + + Single workload: + memcached (anon): -[4, 6]% + Ops/sec KB/sec + 5.19-rc1: 1161501.04 45177.25 + patch1-6: 1106168.46 43025.04 + + Configurations: + CPU: two Xeon 6154 + Mem: total 256G + + Node 1 was only used as a ram disk to reduce the variance in the + results. + + patch drivers/block/brd.c <<EOF + 99,100c99,100 + < gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM; + < page = alloc_page(gfp_flags); + --- + > gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE; + > page = alloc_pages_node(1, gfp_flags, 0); + EOF + + cat >>/etc/systemd/system.conf <<EOF + CPUAffinity=numa + NUMAPolicy=bind + NUMAMask=0 + EOF + + cat >>/etc/memcached.conf <<EOF + -m 184320 + -s /var/run/memcached/memcached.sock + -a 0766 + -t 36 + -B binary + EOF + + cat fio.sh + modprobe brd rd_nr=1 rd_size=113246208 + swapoff -a + mkfs.ext4 /dev/ram0 + mount -t ext4 /dev/ram0 /mnt + + mkdir /sys/fs/cgroup/user.slice/test + echo 38654705664 >/sys/fs/cgroup/user.slice/test/memory.max + echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs + fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \ + --buffered=1 --ioengine=io_uring --iodepth=128 \ + --iodepth_batch_submit=32 --iodepth_batch_complete=32 \ + --rw=randread --random_distribution=random --norandommap \ + --time_based --ramp_time=10m --runtime=5m --group_reporting + + cat memcached.sh + modprobe brd rd_nr=1 rd_size=113246208 + swapoff -a + mkswap /dev/ram0 + swapon /dev/ram0 + + memtier_benchmark -S /var/run/memcached/memcached.sock \ + -P memcache_binary -n allkeys --key-minimum=1 \ + --key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \ + --ratio 1:0 --pipeline 8 -d 2000 + + memtier_benchmark -S /var/run/memcached/memcached.sock \ + -P memcache_binary -n allkeys --key-minimum=1 \ + --key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \ + --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed + +Client benchmark results: + kswapd profiles: + 5.19-rc1 + 40.33% page_vma_mapped_walk (overhead) + 21.80% lzo1x_1_do_compress (real work) + 7.53% do_raw_spin_lock + 3.95% _raw_spin_unlock_irq + 2.52% vma_interval_tree_iter_next + 2.37% page_referenced_one + 2.28% vma_interval_tree_subtree_search + 1.97% anon_vma_interval_tree_iter_first + 1.60% ptep_clear_flush + 1.06% __zram_bvec_write + + patch1-6 + 39.03% lzo1x_1_do_compress (real work) + 18.47% page_vma_mapped_walk (overhead) + 6.74% _raw_spin_unlock_irq + 3.97% do_raw_spin_lock + 2.49% ptep_clear_flush + 2.48% anon_vma_interval_tree_iter_first + 1.92% page_referenced_one + 1.88% __zram_bvec_write + 1.48% memmove + 1.31% vma_interval_tree_iter_next + + Configurations: + CPU: single Snapdragon 7c + Mem: total 4G + + ChromeOS MemoryPressure [1] + +[1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/ + +Link: https://lkml.kernel.org/r/20220918080010.2920238-7-yuzhao@google.com +Signed-off-by: Yu Zhao <yuzhao@google.com> +Acked-by: Brian Geffon <bgeffon@google.com> +Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> +Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Acked-by: Steven Barrett <steven@liquorix.net> +Acked-by: Suleiman Souhlal <suleiman@google.com> +Tested-by: Daniel Byrne <djbyrne@mtu.edu> +Tested-by: Donald Carr <d@chaos-reins.com> +Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> 
+Tested-by: Sofia Trinh <sofia.trinh@edi.works> +Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> +Cc: Barry Song <baohua@kernel.org> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Hillf Danton <hdanton@sina.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Johannes Weiner <hannes@cmpxchg.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Miaohe Lin <linmiaohe@huawei.com> +Cc: Michael Larabel <Michael@MichaelLarabel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Mike Rapoport <rppt@kernel.org> +Cc: Mike Rapoport <rppt@linux.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Qi Zheng <zhengqi.arch@bytedance.com> +Cc: Tejun Heo <tj@kernel.org> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Will Deacon <will@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + include/linux/mm_inline.h | 36 ++ + include/linux/mmzone.h | 41 ++ + include/linux/page-flags-layout.h | 5 +- + kernel/bounds.c | 2 + + mm/Kconfig | 11 + + mm/swap.c | 39 ++ + mm/vmscan.c | 792 +++++++++++++++++++++++++++++- + mm/workingset.c | 110 ++++- + 8 files changed, 1025 insertions(+), 11 deletions(-) + +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index 65320d2b8f60..58aabb1ba020 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -106,6 +106,33 @@ static inline int lru_gen_from_seq(unsigned long seq) + return seq % MAX_NR_GENS; + } + ++static inline int lru_hist_from_seq(unsigned long seq) ++{ ++ return seq % NR_HIST_GENS; ++} ++ ++static inline int lru_tier_from_refs(int refs) ++{ ++ VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); ++ ++ /* see the comment in page_lru_refs() */ ++ return order_base_2(refs + 1); ++} ++ ++static inline int page_lru_refs(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ bool workingset = flags & BIT(PG_workingset); ++ ++ /* ++ * Return the number of accesses beyond PG_referenced, i.e., N-1 if the ++ * total number of accesses is N>1, since N=0,1 both map to the first ++ * tier. lru_tier_from_refs() will account for this off-by-one. Also see ++ * the comment on MAX_NR_TIERS. ++ */ ++ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; ++} ++ + static inline int page_lru_gen(struct page *page) + { + unsigned long flags = READ_ONCE(page->flags); +@@ -158,6 +185,15 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page, + __update_lru_size(lruvec, lru, zone, -delta); + return; + } ++ ++ /* promotion */ ++ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { ++ __update_lru_size(lruvec, lru, zone, -delta); ++ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); ++ } ++ ++ /* demotion requires isolation, e.g., lru_deactivate_fn() */ ++ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); + } + + static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming) +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 0c39f72184d0..fce8945c507c 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -327,6 +327,28 @@ enum lruvec_flags { + #define MIN_NR_GENS 2U + #define MAX_NR_GENS 4U + ++/* ++ * Each generation is divided into multiple tiers. 
A page accessed N times ++ * through file descriptors is in tier order_base_2(N). A page in the first tier ++ * (N=0,1) is marked by PG_referenced unless it was faulted in through page ++ * tables or read ahead. A page in any other tier (N>1) is marked by ++ * PG_referenced and PG_workingset. This implies a minimum of two tiers is ++ * supported without using additional bits in page->flags. ++ * ++ * In contrast to moving across generations which requires the LRU lock, moving ++ * across tiers only involves atomic operations on page->flags and therefore ++ * has a negligible cost in the buffered access path. In the eviction path, ++ * comparisons of refaulted/(evicted+protected) from the first tier and the ++ * rest infer whether pages accessed multiple times through file descriptors ++ * are statistically hot and thus worth protecting. ++ * ++ * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the ++ * number of categories of the active/inactive LRU when keeping track of ++ * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in ++ * page->flags. ++ */ ++#define MAX_NR_TIERS 4U ++ + #ifndef __GENERATING_BOUNDS_H + + struct lruvec; +@@ -341,6 +363,16 @@ enum { + LRU_GEN_FILE, + }; + ++#define MIN_LRU_BATCH BITS_PER_LONG ++#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) ++ ++/* whether to keep historical stats from evicted generations */ ++#ifdef CONFIG_LRU_GEN_STATS ++#define NR_HIST_GENS MAX_NR_GENS ++#else ++#define NR_HIST_GENS 1U ++#endif ++ + /* + * The youngest generation number is stored in max_seq for both anon and file + * types as they are aged on an equal footing. The oldest generation numbers are +@@ -363,6 +395,15 @@ struct lru_gen_struct { + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the multi-gen LRU sizes, eventually consistent */ + long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the exponential moving average of refaulted */ ++ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the exponential moving average of evicted+protected */ ++ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the first tier doesn't need protection, hence the minus one */ ++ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; ++ /* can be modified without holding the LRU lock */ ++ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + }; + + void lru_gen_init_lruvec(struct lruvec *lruvec); +diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h +index 240905407a18..7d79818dc065 100644 +--- a/include/linux/page-flags-layout.h ++++ b/include/linux/page-flags-layout.h +@@ -106,7 +106,10 @@ + #error "Not enough bits in page flags" + #endif + +-#define LRU_REFS_WIDTH 0 ++/* see the comment on MAX_NR_TIERS */ ++#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \ ++ ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \ ++ NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH) + + #endif + #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ +diff --git a/kernel/bounds.c b/kernel/bounds.c +index 5ee60777d8e4..b529182e8b04 100644 +--- a/kernel/bounds.c ++++ b/kernel/bounds.c +@@ -24,8 +24,10 @@ int main(void) + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); + #ifdef CONFIG_LRU_GEN + DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1)); ++ DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2); + #else + DEFINE(LRU_GEN_WIDTH, 0); ++ DEFINE(__LRU_REFS_WIDTH, 0); + #endif + /* End of constants 
*/ + +diff --git a/mm/Kconfig b/mm/Kconfig +index 0eeb27397884..62433f3cd7ae 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -897,6 +897,7 @@ config IO_MAPPING + config SECRETMEM + def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + ++# multi-gen LRU { + config LRU_GEN + bool "Multi-Gen LRU" + depends on MMU +@@ -905,6 +906,16 @@ config LRU_GEN + help + A high performance LRU implementation to overcommit memory. + ++config LRU_GEN_STATS ++ bool "Full stats for debugging" ++ depends on LRU_GEN ++ help ++ Do not enable this option unless you plan to look at historical stats ++ from evicted generations for debugging purpose. ++ ++ This option has a per-memcg and per-node memory overhead. ++# } ++ + source "mm/damon/Kconfig" + + endmenu +diff --git a/mm/swap.c b/mm/swap.c +index 0bdc96661fb6..5d227577b609 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -389,6 +389,40 @@ static void __lru_cache_activate_page(struct page *page) + local_unlock(&lru_pvecs.lock); + } + ++#ifdef CONFIG_LRU_GEN ++static void page_inc_refs(struct page *page) ++{ ++ unsigned long new_flags, old_flags = READ_ONCE(page->flags); ++ ++ if (PageUnevictable(page)) ++ return; ++ ++ if (!PageReferenced(page)) { ++ SetPageReferenced(page); ++ return; ++ } ++ ++ if (!PageWorkingset(page)) { ++ SetPageWorkingset(page); ++ return; ++ } ++ ++ /* see the comment on MAX_NR_TIERS */ ++ do { ++ new_flags = old_flags & LRU_REFS_MASK; ++ if (new_flags == LRU_REFS_MASK) ++ break; ++ ++ new_flags += BIT(LRU_REFS_PGOFF); ++ new_flags |= old_flags & ~LRU_REFS_MASK; ++ } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); ++} ++#else ++static void page_inc_refs(struct page *page) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + /* + * Mark a page as having seen activity. + * +@@ -403,6 +437,11 @@ void mark_page_accessed(struct page *page) + { + page = compound_head(page); + ++ if (lru_gen_enabled()) { ++ page_inc_refs(page); ++ return; ++ } ++ + if (!PageReferenced(page)) { + SetPageReferenced(page); + } else if (PageUnevictable(page)) { +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 41826fe17eb3..932abd24c1b3 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1142,9 +1142,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; +- mem_cgroup_swapout(page, swap); ++ ++ /* get a shadow entry before mem_cgroup_swapout() clears page_memcg() */ + if (reclaimed && !mapping_exiting(mapping)) + shadow = workingset_eviction(page, target_memcg); ++ mem_cgroup_swapout(page, swap); + __delete_from_swap_cache(page, swap, shadow); + xa_unlock_irq(&mapping->i_pages); + put_swap_page(page, swap); +@@ -2502,6 +2504,9 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) + unsigned long file; + struct lruvec *target_lruvec; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* +@@ -2827,6 +2832,17 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, + * shorthand helpers + ******************************************************************************/ + ++#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) ++ ++#define DEFINE_MAX_SEQ(lruvec) \ ++ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) ++ ++#define DEFINE_MIN_SEQ(lruvec) \ ++ unsigned long min_seq[ANON_AND_FILE] = { \ ++ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ ++ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ ++ } ++ + #define for_each_gen_type_zone(gen, type, zone) 
\ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ +@@ -2852,6 +2868,745 @@ static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int ni + return pgdat ? &pgdat->__lruvec : NULL; + } + ++static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ if (!can_demote(pgdat->node_id, sc) && ++ mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) ++ return 0; ++ ++ return mem_cgroup_swappiness(memcg); ++} ++ ++static int get_nr_gens(struct lruvec *lruvec, int type) ++{ ++ return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; ++} ++ ++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) ++{ ++ /* see the comment on lru_gen_struct */ ++ return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && ++ get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && ++ get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; ++} ++ ++/****************************************************************************** ++ * refault feedback loop ++ ******************************************************************************/ ++ ++/* ++ * A feedback loop based on Proportional-Integral-Derivative (PID) controller. ++ * ++ * The P term is refaulted/(evicted+protected) from a tier in the generation ++ * currently being evicted; the I term is the exponential moving average of the ++ * P term over the generations previously evicted, using the smoothing factor ++ * 1/2; the D term isn't supported. ++ * ++ * The setpoint (SP) is always the first tier of one type; the process variable ++ * (PV) is either any tier of the other type or any other tier of the same ++ * type. ++ * ++ * The error is the difference between the SP and the PV; the correction is to ++ * turn off protection when SP>PV or turn on protection when SP<PV. ++ * ++ * For future optimizations: ++ * 1. The D term may discount the other two terms over time so that long-lived ++ * generations can resist stale information. ++ */ ++struct ctrl_pos { ++ unsigned long refaulted; ++ unsigned long total; ++ int gain; ++}; ++ ++static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, ++ struct ctrl_pos *pos) ++{ ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ int hist = lru_hist_from_seq(lrugen->min_seq[type]); ++ ++ pos->refaulted = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ pos->total = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ pos->total += lrugen->protected[hist][type][tier - 1]; ++ pos->gain = gain; ++} ++ ++static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) ++{ ++ int hist, tier; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; ++ unsigned long seq = carryover ? 
lrugen->min_seq[type] : lrugen->max_seq + 1; ++ ++ lockdep_assert_held(&lruvec->lru_lock); ++ ++ if (!carryover && !clear) ++ return; ++ ++ hist = lru_hist_from_seq(seq); ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ if (carryover) { ++ unsigned long sum; ++ ++ sum = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); ++ ++ sum = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ sum += lrugen->protected[hist][type][tier - 1]; ++ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); ++ } ++ ++ if (clear) { ++ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); ++ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); ++ if (tier) ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); ++ } ++ } ++} ++ ++static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) ++{ ++ /* ++ * Return true if the PV has a limited number of refaults or a lower ++ * refaulted/total than the SP. ++ */ ++ return pv->refaulted < MIN_LRU_BATCH || ++ pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= ++ (sp->refaulted + 1) * pv->total * pv->gain; ++} ++ ++/****************************************************************************** ++ * the aging ++ ******************************************************************************/ ++ ++/* protect pages accessed multiple times through file descriptors */ ++static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming) ++{ ++ int type = page_is_file_lru(page); ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ unsigned long new_flags, old_flags = READ_ONCE(page->flags); ++ ++ VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page); ++ ++ do { ++ new_gen = (old_gen + 1) % MAX_NR_GENS; ++ ++ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); ++ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; ++ /* for end_page_writeback() */ ++ if (reclaiming) ++ new_flags |= BIT(PG_reclaim); ++ } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); ++ ++ lru_gen_update_size(lruvec, page, old_gen, new_gen); ++ ++ return new_gen; ++} ++ ++static void inc_min_seq(struct lruvec *lruvec, int type) ++{ ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ reset_ctrl_pos(lruvec, type, true); ++ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); ++} ++ ++static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) ++{ ++ int gen, type, zone; ++ bool success = false; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); ++ ++ /* find the oldest populated generation */ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { ++ gen = lru_gen_from_seq(min_seq[type]); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ if (!list_empty(&lrugen->lists[gen][type][zone])) ++ goto next; ++ } ++ ++ min_seq[type]++; ++ } ++next: ++ ; ++ } ++ ++ /* see the comment on lru_gen_struct */ ++ if (can_swap) { ++ min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); ++ min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); ++ } ++ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ if (min_seq[type] == lrugen->min_seq[type]) ++ continue; ++ ++ reset_ctrl_pos(lruvec, type, true); ++ 
WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); ++ success = true; ++ } ++ ++ return success; ++} ++ ++static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap) ++{ ++ int prev, next; ++ int type, zone; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); ++ ++ if (max_seq != lrugen->max_seq) ++ goto unlock; ++ ++ for (type = ANON_AND_FILE - 1; type >= 0; type--) { ++ if (get_nr_gens(lruvec, type) != MAX_NR_GENS) ++ continue; ++ ++ VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap); ++ ++ inc_min_seq(lruvec, type); ++ } ++ ++ /* ++ * Update the active/inactive LRU sizes for compatibility. Both sides of ++ * the current max_seq need to be covered, since max_seq+1 can overlap ++ * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do ++ * overlap, cold/hot inversion happens. ++ */ ++ prev = lru_gen_from_seq(lrugen->max_seq - 1); ++ next = lru_gen_from_seq(lrugen->max_seq + 1); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ enum lru_list lru = type * LRU_INACTIVE_FILE; ++ long delta = lrugen->nr_pages[prev][type][zone] - ++ lrugen->nr_pages[next][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ __update_lru_size(lruvec, lru, zone, delta); ++ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); ++ } ++ } ++ ++ for (type = 0; type < ANON_AND_FILE; type++) ++ reset_ctrl_pos(lruvec, type, false); ++ ++ /* make sure preceding modifications appear */ ++ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); ++unlock: ++ spin_unlock_irq(&lruvec->lru_lock); ++} ++ ++static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, ++ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) ++{ ++ int gen, type, zone; ++ unsigned long old = 0; ++ unsigned long young = 0; ++ unsigned long total = 0; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ unsigned long seq; ++ ++ for (seq = min_seq[type]; seq <= max_seq; seq++) { ++ unsigned long size = 0; ++ ++ gen = lru_gen_from_seq(seq); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); ++ ++ total += size; ++ if (seq == max_seq) ++ young += size; ++ else if (seq + MIN_NR_GENS == max_seq) ++ old += size; ++ } ++ } ++ ++ /* try to scrape all its memory if this memcg was deleted */ ++ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; ++ ++ /* ++ * The aging tries to be lazy to reduce the overhead, while the eviction ++ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the ++ * ideal number of generations is MIN_NR_GENS+1. ++ */ ++ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) ++ return true; ++ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) ++ return false; ++ ++ /* ++ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) ++ * of the total number of pages for each generation. A reasonable range ++ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The ++ * aging cares about the upper bound of hot pages, while the eviction ++ * cares about the lower bound of cold pages. 
++ */ ++ if (young * MIN_NR_GENS > total) ++ return true; ++ if (old * (MIN_NR_GENS + 2) < total) ++ return true; ++ ++ return false; ++} ++ ++static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ bool need_aging; ++ unsigned long nr_to_scan; ++ int swappiness = get_swappiness(lruvec, sc); ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_WARN_ON_ONCE(sc->memcg_low_reclaim); ++ ++ mem_cgroup_calculate_protection(NULL, memcg); ++ ++ if (mem_cgroup_below_min(memcg)) ++ return; ++ ++ need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); ++ if (need_aging) ++ inc_max_seq(lruvec, max_seq, swappiness); ++} ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ struct mem_cgroup *memcg; ++ ++ VM_WARN_ON_ONCE(!current_is_kswapd()); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ ++ age_lruvec(lruvec, sc); ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++} ++ ++/****************************************************************************** ++ * the eviction ++ ******************************************************************************/ ++ ++static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx) ++{ ++ bool success; ++ int gen = page_lru_gen(page); ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ int refs = page_lru_refs(page); ++ int tier = lru_tier_from_refs(refs); ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page); ++ ++ /* unevictable */ ++ if (!page_evictable(page)) { ++ success = lru_gen_del_page(lruvec, page, true); ++ VM_WARN_ON_ONCE_PAGE(!success, page); ++ SetPageUnevictable(page); ++ add_page_to_lru_list(page, lruvec); ++ __count_vm_events(UNEVICTABLE_PGCULLED, delta); ++ return true; ++ } ++ ++ /* dirty lazyfree */ ++ if (type == LRU_GEN_FILE && PageAnon(page) && PageDirty(page)) { ++ success = lru_gen_del_page(lruvec, page, true); ++ VM_WARN_ON_ONCE_PAGE(!success, page); ++ SetPageSwapBacked(page); ++ add_page_to_lru_list_tail(page, lruvec); ++ return true; ++ } ++ ++ /* protected */ ++ if (tier > tier_idx) { ++ int hist = lru_hist_from_seq(lrugen->min_seq[type]); ++ ++ gen = page_inc_gen(lruvec, page, false); ++ list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]); ++ ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], ++ lrugen->protected[hist][type][tier - 1] + delta); ++ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); ++ return true; ++ } ++ ++ /* waiting for writeback */ ++ if (PageLocked(page) || PageWriteback(page) || ++ (type == LRU_GEN_FILE && PageDirty(page))) { ++ gen = page_inc_gen(lruvec, page, true); ++ list_move(&page->lru, &lrugen->lists[gen][type][zone]); ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool isolate_page(struct lruvec *lruvec, struct page *page, struct scan_control *sc) ++{ ++ bool success; ++ ++ /* unmapping inhibited */ ++ if (!sc->may_unmap && page_mapped(page)) ++ return false; ++ ++ /* swapping inhibited */ ++ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && ++ (PageDirty(page) || ++ (PageAnon(page) && !PageSwapCache(page)))) ++ return false; ++ ++ /* raced with release_pages() */ ++ if (!get_page_unless_zero(page)) ++ return false; ++ ++ /* raced with another isolation */ ++ if (!TestClearPageLRU(page)) 
{ ++ put_page(page); ++ return false; ++ } ++ ++ /* see the comment on MAX_NR_TIERS */ ++ if (!PageReferenced(page)) ++ set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); ++ ++ /* for shrink_page_list() */ ++ ClearPageReclaim(page); ++ ClearPageReferenced(page); ++ ++ success = lru_gen_del_page(lruvec, page, true); ++ VM_WARN_ON_ONCE_PAGE(!success, page); ++ ++ return true; ++} ++ ++static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, ++ int type, int tier, struct list_head *list) ++{ ++ int gen, zone; ++ enum vm_event_item item; ++ int sorted = 0; ++ int scanned = 0; ++ int isolated = 0; ++ int remaining = MAX_LRU_BATCH; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ VM_WARN_ON_ONCE(!list_empty(list)); ++ ++ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) ++ return 0; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = sc->reclaim_idx; zone >= 0; zone--) { ++ LIST_HEAD(moved); ++ int skipped = 0; ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ int delta = thp_nr_pages(page); ++ ++ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); ++ VM_WARN_ON_ONCE_PAGE(PageActive(page), page); ++ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page); ++ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); ++ ++ scanned += delta; ++ ++ if (sort_page(lruvec, page, tier)) ++ sorted += delta; ++ else if (isolate_page(lruvec, page, sc)) { ++ list_add(&page->lru, list); ++ isolated += delta; ++ } else { ++ list_move(&page->lru, &moved); ++ skipped += delta; ++ } ++ ++ if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) ++ break; ++ } ++ ++ if (skipped) { ++ list_splice(&moved, head); ++ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); ++ } ++ ++ if (!remaining || isolated >= MIN_LRU_BATCH) ++ break; ++ } ++ ++ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; ++ if (!cgroup_reclaim(sc)) { ++ __count_vm_events(item, isolated); ++ __count_vm_events(PGREFILL, sorted); ++ } ++ __count_memcg_events(memcg, item, isolated); ++ __count_memcg_events(memcg, PGREFILL, sorted); ++ __count_vm_events(PGSCAN_ANON + type, isolated); ++ ++ /* ++ * There might not be eligible pages due to reclaim_idx, may_unmap and ++ * may_writepage. Check the remaining to prevent livelock if it's not ++ * making progress. ++ */ ++ return isolated || !remaining ? scanned : 0; ++} ++ ++static int get_tier_idx(struct lruvec *lruvec, int type) ++{ ++ int tier; ++ struct ctrl_pos sp, pv; ++ ++ /* ++ * To leave a margin for fluctuations, use a larger gain factor (1:2). ++ * This value is chosen because any other tier would have at least twice ++ * as many refaults as the first tier. ++ */ ++ read_ctrl_pos(lruvec, type, 0, 1, &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, 2, &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ return tier - 1; ++} ++ ++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) ++{ ++ int type, tier; ++ struct ctrl_pos sp, pv; ++ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; ++ ++ /* ++ * Compare the first tier of anon with that of file to determine which ++ * type to scan. Also need to compare other tiers of the selected type ++ * with the first tier of the other type to determine the last tier (of ++ * the selected type) to evict. 
++ */ ++ read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); ++ read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); ++ type = positive_ctrl_err(&sp, &pv); ++ ++ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ *tier_idx = tier - 1; ++ ++ return type; ++} ++ ++static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ int *type_scanned, struct list_head *list) ++{ ++ int i; ++ int type; ++ int scanned; ++ int tier = -1; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ /* ++ * Try to make the obvious choice first. When anon and file are both ++ * available from the same generation, interpret swappiness 1 as file ++ * first and 200 as anon first. ++ */ ++ if (!swappiness) ++ type = LRU_GEN_FILE; ++ else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) ++ type = LRU_GEN_ANON; ++ else if (swappiness == 1) ++ type = LRU_GEN_FILE; ++ else if (swappiness == 200) ++ type = LRU_GEN_ANON; ++ else ++ type = get_type_to_scan(lruvec, swappiness, &tier); ++ ++ for (i = !swappiness; i < ANON_AND_FILE; i++) { ++ if (tier < 0) ++ tier = get_tier_idx(lruvec, type); ++ ++ scanned = scan_pages(lruvec, sc, type, tier, list); ++ if (scanned) ++ break; ++ ++ type = !type; ++ tier = -1; ++ } ++ ++ *type_scanned = type; ++ ++ return scanned; ++} ++ ++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ int type; ++ int scanned; ++ int reclaimed; ++ LIST_HEAD(list); ++ struct page *page; ++ enum vm_event_item item; ++ struct reclaim_stat stat; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); ++ ++ scanned += try_to_inc_min_seq(lruvec, swappiness); ++ ++ if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) ++ scanned = 0; ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ if (list_empty(&list)) ++ return scanned; ++ ++ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); ++ ++ list_for_each_entry(page, &list, lru) { ++ /* restore LRU_REFS_FLAGS cleared by isolate_page() */ ++ if (PageWorkingset(page)) ++ SetPageReferenced(page); ++ ++ /* don't add rejected pages to the oldest generation */ ++ if (PageReclaim(page) && ++ (PageDirty(page) || PageWriteback(page))) ++ ClearPageActive(page); ++ else ++ SetPageActive(page); ++ } ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ move_pages_to_lru(lruvec, &list); ++ ++ item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; ++ if (!cgroup_reclaim(sc)) ++ __count_vm_events(item, reclaimed); ++ __count_memcg_events(memcg, item, reclaimed); ++ __count_vm_events(PGSTEAL_ANON + type, reclaimed); ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ mem_cgroup_uncharge_list(&list); ++ free_unref_page_list(&list); ++ ++ sc->nr_reclaimed += reclaimed; ++ ++ return scanned; ++} ++ ++static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, ++ bool can_swap) ++{ ++ bool need_aging; ++ unsigned long nr_to_scan; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (mem_cgroup_below_min(memcg) || ++ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) ++ return 0; ++ ++ need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); ++ if (!need_aging) ++ return nr_to_scan; ++ ++ /* skip the aging path at the default priority */ ++ if (sc->priority == DEF_PRIORITY) ++ goto done; ++ ++ /* leave the work to lru_gen_age_node() */ ++ if (current_is_kswapd()) ++ return 0; ++ ++ inc_max_seq(lruvec, max_seq, can_swap); ++done: ++ return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ unsigned long scanned = 0; ++ ++ lru_add_drain(); ++ ++ blk_start_plug(&plug); ++ ++ while (true) { ++ int delta; ++ int swappiness; ++ unsigned long nr_to_scan; ++ ++ if (sc->may_swap) ++ swappiness = get_swappiness(lruvec, sc); ++ else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) ++ swappiness = 1; ++ else ++ swappiness = 0; ++ ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); ++ if (!nr_to_scan) ++ break; ++ ++ delta = evict_pages(lruvec, sc, swappiness); ++ if (!delta) ++ break; ++ ++ scanned += delta; ++ if (scanned >= nr_to_scan) ++ break; ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++} ++ + /****************************************************************************** + * initialization + ******************************************************************************/ +@@ -2894,6 +3649,16 @@ static int __init init_lru_gen(void) + }; + late_initcall(init_lru_gen); + ++#else /* !CONFIG_LRU_GEN */ ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -2907,6 +3672,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + bool proportional_reclaim; + struct blk_plug plug; + ++ if (lru_gen_enabled()) { ++ lru_gen_shrink_lruvec(lruvec, sc); ++ return; ++ } ++ + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ +@@ -3372,6 +4142,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) + struct lruvec *target_lruvec; + unsigned long refaults; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[0] = refaults; +@@ -3736,12 +4509,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + } + #endif + +-static void age_active_anon(struct pglist_data *pgdat, +- struct scan_control *sc) ++static void kswapd_age_node(struct pglist_data 
*pgdat, struct scan_control *sc) + { + struct mem_cgroup *memcg; + struct lruvec *lruvec; + ++ if (lru_gen_enabled()) { ++ lru_gen_age_node(pgdat, sc); ++ return; ++ } ++ + if (!can_age_anon_pages(pgdat, sc)) + return; + +@@ -4058,12 +4835,11 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) + sc.may_swap = !nr_boost_reclaim; + + /* +- * Do some background aging of the anon list, to give +- * pages a chance to be referenced before reclaiming. All +- * pages are rotated regardless of classzone as this is +- * about consistent aging. ++ * Do some background aging, to give pages a chance to be ++ * referenced before reclaiming. All pages are rotated ++ * regardless of classzone as this is about consistent aging. + */ +- age_active_anon(pgdat, &sc); ++ kswapd_age_node(pgdat, &sc); + + /* + * If we're getting trouble reclaiming, start doing writepage +diff --git a/mm/workingset.c b/mm/workingset.c +index 880d882f3325..aeba62cebf8c 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly; + static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) + { +- eviction >>= bucket_order; + eviction &= EVICTION_MASK; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; +@@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, + + *memcgidp = memcgid; + *pgdat = NODE_DATA(nid); +- *evictionp = entry << bucket_order; ++ *evictionp = entry; + *workingsetp = workingset; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static void *lru_gen_eviction(struct page *page) ++{ ++ int hist; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lru_gen_struct *lrugen; ++ int type = page_is_file_lru(page); ++ int delta = thp_nr_pages(page); ++ int refs = page_lru_refs(page); ++ int tier = lru_tier_from_refs(refs); ++ struct mem_cgroup *memcg = page_memcg(page); ++ struct pglist_data *pgdat = page_pgdat(page); ++ ++ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); ++ ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->lrugen; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); ++ ++ hist = lru_hist_from_seq(min_seq); ++ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); ++ ++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); ++} ++ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++ int hist, tier, refs; ++ int memcg_id; ++ bool workingset; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lru_gen_struct *lrugen; ++ struct mem_cgroup *memcg; ++ struct pglist_data *pgdat; ++ int type = page_is_file_lru(page); ++ int delta = thp_nr_pages(page); ++ ++ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); ++ ++ if (pgdat != page_pgdat(page)) ++ return; ++ ++ rcu_read_lock(); ++ ++ memcg = page_memcg_rcu(page); ++ if (memcg_id != mem_cgroup_id(memcg)) ++ goto unlock; ++ ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->lrugen; ++ ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) ++ goto unlock; ++ ++ hist = lru_hist_from_seq(min_seq); ++ /* see the comment in page_lru_refs() */ ++ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; ++ tier = lru_tier_from_refs(refs); ++ ++ atomic_long_add(delta, 
&lrugen->refaulted[hist][type][tier]); ++ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); ++ ++ /* ++ * Count the following two cases as stalls: ++ * 1. For pages accessed through page tables, hotter pages pushed out ++ * hot pages which refaulted immediately. ++ * 2. For pages accessed multiple times through file descriptors, ++ * numbers of accesses might have been out of the range. ++ */ ++ if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) { ++ SetPageWorkingset(page); ++ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); ++ } ++unlock: ++ rcu_read_unlock(); ++} ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static void *lru_gen_eviction(struct page *page) ++{ ++ return NULL; ++} ++ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + /** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @lruvec: the lruvec that was aged +@@ -264,10 +360,14 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + ++ if (lru_gen_enabled()) ++ return lru_gen_eviction(page); ++ + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->nonresident_age); ++ eviction >>= bucket_order; + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); + } +@@ -296,7 +396,13 @@ void workingset_refault(struct page *page, void *shadow) + bool workingset; + int memcgid; + ++ if (lru_gen_enabled()) { ++ lru_gen_refault(page, shadow); ++ return; ++ } ++ + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); ++ eviction <<= bucket_order; + + rcu_read_lock(); + /* +-- +2.40.0 + |