diff --git a/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch b/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch
new file mode 100644
index 0000000000..09e4315dcc
--- /dev/null
+++ b/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch
@@ -0,0 +1,1002 @@
+From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <yuzhao@google.com>
+Date: Tue, 29 Jun 2021 20:46:47 -0600
+Subject: [PATCH 07/10] mm: multigenerational lru: eviction
+
+The eviction consumes old generations. Given an lruvec, the eviction
+scans pages on lrugen->lists indexed by the anon and file min_seq[]
+(modulo MAX_NR_GENS). It first tries to select a type based on the
+values of min_seq[]. If they are equal, it selects the type whose
+refaulted % is lower. The eviction sorts a page according to its
+updated generation number if the aging has found this page accessed.
+It also moves a page to the next generation if this page is from an
+upper tier that has a higher refaulted % than the base tier. The
+eviction increments min_seq[] of the selected type when it finds the
+lrugen->lists indexed by min_seq[] of that type are empty.
+
+Each generation is divided into multiple tiers. Tiers represent
+ranges of access counts from file descriptors only. Pages accessed
+N times via file descriptors belong to tier order_base_2(N). Each
+generation contains at most MAX_NR_TIERS tiers, and they require
+MAX_NR_TIERS-2 additional bits in page->flags. In contrast to moving
+between generations, which requires list operations, moving between
+tiers only involves operations on page->flags and therefore has a
+negligible cost. A feedback loop modeled after the PID controller
+monitors refaulted % across all tiers and decides from which tiers
+pages should be protected.
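+
+For example, a page accessed once via a file descriptor is in tier
+order_base_2(1) = 0; a second access moves it to tier 1, a third or
+fourth to tier 2, and so on, until the per-page counter saturates.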
+
+Unmapped pages are initially added to the oldest generation and then
+conditionally protected by tiers. Each tier keeps track of how many
+pages from it have refaulted. Tier 0 is the base tier and pages from
+it are evicted unconditionally because there are no better candidates.
+Pages from an upper tier are either evicted or moved to the next
+generation, depending on whether this upper tier has a higher
+refaulted % than the base tier. This model has the following
+advantages:
+ 1) It removes the cost from the buffered access path and reduces the
+ overall cost of protection because pages are conditionally protected
+ in the reclaim path.
+ 2) It takes mapped pages into account and avoids overprotecting
+ pages accessed multiple times via file descriptors.
+ 3) Additional tiers improve the protection of pages accessed more
+ than twice.
+
+Signed-off-by: Yu Zhao <yuzhao@google.com>
+Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
+Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac
+---
+ include/linux/mm_inline.h | 10 +
+ include/linux/mmzone.h | 33 +++
+ mm/swap.c | 42 +++
+ mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++-
+ mm/workingset.c | 120 ++++++++-
+ 5 files changed, 757 insertions(+), 3 deletions(-)
+
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi
+ return seq % NR_HIST_GENS;
+ }
+
++/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */
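++/* refs 0 maps to tier 0, 1 to 1, 2..3 to 2, 4..7 to 3, per order_base_2(refs + 1). */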
++static inline int lru_tier_from_refs(int refs)
++{
++ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH));
++
++ return order_base_2(refs + 1);
++}
++
+ /* The youngest and the second youngest generations are counted as active. */
+ static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+ {
+@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru
+ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+ new_flags &= ~LRU_GEN_MASK;
++ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS)
++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
+ /* for shrink_page_list() */
+ if (reclaiming)
+ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -319,6 +319,30 @@ struct page_vma_mapped_walk;
+ #define MIN_NR_GENS 2
+ #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+
++/*
++ * Each generation is divided into multiple tiers. Tiers represent ranges of
++ * access counts from file descriptors, i.e., mark_page_accessed(). In
++ * contrast to moving between generations, which requires the lru lock,
++ * moving between tiers only involves an atomic operation on page->flags and
++ * therefore has a negligible cost.
++ *
++ * The purposes of tiers are to:
++ * 1) estimate whether pages accessed multiple times via file descriptors are
++ * more active than pages accessed only via page tables by separating the two
++ * access types into upper tiers and the base tier, and comparing refaulted %
++ * across all tiers.
++ * 2) improve buffered I/O performance by deferring the protection of pages
++ * accessed multiple times until the eviction. That is, the protection happens
++ * in the reclaim path, not the access path.
++ *
++ * Pages accessed N times via file descriptors belong to tier order_base_2(N).
++ * The base tier may be marked by PageReferenced(). All upper tiers are marked
++ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are
++ * used to support more than one upper tier.
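++ *
++ * For example, assuming two such additional bits: a tier 0 page has at most
++ * PG_referenced set; a tier 1 page has PG_referenced and PG_workingset set
++ * with a zero counter; tiers 2 and 3 use the counter bits on top of that.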
++ */
++#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN)
++#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
++
+ /* Whether to keep stats for historical generations. */
+ #ifdef CONFIG_LRU_GEN_STATS
+ #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+@@ -337,6 +361,15 @@ struct lrugen {
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the sizes of the multigenerational lru lists in pages */
+ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
++ /* the exponential moving average of refaulted pages */
++ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
++ /* the exponential moving average of protected+evicted pages */
++ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
++ /* the base tier isn't protected, hence the minus one */
++ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
++ /* incremented without holding the lru lock */
++ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
++ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multigenerational lru is enabled */
+ bool enabled[ANON_AND_FILE];
+ };
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st
+ local_unlock(&lru_pvecs.lock);
+ }
+
++#ifdef CONFIG_LRU_GEN
++static void page_inc_refs(struct page *page)
++{
++ unsigned long refs;
++ unsigned long old_flags, new_flags;
++
++ if (PageUnevictable(page))
++ return;
++
++ /* see the comment on MAX_NR_TIERS */
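++ /* 1st access sets PG_referenced, 2nd sets PG_workingset, later ones bump refs */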
++ do {
++ new_flags = old_flags = READ_ONCE(page->flags);
++
++ if (!(new_flags & BIT(PG_referenced))) {
++ new_flags |= BIT(PG_referenced);
++ continue;
++ }
++
++ if (!(new_flags & BIT(PG_workingset))) {
++ new_flags |= BIT(PG_workingset);
++ continue;
++ }
++
++ refs = new_flags & LRU_REFS_MASK;
++ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK);
++
++ new_flags &= ~LRU_REFS_MASK;
++ new_flags |= refs;
++ } while (new_flags != old_flags &&
++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
++}
++#else
++static void page_inc_refs(struct page *page)
++{
++}
++#endif /* CONFIG_LRU_GEN */
++
+ /*
+ * Mark a page as having seen activity.
+ *
+@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag
+ {
+ page = compound_head(page);
+
++ if (lru_gen_enabled()) {
++ page_inc_refs(page);
++ return;
++ }
++
+ if (!PageReferenced(page)) {
+ SetPageReferenced(page);
+ } else if (PageUnevictable(page)) {
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre
+
+ if (PageSwapCache(page)) {
+ swp_entry_t swap = { .val = page_private(page) };
+- mem_cgroup_swapout(page, swap);
++
++ /* get a shadow entry before page_memcg() is cleared */
+ if (reclaimed && !mapping_exiting(mapping))
+ shadow = workingset_eviction(page, target_memcg);
++ mem_cgroup_swapout(page, swap);
+ __delete_from_swap_cache(page, swap, shadow);
+ xa_unlock_irq(&mapping->i_pages);
+ put_swap_page(page, swap);
+@@ -1410,6 +1412,11 @@ retry:
+ if (!sc->may_unmap && page_mapped(page))
+ goto keep_locked;
+
++ /* lru_gen_look_around() has updated this page? */
++ if (lru_gen_enabled() && !ignore_references &&
++ page_mapped(page) && PageReferenced(page))
++ goto keep_locked;
++
+ may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+ (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+@@ -2570,6 +2577,9 @@ static void prepare_scan_count(pg_data_t
+ unsigned long file;
+ struct lruvec *target_lruvec;
+
++ if (lru_gen_enabled())
++ return;
++
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ /*
+@@ -2910,6 +2920,17 @@ static int page_lru_gen(struct page *pag
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ }
+
++static int page_lru_tier(struct page *page)
++{
++ int refs;
++ unsigned long flags = READ_ONCE(page->flags);
++
++ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ?
++ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0;
++
++ return lru_tier_from_refs(refs);
++}
++
+ static int get_swappiness(struct mem_cgroup *memcg)
+ {
+ return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ?
+@@ -3246,6 +3267,91 @@ done:
+ }
+
+ /******************************************************************************
++ * refault feedback loop
++ ******************************************************************************/
++
++/*
++ * A feedback loop modeled after the PID controller. Currently supports the
++ * proportional (P) and the integral (I) terms; the derivative (D) term can be
++ * added if necessary. The setpoint (SP) is the desired position; the process
++ * variable (PV) is the measured position. The error is the difference between
++ * the SP and the PV. A positive error results in a positive control output
++ * correction, which, in our case, is to allow eviction.
++ *
++ * The P term is the refaulted % of the current generation being evicted. The
++ * I term is the exponential moving average of the refaulted % of previously
++ * evicted generations, using a smoothing factor of 1/2.
++ *
++ * Our goal is to maintain proportional refaulted % across all tiers.
++ */
++struct ctrl_pos {
++ unsigned long refaulted;
++ unsigned long total;
++ int gain;
++};
++
++static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
++ struct ctrl_pos *pos)
++{
++ struct lrugen *lrugen = &lruvec->evictable;
++ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
++
++ pos->refaulted = lrugen->avg_refaulted[type][tier] +
++ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
++ pos->total = lrugen->avg_total[type][tier] +
++ atomic_long_read(&lrugen->evicted[hist][type][tier]);
++ if (tier)
++ pos->total += lrugen->protected[hist][type][tier - 1];
++ pos->gain = gain;
++}
++
++static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type)
++{
++ int tier;
++ int hist = lru_hist_from_seq(gen);
++ struct lrugen *lrugen = &lruvec->evictable;
++ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
++ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
++
++ if (!carryover && !clear)
++ return;
++
++ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
++ if (carryover) {
++ unsigned long sum;
++
++ sum = lrugen->avg_refaulted[type][tier] +
++ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
++ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
++
++ sum = lrugen->avg_total[type][tier] +
++ atomic_long_read(&lrugen->evicted[hist][type][tier]);
++ if (tier)
++ sum += lrugen->protected[hist][type][tier - 1];
++ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
++ }
++
++ if (clear) {
++ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
++ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
++ if (tier)
++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
++ }
++ }
++}
++
++static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
++{
++ /*
++ * Allow eviction if the PV has a limited number of refaulted pages or a
++ * lower refaulted % than the SP.
++ */
++ return pv->refaulted < MIN_BATCH_SIZE ||
++ pv->refaulted * max(sp->total, 1UL) * sp->gain <=
++ sp->refaulted * max(pv->total, 1UL) * pv->gain;
++}
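++
++/*
++ * For example (illustrative numbers, assuming MIN_BATCH_SIZE is well below
++ * them): with sp = { .refaulted = 100, .total = 1000, .gain = 1 } for the
++ * base tier and pv = { .refaulted = 300, .total = 1000, .gain = 2 } for an
++ * upper tier, 300 * 1000 * 1 > 100 * 1000 * 2, so the error isn't positive
++ * and the upper tier is protected rather than evicted.
++ */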
++
++/******************************************************************************
+ * the aging
+ ******************************************************************************/
+
+@@ -3265,6 +3371,7 @@ static int page_update_gen(struct page *
+
+ new_flags &= ~LRU_GEN_MASK;
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
+ } while (new_flags != old_flags &&
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+@@ -3296,6 +3403,7 @@ static void page_inc_gen(struct page *pa
+
+ new_flags &= ~LRU_GEN_MASK;
+ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
+ /* for end_page_writeback() */
+ if (reclaiming)
+ new_flags |= BIT(PG_reclaim);
+@@ -3787,6 +3895,7 @@ static bool inc_min_seq(struct lruvec *l
+ }
+ }
+
++ reset_ctrl_pos(lruvec, gen, type);
+ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+
+ return true;
+@@ -3824,6 +3933,8 @@ next:
+ if (min_seq[type] == lrugen->min_seq[type])
+ continue;
+
++ gen = lru_gen_from_seq(lrugen->min_seq[type]);
++ reset_ctrl_pos(lruvec, gen, type);
+ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
+ success = true;
+ }
+@@ -3885,6 +3996,9 @@ static void inc_max_seq(struct lruvec *l
+ }
+ }
+
++ for (type = 0; type < ANON_AND_FILE; type++)
++ reset_ctrl_pos(lruvec, gen, type);
++
+ WRITE_ONCE(lrugen->timestamps[gen], jiffies);
+ /* make sure all preceding modifications appear first */
+ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+@@ -4166,6 +4280,433 @@ void lru_gen_look_around(struct page_vma
+ }
+
+ /******************************************************************************
++ * the eviction
++ ******************************************************************************/
++
++static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx)
++{
++ bool success;
++ int gen = page_lru_gen(page);
++ int type = page_is_file_lru(page);
++ int zone = page_zonenum(page);
++ int tier = page_lru_tier(page);
++ int delta = thp_nr_pages(page);
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page);
++
++ /* an mlocked page? */
++ if (!page_evictable(page)) {
++ success = lru_gen_del_page(page, lruvec, true);
++ VM_BUG_ON_PAGE(!success, page);
++ SetPageUnevictable(page);
++ add_page_to_lru_list(page, lruvec);
++ __count_vm_events(UNEVICTABLE_PGCULLED, delta);
++ return true;
++ }
++
++ /* a lazy-free page that has been written into? */
++ if (type && PageDirty(page) && PageAnon(page)) {
++ success = lru_gen_del_page(page, lruvec, true);
++ VM_BUG_ON_PAGE(!success, page);
++ SetPageSwapBacked(page);
++ add_page_to_lru_list_tail(page, lruvec);
++ return true;
++ }
++
++ /* page_update_gen() has updated this page? */
++ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
++ list_move(&page->lru, &lrugen->lists[gen][type][zone]);
++ return true;
++ }
++
++ /* protect this page if its tier has a higher refaulted % */
++ if (tier > tier_idx) {
++ int hist = lru_hist_from_seq(gen);
++
++ page_inc_gen(page, lruvec, false);
++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
++ lrugen->protected[hist][type][tier - 1] + delta);
++ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
++ return true;
++ }
++
++ /* mark this page for reclaim if it's pending writeback */
++ if (PageWriteback(page) || (type && PageDirty(page))) {
++ page_inc_gen(page, lruvec, true);
++ return true;
++ }
++
++ return false;
++}
++
++static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc)
++{
++ bool success;
++
++ if (!sc->may_unmap && page_mapped(page))
++ return false;
++
++ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
++ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
++ return false;
++
++ if (!get_page_unless_zero(page))
++ return false;
++
++ if (!TestClearPageLRU(page)) {
++ put_page(page);
++ return false;
++ }
++
++ success = lru_gen_del_page(page, lruvec, true);
++ VM_BUG_ON_PAGE(!success, page);
++
++ return true;
++}
++
++static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
++ int type, int tier, struct list_head *list)
++{
++ int gen, zone;
++ enum vm_event_item item;
++ int sorted = 0;
++ int scanned = 0;
++ int isolated = 0;
++ int remaining = MAX_BATCH_SIZE;
++ struct lrugen *lrugen = &lruvec->evictable;
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++
++ VM_BUG_ON(!list_empty(list));
++
++ if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
++ return 0;
++
++ gen = lru_gen_from_seq(lrugen->min_seq[type]);
++
++ for (zone = sc->reclaim_idx; zone >= 0; zone--) {
++ LIST_HEAD(moved);
++ int skipped = 0;
++ struct list_head *head = &lrugen->lists[gen][type][zone];
++
++ while (!list_empty(head)) {
++ struct page *page = lru_to_page(head);
++ int delta = thp_nr_pages(page);
++
++ VM_BUG_ON_PAGE(PageTail(page), page);
++ VM_BUG_ON_PAGE(PageUnevictable(page), page);
++ VM_BUG_ON_PAGE(PageActive(page), page);
++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
++
++ prefetchw_prev_lru_page(page, head, flags);
++
++ scanned += delta;
++
++ if (sort_page(page, lruvec, tier))
++ sorted += delta;
++ else if (isolate_page(page, lruvec, sc)) {
++ list_add(&page->lru, list);
++ isolated += delta;
++ } else {
++ list_move(&page->lru, &moved);
++ skipped += delta;
++ }
++
++ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE)
++ break;
++ }
++
++ if (skipped) {
++ list_splice(&moved, head);
++ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
++ }
++
++ if (!remaining || isolated >= MIN_BATCH_SIZE)
++ break;
++ }
++
++ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
++ if (!cgroup_reclaim(sc)) {
++ __count_vm_events(item, isolated);
++ __count_vm_events(PGREFILL, sorted);
++ }
++ __count_memcg_events(memcg, item, isolated);
++ __count_memcg_events(memcg, PGREFILL, sorted);
++ __count_vm_events(PGSCAN_ANON + type, isolated);
++
++ /*
++ * We may have trouble finding eligible pages due to reclaim_idx,
++ * may_unmap and may_writepage. Check `remaining` to make sure we won't
++ * be stuck if we aren't making enough progress.
++ */
++ return isolated || !remaining ? scanned : 0;
++}
++
++static int get_tier_idx(struct lruvec *lruvec, int type)
++{
++ int tier;
++ struct ctrl_pos sp, pv;
++
++ /*
++ * Ideally we don't want to evict upper tiers that have higher refaulted
++ * %. However, we need to leave a margin for the fluctuation in
++ * refaulted %. So we use a larger gain factor to make sure upper tiers
++ * are indeed more active. We choose 2 because the lowest upper tier
++ * would have twice of refaulted % of the base tier, according to their
++ * numbers of accesses.
++ */
++ read_ctrl_pos(lruvec, type, 0, 1, &sp);
++ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
++ read_ctrl_pos(lruvec, type, tier, 2, &pv);
++ if (!positive_ctrl_err(&sp, &pv))
++ break;
++ }
++
++ return tier - 1;
++}
++
++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
++{
++ int type, tier;
++ struct ctrl_pos sp, pv;
++ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
++
++ /*
++ * Compare refaulted % between the base tiers of anon and file to
++ * determine which type to evict. We also compare the refaulted % of
++ * the upper tiers of the selected type with that of the base tier of
++ * the other type to determine which tier of the selected type to evict.
++ */
++ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp);
++ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv);
++ type = positive_ctrl_err(&sp, &pv);
++
++ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
++ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
++ read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
++ if (!positive_ctrl_err(&sp, &pv))
++ break;
++ }
++
++ *tier_idx = tier - 1;
++
++ return type;
++}
++
++static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
++ int *type_scanned, struct list_head *list)
++{
++ int i;
++ int type;
++ int scanned;
++ int tier = -1;
++ DEFINE_MIN_SEQ(lruvec);
++
++ VM_BUG_ON(!seq_is_valid(lruvec));
++
++ /*
++ * Try to select a type based on generations and swappiness, and if that
++ * fails, fall back to get_type_to_scan(). When anon and file are both
++ * available from the same generation, swappiness 200 is interpreted as
++ * anon first and swappiness 1 is interpreted as file first.
++ */
++ if (!swappiness)
++ type = 1;
++ else if (min_seq[0] < min_seq[1])
++ type = 0;
++ else if (swappiness == 1)
++ type = 1;
++ else if (swappiness == 200)
++ type = 0;
++ else
++ type = get_type_to_scan(lruvec, swappiness, &tier);
++
++ for (i = !swappiness; i < ANON_AND_FILE; i++) {
++ if (tier < 0)
++ tier = get_tier_idx(lruvec, type);
++
++ scanned = scan_pages(lruvec, sc, type, tier, list);
++ if (scanned)
++ break;
++
++ type = !type;
++ tier = -1;
++ }
++
++ *type_scanned = type;
++
++ return scanned;
++}
++
++/* Main function used by the foreground, the background and the user-triggered eviction. */
++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
++{
++ int type;
++ int scanned;
++ int reclaimed;
++ LIST_HEAD(list);
++ struct page *page;
++ enum vm_event_item item;
++ struct reclaim_stat stat;
++ struct mm_walk_args *args;
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
++
++ spin_lock_irq(&lruvec->lru_lock);
++
++ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
++
++ if (try_to_inc_min_seq(lruvec, swappiness))
++ scanned++;
++
++ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS)
++ scanned = 0;
++
++ spin_unlock_irq(&lruvec->lru_lock);
++
++ if (list_empty(&list))
++ return scanned;
++
++ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
++ /*
++ * We need to prevent rejected pages from being added back to the same
++ * lists they were isolated from. Otherwise we may risk looping on them
++ * forever.
++ */
++ list_for_each_entry(page, &list, lru) {
++ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page)))
++ SetPageActive(page);
++
++ ClearPageReferenced(page);
++ ClearPageWorkingset(page);
++ }
++
++ spin_lock_irq(&lruvec->lru_lock);
++
++ move_pages_to_lru(lruvec, &list);
++
++ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL;
++ if (args && args->batch_size)
++ reset_batch_size(lruvec, args);
++
++ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
++ if (!cgroup_reclaim(sc))
++ __count_vm_events(item, reclaimed);
++ __count_memcg_events(memcg, item, reclaimed);
++ __count_vm_events(PGSTEAL_ANON + type, reclaimed);
++
++ spin_unlock_irq(&lruvec->lru_lock);
++
++ mem_cgroup_uncharge_list(&list);
++ free_unref_page_list(&list);
++
++ sc->nr_reclaimed += reclaimed;
++
++ return scanned;
++}
++
++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
++{
++ bool low;
++ long nr_to_scan;
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++ int priority = sc->priority;
++ DEFINE_MAX_SEQ(lruvec);
++ DEFINE_MIN_SEQ(lruvec);
++
++ if (mem_cgroup_below_min(memcg) ||
++ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
++ return 0;
++
++ if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
++ priority = DEF_PRIORITY;
++ sc->force_deactivate = 0;
++ }
++
++ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low);
++ if (!nr_to_scan)
++ return 0;
++
++ nr_to_scan >>= priority;
++
++ if (!mem_cgroup_online(memcg))
++ nr_to_scan++;
++
++ if (!nr_to_scan)
++ return 0;
++
++ if (current_is_kswapd()) {
++ /* leave the work to lru_gen_age_node() */
++ if (max_seq - min_seq[1] < MIN_NR_GENS)
++ return 0;
++
++ if (!low)
++ sc->force_deactivate = 0;
++
++ return nr_to_scan;
++ }
++
++ if (max_seq - min_seq[1] >= MIN_NR_GENS)
++ return nr_to_scan;
++
++ /* move onto slab and other memcgs if we haven't tried them all */
++ if (!sc->force_deactivate) {
++ sc->skipped_deactivate = 1;
++ return 0;
++ }
++
++ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0;
++}
++
++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++ struct blk_plug plug;
++ long scanned = 0;
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
++
++ lru_add_drain();
++
++ if (current_is_kswapd())
++ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args;
++
++ blk_start_plug(&plug);
++
++ while (true) {
++ int delta;
++ int swappiness;
++ long nr_to_scan;
++
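++ /* may_swap: memcg swappiness; global reclaim with swap available: 1; else: 0 */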
++ if (sc->may_swap)
++ swappiness = get_swappiness(memcg);
++ else if (!cgroup_reclaim(sc) && get_swappiness(memcg))
++ swappiness = 1;
++ else
++ swappiness = 0;
++
++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
++ if (!nr_to_scan)
++ break;
++
++ delta = evict_pages(lruvec, sc, swappiness);
++ if (!delta)
++ break;
++
++ scanned += delta;
++ if (scanned >= nr_to_scan)
++ break;
++
++ cond_resched();
++ }
++
++ blk_finish_plug(&plug);
++
++ if (current_is_kswapd())
++ current->reclaim_state->mm_walk_args = NULL;
++}
++
++/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+@@ -4420,6 +4961,10 @@ static void lru_gen_age_node(struct pgli
+ {
+ }
+
++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++}
++
+ #endif /* CONFIG_LRU_GEN */
+
+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+@@ -4433,6 +4978,11 @@ static void shrink_lruvec(struct lruvec
+ struct blk_plug plug;
+ bool scan_adjusted;
+
++ if (lru_gen_enabled()) {
++ lru_gen_shrink_lruvec(lruvec, sc);
++ return;
++ }
++
+ get_scan_count(lruvec, sc, nr);
+
+ /* Record the original scan target for proportional adjustments later */
+@@ -4906,6 +5456,9 @@ static void snapshot_refaults(struct mem
+ struct lruvec *target_lruvec;
+ unsigned long refaults;
+
++ if (lru_gen_enabled())
++ return;
++
+ target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
+ target_lruvec->refaults[0] = refaults;
+--- a/mm/workingset.c
++++ b/mm/workingset.c
+@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_
+ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+ bool workingset)
+ {
+- eviction >>= bucket_order;
+ eviction &= EVICTION_MASK;
+ eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
+ eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow,
+
+ *memcgidp = memcgid;
+ *pgdat = NODE_DATA(nid);
+- *evictionp = entry << bucket_order;
++ *evictionp = entry;
+ *workingsetp = workingset;
+ }
+
++#ifdef CONFIG_LRU_GEN
++
++static int page_lru_refs(struct page *page)
++{
++ unsigned long flags = READ_ONCE(page->flags);
++
++ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
++
++ /* see the comment on MAX_NR_TIERS */
++ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0;
++}
++
++/* Return a token to be stored in the shadow entry of a page being evicted. */
++static void *lru_gen_eviction(struct page *page)
++{
++ int hist, tier;
++ unsigned long token;
++ unsigned long min_seq;
++ struct lruvec *lruvec;
++ struct lrugen *lrugen;
++ int type = page_is_file_lru(page);
++ int refs = page_lru_refs(page);
++ int delta = thp_nr_pages(page);
++ bool workingset = PageWorkingset(page);
++ struct mem_cgroup *memcg = page_memcg(page);
++ struct pglist_data *pgdat = page_pgdat(page);
++
++ lruvec = mem_cgroup_lruvec(memcg, pgdat);
++ lrugen = &lruvec->evictable;
++ min_seq = READ_ONCE(lrugen->min_seq[type]);
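++ /* min_seq goes into the high bits; refs into the low LRU_REFS_WIDTH bits */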
++ token = (min_seq << LRU_REFS_WIDTH) | refs;
++
++ hist = lru_hist_from_seq(min_seq);
++ tier = lru_tier_from_refs(refs + workingset);
++ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
++
++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
++}
++
++/* Count a refaulted page based on the token stored in its shadow entry. */
++static void lru_gen_refault(struct page *page, void *shadow)
++{
++ int hist, tier, refs;
++ int memcg_id;
++ bool workingset;
++ unsigned long token;
++ unsigned long min_seq;
++ struct lruvec *lruvec;
++ struct lrugen *lrugen;
++ struct mem_cgroup *memcg;
++ struct pglist_data *pgdat;
++ int type = page_is_file_lru(page);
++ int delta = thp_nr_pages(page);
++
++ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
++ if (page_pgdat(page) != pgdat)
++ return;
++
++ rcu_read_lock();
++ memcg = page_memcg_rcu(page);
++ if (mem_cgroup_id(memcg) != memcg_id)
++ goto unlock;
++
++ refs = token & (BIT(LRU_REFS_WIDTH) - 1);
++ if (refs && !workingset)
++ goto unlock;
++
++ token >>= LRU_REFS_WIDTH;
++ lruvec = mem_cgroup_lruvec(memcg, pgdat);
++ lrugen = &lruvec->evictable;
++ min_seq = READ_ONCE(lrugen->min_seq[type]);
++ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
++ goto unlock;
++
++ hist = lru_hist_from_seq(min_seq);
++ tier = lru_tier_from_refs(refs + workingset);
++ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
++ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
++
++ /*
++ * Tiers don't offer any protection to pages accessed via page tables.
++ * That's what generations do. Tiers can't fully protect pages after
++ * their number of accesses has exceeded the max value. Conservatively
++ * count these two conditions as stalls even though they might not
++ * indicate any real memory pressure.
++ */
++ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) {
++ SetPageWorkingset(page);
++ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
++ }
++unlock:
++ rcu_read_unlock();
++}
++
++#else
++
++static void *lru_gen_eviction(struct page *page)
++{
++ return NULL;
++}
++
++static void lru_gen_refault(struct page *page, void *shadow)
++{
++}
++
++#endif /* CONFIG_LRU_GEN */
++
+ /**
+ * workingset_age_nonresident - age non-resident entries as LRU ages
+ * @lruvec: the lruvec that was aged
+@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
++ if (lru_gen_enabled())
++ return lru_gen_eviction(page);
++
+ lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ /* XXX: target_memcg can be NULL, go through lruvec */
+ memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
+ eviction = atomic_long_read(&lruvec->nonresident_age);
++ eviction >>= bucket_order;
+ workingset_age_nonresident(lruvec, thp_nr_pages(page));
+ return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
+ }
+@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag
+ bool workingset;
+ int memcgid;
+
++ if (lru_gen_enabled()) {
++ lru_gen_refault(page, shadow);
++ return;
++ }
++
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
++ eviction <<= bucket_order;
+
+ rcu_read_lock();
+ /*