diff options
Diffstat (limited to 'target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch')
-rw-r--r-- | target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch | 1002 |
1 files changed, 1002 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch b/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch new file mode 100644 index 0000000000..09e4315dcc --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch @@ -0,0 +1,1002 @@ +From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001 +From: Yu Zhao <yuzhao@google.com> +Date: Tue, 29 Jun 2021 20:46:47 -0600 +Subject: [PATCH 07/10] mm: multigenerational lru: eviction + +The eviction consumes old generations. Given an lruvec, the eviction +scans pages on lrugen->lists indexed by anon and file min_seq[] +(modulo MAX_NR_GENS). It first tries to select a type based on the +values of min_seq[]. If they are equal, it selects the type that has +a lower refaulted %. The eviction sorts a page according to its +updated generation number if the aging has found this page accessed. +It also moves a page to the next generation if this page is from an +upper tier that has a higher refaulted % than the base tier. The +eviction increments min_seq[] of a selected type when it finds +lrugen->lists indexed by min_seq[] of this selected type are empty. + +Each generation is divided into multiple tiers. Tiers represent +different ranges of numbers of accesses from file descriptors only. +Pages accessed N times via file descriptors belong to tier +order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers, +and they require additional MAX_NR_TIERS-2 bits in page->flags. In +contrast to moving between generations which requires list operations, +moving between tiers only involves operations on page->flags and +therefore has a negligible cost. A feedback loop modeled after the PID +controller monitors refaulted % across all tiers and decides when to +protect pages from which tiers. + +Unmapped pages are initially added to the oldest generation and then +conditionally protected by tiers. Each tier keeps track of how many +pages from it have refaulted. Tier 0 is the base tier and pages from +it are evicted unconditionally because there are no better candidates. +Pages from an upper tier are either evicted or moved to the next +generation, depending on whether this upper tier has a higher +refaulted % than the base tier. This model has the following +advantages: + 1) It removes the cost in the buffered access path and reduces the + overall cost of protection because pages are conditionally protected + in the reclaim path. + 2) It takes mapped pages into account and avoids overprotecting + pages accessed multiple times via file descriptors. + 3 Additional tiers improve the protection of pages accessed more + than twice. + +Signed-off-by: Yu Zhao <yuzhao@google.com> +Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> +Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac +--- + include/linux/mm_inline.h | 10 + + include/linux/mmzone.h | 33 +++ + mm/swap.c | 42 +++ + mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++- + mm/workingset.c | 120 ++++++++- + 5 files changed, 757 insertions(+), 3 deletions(-) + +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi + return seq % NR_HIST_GENS; + } + ++/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */ ++static inline int lru_tier_from_refs(int refs) ++{ ++ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH)); ++ ++ return order_base_2(refs + 1); ++} ++ + /* The youngest and the second youngest generations are counted as active. */ + static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) + { +@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru + gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + + new_flags &= ~LRU_GEN_MASK; ++ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS) ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + /* for shrink_page_list() */ + if (reclaiming) + new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -319,6 +319,30 @@ struct page_vma_mapped_walk; + #define MIN_NR_GENS 2 + #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) + ++/* ++ * Each generation is divided into multiple tiers. Tiers represent different ++ * ranges of numbers of accesses from file descriptors, i.e., ++ * mark_page_accessed(). In contrast to moving between generations which ++ * requires the lru lock, moving between tiers only involves an atomic ++ * operation on page->flags and therefore has a negligible cost. ++ * ++ * The purposes of tiers are to: ++ * 1) estimate whether pages accessed multiple times via file descriptors are ++ * more active than pages accessed only via page tables by separating the two ++ * access types into upper tiers and the base tier, and comparing refaulted % ++ * across all tiers. ++ * 2) improve buffered io performance by deferring the protection of pages ++ * accessed multiple times until the eviction. That is the protection happens ++ * in the reclaim path, not the access path. ++ * ++ * Pages accessed N times via file descriptors belong to tier order_base_2(N). ++ * The base tier may be marked by PageReferenced(). All upper tiers are marked ++ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are ++ * used to support more than one upper tier. ++ */ ++#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) ++#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) ++ + /* Whether to keep stats for historical generations. */ + #ifdef CONFIG_LRU_GEN_STATS + #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) +@@ -337,6 +361,15 @@ struct lrugen { + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the sizes of the multigenerational lru lists in pages */ + unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the exponential moving average of refaulted */ ++ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the exponential moving average of protected+evicted */ ++ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the base tier isn't protected, hence the minus one */ ++ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; ++ /* incremented without holding the lru lock */ ++ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + /* whether the multigenerational lru is enabled */ + bool enabled[ANON_AND_FILE]; + }; +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st + local_unlock(&lru_pvecs.lock); + } + ++#ifdef CONFIG_LRU_GEN ++static void page_inc_refs(struct page *page) ++{ ++ unsigned long refs; ++ unsigned long old_flags, new_flags; ++ ++ if (PageUnevictable(page)) ++ return; ++ ++ /* see the comment on MAX_NR_TIERS */ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ ++ if (!(new_flags & BIT(PG_referenced))) { ++ new_flags |= BIT(PG_referenced); ++ continue; ++ } ++ ++ if (!(new_flags & BIT(PG_workingset))) { ++ new_flags |= BIT(PG_workingset); ++ continue; ++ } ++ ++ refs = new_flags & LRU_REFS_MASK; ++ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK); ++ ++ new_flags &= ~LRU_REFS_MASK; ++ new_flags |= refs; ++ } while (new_flags != old_flags && ++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++} ++#else ++static void page_inc_refs(struct page *page) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + /* + * Mark a page as having seen activity. + * +@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag + { + page = compound_head(page); + ++ if (lru_gen_enabled()) { ++ page_inc_refs(page); ++ return; ++ } ++ + if (!PageReferenced(page)) { + SetPageReferenced(page); + } else if (PageUnevictable(page)) { +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; +- mem_cgroup_swapout(page, swap); ++ ++ /* get a shadow entry before page_memcg() is cleared */ + if (reclaimed && !mapping_exiting(mapping)) + shadow = workingset_eviction(page, target_memcg); ++ mem_cgroup_swapout(page, swap); + __delete_from_swap_cache(page, swap, shadow); + xa_unlock_irq(&mapping->i_pages); + put_swap_page(page, swap); +@@ -1410,6 +1412,11 @@ retry: + if (!sc->may_unmap && page_mapped(page)) + goto keep_locked; + ++ /* lru_gen_look_around() has updated this page? */ ++ if (lru_gen_enabled() && !ignore_references && ++ page_mapped(page) && PageReferenced(page)) ++ goto keep_locked; ++ + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + +@@ -2570,6 +2577,9 @@ static void prepare_scan_count(pg_data_t + unsigned long file; + struct lruvec *target_lruvec; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* +@@ -2910,6 +2920,17 @@ static int page_lru_gen(struct page *pag + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + } + ++static int page_lru_tier(struct page *page) ++{ ++ int refs; ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ? ++ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0; ++ ++ return lru_tier_from_refs(refs); ++} ++ + static int get_swappiness(struct mem_cgroup *memcg) + { + return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? +@@ -3246,6 +3267,91 @@ done: + } + + /****************************************************************************** ++ * refault feedback loop ++ ******************************************************************************/ ++ ++/* ++ * A feedback loop modeled after the PID controller. Currently supports the ++ * proportional (P) and the integral (I) terms; the derivative (D) term can be ++ * added if necessary. The setpoint (SP) is the desired position; the process ++ * variable (PV) is the measured position. The error is the difference between ++ * the SP and the PV. A positive error results in a positive control output ++ * correction, which, in our case, is to allow eviction. ++ * ++ * The P term is refaulted % of the current generation being evicted. The I ++ * term is the exponential moving average of refaulted % of previously evicted ++ * generations, using the smoothing factor 1/2. ++ * ++ * Our goal is to maintain proportional refaulted % across all tiers. ++ */ ++struct ctrl_pos { ++ unsigned long refaulted; ++ unsigned long total; ++ int gain; ++}; ++ ++static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, ++ struct ctrl_pos *pos) ++{ ++ struct lrugen *lrugen = &lruvec->evictable; ++ int hist = lru_hist_from_seq(lrugen->min_seq[type]); ++ ++ pos->refaulted = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ pos->total = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ pos->total += lrugen->protected[hist][type][tier - 1]; ++ pos->gain = gain; ++} ++ ++static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type) ++{ ++ int tier; ++ int hist = lru_hist_from_seq(gen); ++ struct lrugen *lrugen = &lruvec->evictable; ++ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); ++ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; ++ ++ if (!carryover && !clear) ++ return; ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ if (carryover) { ++ unsigned long sum; ++ ++ sum = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); ++ ++ sum = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ sum += lrugen->protected[hist][type][tier - 1]; ++ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); ++ } ++ ++ if (clear) { ++ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); ++ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); ++ if (tier) ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); ++ } ++ } ++} ++ ++static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) ++{ ++ /* ++ * Allow eviction if the PV has a limited number of refaulted pages or a ++ * lower refaulted % than the SP. ++ */ ++ return pv->refaulted < MIN_BATCH_SIZE || ++ pv->refaulted * max(sp->total, 1UL) * sp->gain <= ++ sp->refaulted * max(pv->total, 1UL) * pv->gain; ++} ++ ++/****************************************************************************** + * the aging + ******************************************************************************/ + +@@ -3265,6 +3371,7 @@ static int page_update_gen(struct page * + + new_flags &= ~LRU_GEN_MASK; + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + } while (new_flags != old_flags && + cmpxchg(&page->flags, old_flags, new_flags) != old_flags); + +@@ -3296,6 +3403,7 @@ static void page_inc_gen(struct page *pa + + new_flags &= ~LRU_GEN_MASK; + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + /* for end_page_writeback() */ + if (reclaiming) + new_flags |= BIT(PG_reclaim); +@@ -3787,6 +3895,7 @@ static bool inc_min_seq(struct lruvec *l + } + } + ++ reset_ctrl_pos(lruvec, gen, type); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); + + return true; +@@ -3824,6 +3933,8 @@ next: + if (min_seq[type] == lrugen->min_seq[type]) + continue; + ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ reset_ctrl_pos(lruvec, gen, type); + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); + success = true; + } +@@ -3885,6 +3996,9 @@ static void inc_max_seq(struct lruvec *l + } + } + ++ for (type = 0; type < ANON_AND_FILE; type++) ++ reset_ctrl_pos(lruvec, gen, type); ++ + WRITE_ONCE(lrugen->timestamps[gen], jiffies); + /* make sure all preceding modifications appear first */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); +@@ -4166,6 +4280,433 @@ void lru_gen_look_around(struct page_vma + } + + /****************************************************************************** ++ * the eviction ++ ******************************************************************************/ ++ ++static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx) ++{ ++ bool success; ++ int gen = page_lru_gen(page); ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int tier = page_lru_tier(page); ++ int delta = thp_nr_pages(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page); ++ ++ /* an mlocked page? */ ++ if (!page_evictable(page)) { ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ SetPageUnevictable(page); ++ add_page_to_lru_list(page, lruvec); ++ __count_vm_events(UNEVICTABLE_PGCULLED, delta); ++ return true; ++ } ++ ++ /* a lazy-free page that has been written into? */ ++ if (type && PageDirty(page) && PageAnon(page)) { ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ SetPageSwapBacked(page); ++ add_page_to_lru_list_tail(page, lruvec); ++ return true; ++ } ++ ++ /* page_update_gen() has updated this page? */ ++ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { ++ list_move(&page->lru, &lrugen->lists[gen][type][zone]); ++ return true; ++ } ++ ++ /* protect this page if its tier has a higher refaulted % */ ++ if (tier > tier_idx) { ++ int hist = lru_hist_from_seq(gen); ++ ++ page_inc_gen(page, lruvec, false); ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], ++ lrugen->protected[hist][type][tier - 1] + delta); ++ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); ++ return true; ++ } ++ ++ /* mark this page for reclaim if it's pending writeback */ ++ if (PageWriteback(page) || (type && PageDirty(page))) { ++ page_inc_gen(page, lruvec, true); ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc) ++{ ++ bool success; ++ ++ if (!sc->may_unmap && page_mapped(page)) ++ return false; ++ ++ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && ++ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page)))) ++ return false; ++ ++ if (!get_page_unless_zero(page)) ++ return false; ++ ++ if (!TestClearPageLRU(page)) { ++ put_page(page); ++ return false; ++ } ++ ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ ++ return true; ++} ++ ++static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, ++ int type, int tier, struct list_head *list) ++{ ++ int gen, zone; ++ enum vm_event_item item; ++ int sorted = 0; ++ int scanned = 0; ++ int isolated = 0; ++ int remaining = MAX_BATCH_SIZE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ VM_BUG_ON(!list_empty(list)); ++ ++ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) ++ return 0; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = sc->reclaim_idx; zone >= 0; zone--) { ++ LIST_HEAD(moved); ++ int skipped = 0; ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ int delta = thp_nr_pages(page); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ scanned += delta; ++ ++ if (sort_page(page, lruvec, tier)) ++ sorted += delta; ++ else if (isolate_page(page, lruvec, sc)) { ++ list_add(&page->lru, list); ++ isolated += delta; ++ } else { ++ list_move(&page->lru, &moved); ++ skipped += delta; ++ } ++ ++ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE) ++ break; ++ } ++ ++ if (skipped) { ++ list_splice(&moved, head); ++ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); ++ } ++ ++ if (!remaining || isolated >= MIN_BATCH_SIZE) ++ break; ++ } ++ ++ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; ++ if (!cgroup_reclaim(sc)) { ++ __count_vm_events(item, isolated); ++ __count_vm_events(PGREFILL, sorted); ++ } ++ __count_memcg_events(memcg, item, isolated); ++ __count_memcg_events(memcg, PGREFILL, sorted); ++ __count_vm_events(PGSCAN_ANON + type, isolated); ++ ++ /* ++ * We may have trouble finding eligible pages due to reclaim_idx, ++ * may_unmap and may_writepage. Check `remaining` to make sure we won't ++ * be stuck if we aren't making enough progress. ++ */ ++ return isolated || !remaining ? scanned : 0; ++} ++ ++static int get_tier_idx(struct lruvec *lruvec, int type) ++{ ++ int tier; ++ struct ctrl_pos sp, pv; ++ ++ /* ++ * Ideally we don't want to evict upper tiers that have higher refaulted ++ * %. However, we need to leave a margin for the fluctuation in ++ * refaulted %. So we use a larger gain factor to make sure upper tiers ++ * are indeed more active. We choose 2 because the lowest upper tier ++ * would have twice of refaulted % of the base tier, according to their ++ * numbers of accesses. ++ */ ++ read_ctrl_pos(lruvec, type, 0, 1, &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, 2, &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ return tier - 1; ++} ++ ++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) ++{ ++ int type, tier; ++ struct ctrl_pos sp, pv; ++ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; ++ ++ /* ++ * Compare refaulted % between the base tiers of anon and file to ++ * determine which type to evict. Also need to compare refaulted % of ++ * the upper tiers of the selected type with that of the base tier of ++ * the other type to determine which tier of the selected type to evict. ++ */ ++ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp); ++ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv); ++ type = positive_ctrl_err(&sp, &pv); ++ ++ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ *tier_idx = tier - 1; ++ ++ return type; ++} ++ ++static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ int *type_scanned, struct list_head *list) ++{ ++ int i; ++ int type; ++ int scanned; ++ int tier = -1; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ /* ++ * Try to select a type based on generations and swappiness, and if that ++ * fails, fall back to get_type_to_scan(). When anon and file are both ++ * available from the same generation, swappiness 200 is interpreted as ++ * anon first and swappiness 1 is interpreted as file first. ++ */ ++ if (!swappiness) ++ type = 1; ++ else if (min_seq[0] < min_seq[1]) ++ type = 0; ++ else if (swappiness == 1) ++ type = 1; ++ else if (swappiness == 200) ++ type = 0; ++ else ++ type = get_type_to_scan(lruvec, swappiness, &tier); ++ ++ for (i = !swappiness; i < ANON_AND_FILE; i++) { ++ if (tier < 0) ++ tier = get_tier_idx(lruvec, type); ++ ++ scanned = scan_pages(lruvec, sc, type, tier, list); ++ if (scanned) ++ break; ++ ++ type = !type; ++ tier = -1; ++ } ++ ++ *type_scanned = type; ++ ++ return scanned; ++} ++ ++/* Main function used by the foreground, the background and the user-triggered eviction. */ ++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ int type; ++ int scanned; ++ int reclaimed; ++ LIST_HEAD(list); ++ struct page *page; ++ enum vm_event_item item; ++ struct reclaim_stat stat; ++ struct mm_walk_args *args; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); ++ ++ if (try_to_inc_min_seq(lruvec, swappiness)) ++ scanned++; ++ ++ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS) ++ scanned = 0; ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ if (list_empty(&list)) ++ return scanned; ++ ++ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); ++ /* ++ * We need to prevent rejected pages from being added back to the same ++ * lists they were isolated from. Otherwise we may risk looping on them ++ * forever. ++ */ ++ list_for_each_entry(page, &list, lru) { ++ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page))) ++ SetPageActive(page); ++ ++ ClearPageReferenced(page); ++ ClearPageWorkingset(page); ++ } ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ move_pages_to_lru(lruvec, &list); ++ ++ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; ++ if (args && args->batch_size) ++ reset_batch_size(lruvec, args); ++ ++ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; ++ if (!cgroup_reclaim(sc)) ++ __count_vm_events(item, reclaimed); ++ __count_memcg_events(memcg, item, reclaimed); ++ __count_vm_events(PGSTEAL_ANON + type, reclaimed); ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ mem_cgroup_uncharge_list(&list); ++ free_unref_page_list(&list); ++ ++ sc->nr_reclaimed += reclaimed; ++ ++ return scanned; ++} ++ ++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ bool low; ++ long nr_to_scan; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ int priority = sc->priority; ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (mem_cgroup_below_min(memcg) || ++ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) ++ return 0; ++ ++ if (sc->nr_reclaimed >= sc->nr_to_reclaim) { ++ priority = DEF_PRIORITY; ++ sc->force_deactivate = 0; ++ } ++ ++ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); ++ if (!nr_to_scan) ++ return 0; ++ ++ nr_to_scan >>= priority; ++ ++ if (!mem_cgroup_online(memcg)) ++ nr_to_scan++; ++ ++ if (!nr_to_scan) ++ return 0; ++ ++ if (current_is_kswapd()) { ++ /* leave the work to lru_gen_age_node() */ ++ if (max_seq - min_seq[1] < MIN_NR_GENS) ++ return 0; ++ ++ if (!low) ++ sc->force_deactivate = 0; ++ ++ return nr_to_scan; ++ } ++ ++ if (max_seq - min_seq[1] >= MIN_NR_GENS) ++ return nr_to_scan; ++ ++ /* move onto slab and other memcgs if we haven't tried them all */ ++ if (!sc->force_deactivate) { ++ sc->skipped_deactivate = 1; ++ return 0; ++ } ++ ++ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0; ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ long scanned = 0; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ lru_add_drain(); ++ ++ if (current_is_kswapd()) ++ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; ++ ++ blk_start_plug(&plug); ++ ++ while (true) { ++ int delta; ++ int swappiness; ++ long nr_to_scan; ++ ++ if (sc->may_swap) ++ swappiness = get_swappiness(memcg); ++ else if (!cgroup_reclaim(sc) && get_swappiness(memcg)) ++ swappiness = 1; ++ else ++ swappiness = 0; ++ ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); ++ if (!nr_to_scan) ++ break; ++ ++ delta = evict_pages(lruvec, sc, swappiness); ++ if (!delta) ++ break; ++ ++ scanned += delta; ++ if (scanned >= nr_to_scan) ++ break; ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++ ++ if (current_is_kswapd()) ++ current->reclaim_state->mm_walk_args = NULL; ++} ++ ++/****************************************************************************** + * state change + ******************************************************************************/ + +@@ -4420,6 +4961,10 @@ static void lru_gen_age_node(struct pgli + { + } + ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -4433,6 +4978,11 @@ static void shrink_lruvec(struct lruvec + struct blk_plug plug; + bool scan_adjusted; + ++ if (lru_gen_enabled()) { ++ lru_gen_shrink_lruvec(lruvec, sc); ++ return; ++ } ++ + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ +@@ -4906,6 +5456,9 @@ static void snapshot_refaults(struct mem + struct lruvec *target_lruvec; + unsigned long refaults; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[0] = refaults; +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_ + static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) + { +- eviction >>= bucket_order; + eviction &= EVICTION_MASK; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; +@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow, + + *memcgidp = memcgid; + *pgdat = NODE_DATA(nid); +- *evictionp = entry << bucket_order; ++ *evictionp = entry; + *workingsetp = workingset; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static int page_lru_refs(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); ++ ++ /* see the comment on MAX_NR_TIERS */ ++ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0; ++} ++ ++/* Return a token to be stored in the shadow entry of a page being evicted. */ ++static void *lru_gen_eviction(struct page *page) ++{ ++ int hist, tier; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ int type = page_is_file_lru(page); ++ int refs = page_lru_refs(page); ++ int delta = thp_nr_pages(page); ++ bool workingset = PageWorkingset(page); ++ struct mem_cgroup *memcg = page_memcg(page); ++ struct pglist_data *pgdat = page_pgdat(page); ++ ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ token = (min_seq << LRU_REFS_WIDTH) | refs; ++ ++ hist = lru_hist_from_seq(min_seq); ++ tier = lru_tier_from_refs(refs + workingset); ++ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); ++ ++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); ++} ++ ++/* Count a refaulted page based on the token stored in its shadow entry. */ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++ int hist, tier, refs; ++ int memcg_id; ++ bool workingset; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ struct mem_cgroup *memcg; ++ struct pglist_data *pgdat; ++ int type = page_is_file_lru(page); ++ int delta = thp_nr_pages(page); ++ ++ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); ++ if (page_pgdat(page) != pgdat) ++ return; ++ ++ rcu_read_lock(); ++ memcg = page_memcg_rcu(page); ++ if (mem_cgroup_id(memcg) != memcg_id) ++ goto unlock; ++ ++ refs = token & (BIT(LRU_REFS_WIDTH) - 1); ++ if (refs && !workingset) ++ goto unlock; ++ ++ token >>= LRU_REFS_WIDTH; ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) ++ goto unlock; ++ ++ hist = lru_hist_from_seq(min_seq); ++ tier = lru_tier_from_refs(refs + workingset); ++ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); ++ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); ++ ++ /* ++ * Tiers don't offer any protection to pages accessed via page tables. ++ * That's what generations do. Tiers can't fully protect pages after ++ * their numbers of accesses has exceeded the max value. Conservatively ++ * count these two conditions as stalls even though they might not ++ * indicate any real memory pressure. ++ */ ++ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) { ++ SetPageWorkingset(page); ++ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); ++ } ++unlock: ++ rcu_read_unlock(); ++} ++ ++#else ++ ++static void *lru_gen_eviction(struct page *page) ++{ ++ return NULL; ++} ++ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + /** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @lruvec: the lruvec that was aged +@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + ++ if (lru_gen_enabled()) ++ return lru_gen_eviction(page); ++ + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->nonresident_age); ++ eviction >>= bucket_order; + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); + } +@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag + bool workingset; + int memcgid; + ++ if (lru_gen_enabled()) { ++ lru_gen_refault(page, shadow); ++ return; ++ } ++ + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); ++ eviction <<= bucket_order; + + rcu_read_lock(); + /* |