diff options
Diffstat (limited to 'target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch')
-rw-r--r-- | target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch | 1002 |
1 files changed, 0 insertions, 1002 deletions
diff --git a/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch b/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch deleted file mode 100644 index a75fedecaa..0000000000 --- a/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch +++ /dev/null @@ -1,1002 +0,0 @@ -From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001 -From: Yu Zhao <yuzhao@google.com> -Date: Tue, 29 Jun 2021 20:46:47 -0600 -Subject: [PATCH 07/10] mm: multigenerational lru: eviction - -The eviction consumes old generations. Given an lruvec, the eviction -scans pages on lrugen->lists indexed by anon and file min_seq[] -(modulo MAX_NR_GENS). It first tries to select a type based on the -values of min_seq[]. If they are equal, it selects the type that has -a lower refaulted %. The eviction sorts a page according to its -updated generation number if the aging has found this page accessed. -It also moves a page to the next generation if this page is from an -upper tier that has a higher refaulted % than the base tier. The -eviction increments min_seq[] of a selected type when it finds -lrugen->lists indexed by min_seq[] of this selected type are empty. - -Each generation is divided into multiple tiers. Tiers represent -different ranges of numbers of accesses from file descriptors only. -Pages accessed N times via file descriptors belong to tier -order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers, -and they require additional MAX_NR_TIERS-2 bits in page->flags. In -contrast to moving between generations which requires list operations, -moving between tiers only involves operations on page->flags and -therefore has a negligible cost. A feedback loop modeled after the PID -controller monitors refaulted % across all tiers and decides when to -protect pages from which tiers. 
- -Unmapped pages are initially added to the oldest generation and then -conditionally protected by tiers. Each tier keeps track of how many -pages from it have refaulted. Tier 0 is the base tier and pages from -it are evicted unconditionally because there are no better candidates. -Pages from an upper tier are either evicted or moved to the next -generation, depending on whether this upper tier has a higher -refaulted % than the base tier. This model has the following -advantages: - 1) It removes the cost in the buffered access path and reduces the - overall cost of protection because pages are conditionally protected - in the reclaim path. - 2) It takes mapped pages into account and avoids overprotecting - pages accessed multiple times via file descriptors. - 3) Additional tiers improve the protection of pages accessed more - than twice. - -Signed-off-by: Yu Zhao <yuzhao@google.com> -Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> -Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac ---- - include/linux/mm_inline.h | 10 + - include/linux/mmzone.h | 33 +++ - mm/swap.c | 42 +++ - mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++- - mm/workingset.c | 120 ++++++++- - 5 files changed, 757 insertions(+), 3 deletions(-) - ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi - return seq % NR_HIST_GENS; - } - -+/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */ -+static inline int lru_tier_from_refs(int refs) -+{ -+ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH)); -+ -+ return order_base_2(refs + 1); -+} -+ - /* The youngest and the second youngest generations are counted as active. 
*/ - static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) - { -@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru - gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; - - new_flags &= ~LRU_GEN_MASK; -+ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS) -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); - /* for shrink_page_list() */ - if (reclaiming) - new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -319,6 +319,30 @@ struct page_vma_mapped_walk; - #define MIN_NR_GENS 2 - #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) - -+/* -+ * Each generation is divided into multiple tiers. Tiers represent different -+ * ranges of numbers of accesses from file descriptors, i.e., -+ * mark_page_accessed(). In contrast to moving between generations which -+ * requires the lru lock, moving between tiers only involves an atomic -+ * operation on page->flags and therefore has a negligible cost. -+ * -+ * The purposes of tiers are to: -+ * 1) estimate whether pages accessed multiple times via file descriptors are -+ * more active than pages accessed only via page tables by separating the two -+ * access types into upper tiers and the base tier, and comparing refaulted % -+ * across all tiers. -+ * 2) improve buffered io performance by deferring the protection of pages -+ * accessed multiple times until the eviction. That is the protection happens -+ * in the reclaim path, not the access path. -+ * -+ * Pages accessed N times via file descriptors belong to tier order_base_2(N). -+ * The base tier may be marked by PageReferenced(). All upper tiers are marked -+ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are -+ * used to support more than one upper tier. -+ */ -+#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) -+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) -+ - /* Whether to keep stats for historical generations. 
*/ - #ifdef CONFIG_LRU_GEN_STATS - #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) -@@ -337,6 +361,15 @@ struct lrugen { - struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; - /* the sizes of the multigenerational lru lists in pages */ - unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* the exponential moving average of refaulted */ -+ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; -+ /* the exponential moving average of protected+evicted */ -+ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; -+ /* the base tier isn't protected, hence the minus one */ -+ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; -+ /* incremented without holding the lru lock */ -+ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; -+ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; - /* whether the multigenerational lru is enabled */ - bool enabled[ANON_AND_FILE]; - }; ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st - local_unlock(&lru_pvecs.lock); - } - -+#ifdef CONFIG_LRU_GEN -+static void page_inc_refs(struct page *page) -+{ -+ unsigned long refs; -+ unsigned long old_flags, new_flags; -+ -+ if (PageUnevictable(page)) -+ return; -+ -+ /* see the comment on MAX_NR_TIERS */ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ -+ if (!(new_flags & BIT(PG_referenced))) { -+ new_flags |= BIT(PG_referenced); -+ continue; -+ } -+ -+ if (!(new_flags & BIT(PG_workingset))) { -+ new_flags |= BIT(PG_workingset); -+ continue; -+ } -+ -+ refs = new_flags & LRU_REFS_MASK; -+ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK); -+ -+ new_flags &= ~LRU_REFS_MASK; -+ new_flags |= refs; -+ } while (new_flags != old_flags && -+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+} -+#else -+static void page_inc_refs(struct page *page) -+{ -+} -+#endif /* CONFIG_LRU_GEN */ -+ - /* - * Mark a page as having seen activity. 
- * -@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag - { - page = compound_head(page); - -+ if (lru_gen_enabled()) { -+ page_inc_refs(page); -+ return; -+ } -+ - if (!PageReferenced(page)) { - SetPageReferenced(page); - } else if (PageUnevictable(page)) { ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre - - if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page_private(page) }; -- mem_cgroup_swapout(page, swap); -+ -+ /* get a shadow entry before page_memcg() is cleared */ - if (reclaimed && !mapping_exiting(mapping)) - shadow = workingset_eviction(page, target_memcg); -+ mem_cgroup_swapout(page, swap); - __delete_from_swap_cache(page, swap, shadow); - xa_unlock_irq(&mapping->i_pages); - put_swap_page(page, swap); -@@ -1410,6 +1412,11 @@ retry: - if (!sc->may_unmap && page_mapped(page)) - goto keep_locked; - -+ /* lru_gen_look_around() has updated this page? */ -+ if (lru_gen_enabled() && !ignore_references && -+ page_mapped(page) && PageReferenced(page)) -+ goto keep_locked; -+ - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); - -@@ -2505,6 +2512,9 @@ static void prepare_scan_count(pg_data_t - unsigned long file; - struct lruvec *target_lruvec; - -+ if (lru_gen_enabled()) -+ return; -+ - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - - /* -@@ -2845,6 +2855,17 @@ static int page_lru_gen(struct page *pag - return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; - } - -+static int page_lru_tier(struct page *page) -+{ -+ int refs; -+ unsigned long flags = READ_ONCE(page->flags); -+ -+ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ? -+ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0; -+ -+ return lru_tier_from_refs(refs); -+} -+ - static int get_swappiness(struct mem_cgroup *memcg) - { - return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? 
-@@ -3181,6 +3202,91 @@ done: - } - - /****************************************************************************** -+ * refault feedback loop -+ ******************************************************************************/ -+ -+/* -+ * A feedback loop modeled after the PID controller. Currently supports the -+ * proportional (P) and the integral (I) terms; the derivative (D) term can be -+ * added if necessary. The setpoint (SP) is the desired position; the process -+ * variable (PV) is the measured position. The error is the difference between -+ * the SP and the PV. A positive error results in a positive control output -+ * correction, which, in our case, is to allow eviction. -+ * -+ * The P term is refaulted % of the current generation being evicted. The I -+ * term is the exponential moving average of refaulted % of previously evicted -+ * generations, using the smoothing factor 1/2. -+ * -+ * Our goal is to maintain proportional refaulted % across all tiers. -+ */ -+struct ctrl_pos { -+ unsigned long refaulted; -+ unsigned long total; -+ int gain; -+}; -+ -+static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, -+ struct ctrl_pos *pos) -+{ -+ struct lrugen *lrugen = &lruvec->evictable; -+ int hist = lru_hist_from_seq(lrugen->min_seq[type]); -+ -+ pos->refaulted = lrugen->avg_refaulted[type][tier] + -+ atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ pos->total = lrugen->avg_total[type][tier] + -+ atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ pos->total += lrugen->protected[hist][type][tier - 1]; -+ pos->gain = gain; -+} -+ -+static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type) -+{ -+ int tier; -+ int hist = lru_hist_from_seq(gen); -+ struct lrugen *lrugen = &lruvec->evictable; -+ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); -+ bool clear = carryover ? 
NR_HIST_GENS == 1 : NR_HIST_GENS > 1; -+ -+ if (!carryover && !clear) -+ return; -+ -+ for (tier = 0; tier < MAX_NR_TIERS; tier++) { -+ if (carryover) { -+ unsigned long sum; -+ -+ sum = lrugen->avg_refaulted[type][tier] + -+ atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); -+ -+ sum = lrugen->avg_total[type][tier] + -+ atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ sum += lrugen->protected[hist][type][tier - 1]; -+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); -+ } -+ -+ if (clear) { -+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); -+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); -+ if (tier) -+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); -+ } -+ } -+} -+ -+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) -+{ -+ /* -+ * Allow eviction if the PV has a limited number of refaulted pages or a -+ * lower refaulted % than the SP. -+ */ -+ return pv->refaulted < MIN_BATCH_SIZE || -+ pv->refaulted * max(sp->total, 1UL) * sp->gain <= -+ sp->refaulted * max(pv->total, 1UL) * pv->gain; -+} -+ -+/****************************************************************************** - * the aging - ******************************************************************************/ - -@@ -3200,6 +3306,7 @@ static int page_update_gen(struct page * - - new_flags &= ~LRU_GEN_MASK; - new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); - } while (new_flags != old_flags && - cmpxchg(&page->flags, old_flags, new_flags) != old_flags); - -@@ -3231,6 +3338,7 @@ static void page_inc_gen(struct page *pa - - new_flags &= ~LRU_GEN_MASK; - new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); - /* for end_page_writeback() */ - if (reclaiming) - new_flags |= BIT(PG_reclaim); -@@ -3722,6 +3830,7 @@ static bool inc_min_seq(struct lruvec *l - } - } - -+ 
reset_ctrl_pos(lruvec, gen, type); - WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); - - return true; -@@ -3759,6 +3868,8 @@ next: - if (min_seq[type] == lrugen->min_seq[type]) - continue; - -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ reset_ctrl_pos(lruvec, gen, type); - WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); - success = true; - } -@@ -3820,6 +3931,9 @@ static void inc_max_seq(struct lruvec *l - } - } - -+ for (type = 0; type < ANON_AND_FILE; type++) -+ reset_ctrl_pos(lruvec, gen, type); -+ - WRITE_ONCE(lrugen->timestamps[gen], jiffies); - /* make sure all preceding modifications appear first */ - smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); -@@ -4101,6 +4215,433 @@ void lru_gen_look_around(struct page_vma - } - - /****************************************************************************** -+ * the eviction -+ ******************************************************************************/ -+ -+static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx) -+{ -+ bool success; -+ int gen = page_lru_gen(page); -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ int tier = page_lru_tier(page); -+ int delta = thp_nr_pages(page); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page); -+ -+ /* an mlocked page? */ -+ if (!page_evictable(page)) { -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ SetPageUnevictable(page); -+ add_page_to_lru_list(page, lruvec); -+ __count_vm_events(UNEVICTABLE_PGCULLED, delta); -+ return true; -+ } -+ -+ /* a lazy-free page that has been written into? */ -+ if (type && PageDirty(page) && PageAnon(page)) { -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ SetPageSwapBacked(page); -+ add_page_to_lru_list_tail(page, lruvec); -+ return true; -+ } -+ -+ /* page_update_gen() has updated this page? 
*/ -+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { -+ list_move(&page->lru, &lrugen->lists[gen][type][zone]); -+ return true; -+ } -+ -+ /* protect this page if its tier has a higher refaulted % */ -+ if (tier > tier_idx) { -+ int hist = lru_hist_from_seq(gen); -+ -+ page_inc_gen(page, lruvec, false); -+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], -+ lrugen->protected[hist][type][tier - 1] + delta); -+ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); -+ return true; -+ } -+ -+ /* mark this page for reclaim if it's pending writeback */ -+ if (PageWriteback(page) || (type && PageDirty(page))) { -+ page_inc_gen(page, lruvec, true); -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc) -+{ -+ bool success; -+ -+ if (!sc->may_unmap && page_mapped(page)) -+ return false; -+ -+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && -+ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page)))) -+ return false; -+ -+ if (!get_page_unless_zero(page)) -+ return false; -+ -+ if (!TestClearPageLRU(page)) { -+ put_page(page); -+ return false; -+ } -+ -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ -+ return true; -+} -+ -+static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, -+ int type, int tier, struct list_head *list) -+{ -+ int gen, zone; -+ enum vm_event_item item; -+ int sorted = 0; -+ int scanned = 0; -+ int isolated = 0; -+ int remaining = MAX_BATCH_SIZE; -+ struct lrugen *lrugen = &lruvec->evictable; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ -+ VM_BUG_ON(!list_empty(list)); -+ -+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) -+ return 0; -+ -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ for (zone = sc->reclaim_idx; zone >= 0; zone--) { -+ LIST_HEAD(moved); -+ int skipped = 0; -+ struct list_head *head = &lrugen->lists[gen][type][zone]; -+ -+ while 
(!list_empty(head)) { -+ struct page *page = lru_to_page(head); -+ int delta = thp_nr_pages(page); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ scanned += delta; -+ -+ if (sort_page(page, lruvec, tier)) -+ sorted += delta; -+ else if (isolate_page(page, lruvec, sc)) { -+ list_add(&page->lru, list); -+ isolated += delta; -+ } else { -+ list_move(&page->lru, &moved); -+ skipped += delta; -+ } -+ -+ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE) -+ break; -+ } -+ -+ if (skipped) { -+ list_splice(&moved, head); -+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); -+ } -+ -+ if (!remaining || isolated >= MIN_BATCH_SIZE) -+ break; -+ } -+ -+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; -+ if (!cgroup_reclaim(sc)) { -+ __count_vm_events(item, isolated); -+ __count_vm_events(PGREFILL, sorted); -+ } -+ __count_memcg_events(memcg, item, isolated); -+ __count_memcg_events(memcg, PGREFILL, sorted); -+ __count_vm_events(PGSCAN_ANON + type, isolated); -+ -+ /* -+ * We may have trouble finding eligible pages due to reclaim_idx, -+ * may_unmap and may_writepage. Check `remaining` to make sure we won't -+ * be stuck if we aren't making enough progress. -+ */ -+ return isolated || !remaining ? scanned : 0; -+} -+ -+static int get_tier_idx(struct lruvec *lruvec, int type) -+{ -+ int tier; -+ struct ctrl_pos sp, pv; -+ -+ /* -+ * Ideally we don't want to evict upper tiers that have higher refaulted -+ * %. However, we need to leave a margin for the fluctuation in -+ * refaulted %. So we use a larger gain factor to make sure upper tiers -+ * are indeed more active. We choose 2 because the lowest upper tier -+ * would have twice of refaulted % of the base tier, according to their -+ * numbers of accesses. 
-+ */ -+ read_ctrl_pos(lruvec, type, 0, 1, &sp); -+ for (tier = 1; tier < MAX_NR_TIERS; tier++) { -+ read_ctrl_pos(lruvec, type, tier, 2, &pv); -+ if (!positive_ctrl_err(&sp, &pv)) -+ break; -+ } -+ -+ return tier - 1; -+} -+ -+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) -+{ -+ int type, tier; -+ struct ctrl_pos sp, pv; -+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; -+ -+ /* -+ * Compare refaulted % between the base tiers of anon and file to -+ * determine which type to evict. Also need to compare refaulted % of -+ * the upper tiers of the selected type with that of the base tier of -+ * the other type to determine which tier of the selected type to evict. -+ */ -+ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp); -+ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv); -+ type = positive_ctrl_err(&sp, &pv); -+ -+ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); -+ for (tier = 1; tier < MAX_NR_TIERS; tier++) { -+ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); -+ if (!positive_ctrl_err(&sp, &pv)) -+ break; -+ } -+ -+ *tier_idx = tier - 1; -+ -+ return type; -+} -+ -+static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ int *type_scanned, struct list_head *list) -+{ -+ int i; -+ int type; -+ int scanned; -+ int tier = -1; -+ DEFINE_MIN_SEQ(lruvec); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ /* -+ * Try to select a type based on generations and swappiness, and if that -+ * fails, fall back to get_type_to_scan(). When anon and file are both -+ * available from the same generation, swappiness 200 is interpreted as -+ * anon first and swappiness 1 is interpreted as file first. 
-+ */ -+ if (!swappiness) -+ type = 1; -+ else if (min_seq[0] < min_seq[1]) -+ type = 0; -+ else if (swappiness == 1) -+ type = 1; -+ else if (swappiness == 200) -+ type = 0; -+ else -+ type = get_type_to_scan(lruvec, swappiness, &tier); -+ -+ for (i = !swappiness; i < ANON_AND_FILE; i++) { -+ if (tier < 0) -+ tier = get_tier_idx(lruvec, type); -+ -+ scanned = scan_pages(lruvec, sc, type, tier, list); -+ if (scanned) -+ break; -+ -+ type = !type; -+ tier = -1; -+ } -+ -+ *type_scanned = type; -+ -+ return scanned; -+} -+ -+/* Main function used by the foreground, the background and the user-triggered eviction. */ -+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) -+{ -+ int type; -+ int scanned; -+ int reclaimed; -+ LIST_HEAD(list); -+ struct page *page; -+ enum vm_event_item item; -+ struct reclaim_stat stat; -+ struct mm_walk_args *args; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); -+ -+ if (try_to_inc_min_seq(lruvec, swappiness)) -+ scanned++; -+ -+ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS) -+ scanned = 0; -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ -+ if (list_empty(&list)) -+ return scanned; -+ -+ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); -+ /* -+ * We need to prevent rejected pages from being added back to the same -+ * lists they were isolated from. Otherwise we may risk looping on them -+ * forever. -+ */ -+ list_for_each_entry(page, &list, lru) { -+ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page))) -+ SetPageActive(page); -+ -+ ClearPageReferenced(page); -+ ClearPageWorkingset(page); -+ } -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ move_pages_to_lru(lruvec, &list); -+ -+ args = current->reclaim_state ? 
current->reclaim_state->mm_walk_args : NULL; -+ if (args && args->batch_size) -+ reset_batch_size(lruvec, args); -+ -+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; -+ if (!cgroup_reclaim(sc)) -+ __count_vm_events(item, reclaimed); -+ __count_memcg_events(memcg, item, reclaimed); -+ __count_vm_events(PGSTEAL_ANON + type, reclaimed); -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ -+ mem_cgroup_uncharge_list(&list); -+ free_unref_page_list(&list); -+ -+ sc->nr_reclaimed += reclaimed; -+ -+ return scanned; -+} -+ -+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) -+{ -+ bool low; -+ long nr_to_scan; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ int priority = sc->priority; -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (mem_cgroup_below_min(memcg) || -+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) -+ return 0; -+ -+ if (sc->nr_reclaimed >= sc->nr_to_reclaim) { -+ priority = DEF_PRIORITY; -+ sc->force_deactivate = 0; -+ } -+ -+ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); -+ if (!nr_to_scan) -+ return 0; -+ -+ nr_to_scan >>= priority; -+ -+ if (!mem_cgroup_online(memcg)) -+ nr_to_scan++; -+ -+ if (!nr_to_scan) -+ return 0; -+ -+ if (current_is_kswapd()) { -+ /* leave the work to lru_gen_age_node() */ -+ if (max_seq - min_seq[1] < MIN_NR_GENS) -+ return 0; -+ -+ if (!low) -+ sc->force_deactivate = 0; -+ -+ return nr_to_scan; -+ } -+ -+ if (max_seq - min_seq[1] >= MIN_NR_GENS) -+ return nr_to_scan; -+ -+ /* move onto slab and other memcgs if we haven't tried them all */ -+ if (!sc->force_deactivate) { -+ sc->skipped_deactivate = 1; -+ return 0; -+ } -+ -+ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? 
nr_to_scan : 0; -+} -+ -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+ struct blk_plug plug; -+ long scanned = 0; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ lru_add_drain(); -+ -+ if (current_is_kswapd()) -+ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; -+ -+ blk_start_plug(&plug); -+ -+ while (true) { -+ int delta; -+ int swappiness; -+ long nr_to_scan; -+ -+ if (sc->may_swap) -+ swappiness = get_swappiness(memcg); -+ else if (!cgroup_reclaim(sc) && get_swappiness(memcg)) -+ swappiness = 1; -+ else -+ swappiness = 0; -+ -+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); -+ if (!nr_to_scan) -+ break; -+ -+ delta = evict_pages(lruvec, sc, swappiness); -+ if (!delta) -+ break; -+ -+ scanned += delta; -+ if (scanned >= nr_to_scan) -+ break; -+ -+ cond_resched(); -+ } -+ -+ blk_finish_plug(&plug); -+ -+ if (current_is_kswapd()) -+ current->reclaim_state->mm_walk_args = NULL; -+} -+ -+/****************************************************************************** - * state change - ******************************************************************************/ - -@@ -4355,6 +4896,10 @@ static void lru_gen_age_node(struct pgli - { - } - -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+} -+ - #endif /* CONFIG_LRU_GEN */ - - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -@@ -4368,6 +4913,11 @@ static void shrink_lruvec(struct lruvec - bool proportional_reclaim; - struct blk_plug plug; - -+ if (lru_gen_enabled()) { -+ lru_gen_shrink_lruvec(lruvec, sc); -+ return; -+ } -+ - get_scan_count(lruvec, sc, nr); - - /* Record the original scan target for proportional adjustments later */ -@@ -4839,6 +5389,9 @@ static void snapshot_refaults(struct mem - struct lruvec *target_lruvec; - unsigned long refaults; - -+ if (lru_gen_enabled()) -+ return; -+ - target_lruvec = 
mem_cgroup_lruvec(target_memcg, pgdat); - refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); - target_lruvec->refaults[0] = refaults; ---- a/mm/workingset.c -+++ b/mm/workingset.c -@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_ - static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, - bool workingset) - { -- eviction >>= bucket_order; - eviction &= EVICTION_MASK; - eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; - eviction = (eviction << NODES_SHIFT) | pgdat->node_id; -@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow, - - *memcgidp = memcgid; - *pgdat = NODE_DATA(nid); -- *evictionp = entry << bucket_order; -+ *evictionp = entry; - *workingsetp = workingset; - } - -+#ifdef CONFIG_LRU_GEN -+ -+static int page_lru_refs(struct page *page) -+{ -+ unsigned long flags = READ_ONCE(page->flags); -+ -+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); -+ -+ /* see the comment on MAX_NR_TIERS */ -+ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0; -+} -+ -+/* Return a token to be stored in the shadow entry of a page being evicted. 
*/ -+static void *lru_gen_eviction(struct page *page) -+{ -+ int hist, tier; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lrugen *lrugen; -+ int type = page_is_file_lru(page); -+ int refs = page_lru_refs(page); -+ int delta = thp_nr_pages(page); -+ bool workingset = PageWorkingset(page); -+ struct mem_cgroup *memcg = page_memcg(page); -+ struct pglist_data *pgdat = page_pgdat(page); -+ -+ lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->evictable; -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ token = (min_seq << LRU_REFS_WIDTH) | refs; -+ -+ hist = lru_hist_from_seq(min_seq); -+ tier = lru_tier_from_refs(refs + workingset); -+ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); -+ -+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); -+} -+ -+/* Count a refaulted page based on the token stored in its shadow entry. */ -+static void lru_gen_refault(struct page *page, void *shadow) -+{ -+ int hist, tier, refs; -+ int memcg_id; -+ bool workingset; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lrugen *lrugen; -+ struct mem_cgroup *memcg; -+ struct pglist_data *pgdat; -+ int type = page_is_file_lru(page); -+ int delta = thp_nr_pages(page); -+ -+ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); -+ if (page_pgdat(page) != pgdat) -+ return; -+ -+ rcu_read_lock(); -+ memcg = page_memcg_rcu(page); -+ if (mem_cgroup_id(memcg) != memcg_id) -+ goto unlock; -+ -+ refs = token & (BIT(LRU_REFS_WIDTH) - 1); -+ if (refs && !workingset) -+ goto unlock; -+ -+ token >>= LRU_REFS_WIDTH; -+ lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->evictable; -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) -+ goto unlock; -+ -+ hist = lru_hist_from_seq(min_seq); -+ tier = lru_tier_from_refs(refs + workingset); -+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); -+ 
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); -+ -+ /* -+ * Tiers don't offer any protection to pages accessed via page tables. -+ * That's what generations do. Tiers can't fully protect pages after -+ * their numbers of accesses have exceeded the max value. Conservatively -+ * count these two conditions as stalls even though they might not -+ * indicate any real memory pressure. -+ */ -+ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) { -+ SetPageWorkingset(page); -+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); -+ } -+unlock: -+ rcu_read_unlock(); -+} -+ -+#else -+ -+static void *lru_gen_eviction(struct page *page) -+{ -+ return NULL; -+} -+ -+static void lru_gen_refault(struct page *page, void *shadow) -+{ -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - /** - * workingset_age_nonresident - age non-resident entries as LRU ages - * @lruvec: the lruvec that was aged -@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p - VM_BUG_ON_PAGE(page_count(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - -+ if (lru_gen_enabled()) -+ return lru_gen_eviction(page); -+ - lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - /* XXX: target_memcg can be NULL, go through lruvec */ - memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); - eviction = atomic_long_read(&lruvec->nonresident_age); -+ eviction >>= bucket_order; - workingset_age_nonresident(lruvec, thp_nr_pages(page)); - return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); - } -@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag - bool workingset; - int memcgid; - -+ if (lru_gen_enabled()) { -+ lru_gen_refault(page, shadow); -+ return; -+ } -+ - unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); -+ eviction <<= bucket_order; - - rcu_read_lock(); - /* |