Diffstat (limited to 'target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch')
-rw-r--r-- | target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch | 320
1 file changed, 320 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch b/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch
new file mode 100644
index 0000000000..e47bfc36d4
--- /dev/null
+++ b/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch
@@ -0,0 +1,320 @@
+From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <yuzhao@google.com>
+Date: Sun, 18 Sep 2022 02:00:06 -0600
+Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When multiple memcgs are available, it is possible to use generations as a
+frame of reference to make better choices and improve overall performance
+under global memory pressure. This patch adds a basic optimization to
+select memcgs that can drop single-use unmapped clean pages first. Doing
+so reduces the chance of going into the aging path or swapping, which can
+be costly.
+
+A typical example that benefits from this optimization is a server running
+mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
+buffered I/O workload in the other.
+
+Though this optimization can be applied to both kswapd and direct reclaim,
+it is only added to kswapd to keep the patchset manageable. Later
+improvements may cover the direct reclaim path.
+
+While ensuring certain fairness to all eligible memcgs, proportional scans
+of individual memcgs also require proper backoff to avoid overshooting
+their aggregate reclaim target by too much. Otherwise it can cause high
+direct reclaim latency. The conditions for backoff are:
+
+1. At low priorities, for direct reclaim, if aging fairness or direct
+   reclaim latency is at risk, i.e., aging one memcg multiple times or
+   swapping after the target is met.
+2. At high priorities, for global reclaim, if per-zone free pages are
+   above respective watermarks.
+
+Server benchmark results:
+  Mixed workloads:
+    fio (buffered I/O): +[19, 21]%
+                IOPS         BW
+      patch1-8: 1880k        7343MiB/s
+      patch1-9: 2252k        8796MiB/s
+
+    memcached (anon): +[119, 123]%
+                Ops/sec      KB/sec
+      patch1-8: 862768.65    33514.68
+      patch1-9: 1911022.12   74234.54
+
+  Mixed workloads:
+    fio (buffered I/O): +[75, 77]%
+                IOPS         BW
+      5.19-rc1: 1279k        4996MiB/s
+      patch1-9: 2252k        8796MiB/s
+
+    memcached (anon): +[13, 15]%
+                Ops/sec      KB/sec
+      5.19-rc1: 1673524.04   65008.87
+      patch1-9: 1911022.12   74234.54
+
+  Configurations:
+    (changes since patch 6)
+
+    cat mixed.sh
+    modprobe brd rd_nr=2 rd_size=56623104
+
+    swapoff -a
+    mkswap /dev/ram0
+    swapon /dev/ram0
+
+    mkfs.ext4 /dev/ram1
+    mount -t ext4 /dev/ram1 /mnt
+
+    memtier_benchmark -S /var/run/memcached/memcached.sock \
+      -P memcache_binary -n allkeys --key-minimum=1 \
+      --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
+      --ratio 1:0 --pipeline 8 -d 2000
+
+    fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
+      --buffered=1 --ioengine=io_uring --iodepth=128 \
+      --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
+      --rw=randread --random_distribution=random --norandommap \
+      --time_based --ramp_time=10m --runtime=90m --group_reporting &
+    pid=$!
+
+    sleep 200
+
+    memtier_benchmark -S /var/run/memcached/memcached.sock \
+      -P memcache_binary -n allkeys --key-minimum=1 \
+      --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
+      --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
+
+    kill -INT $pid
+    wait
+
+Client benchmark results:
+  no change (CONFIG_MEMCG=n)
+
+Link: https://lkml.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com
+Signed-off-by: Yu Zhao <yuzhao@google.com>
+Acked-by: Brian Geffon <bgeffon@google.com>
+Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
+Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
+Acked-by: Steven Barrett <steven@liquorix.net>
+Acked-by: Suleiman Souhlal <suleiman@google.com>
+Tested-by: Daniel Byrne <djbyrne@mtu.edu>
+Tested-by: Donald Carr <d@chaos-reins.com>
+Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
+Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
+Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
+Tested-by: Sofia Trinh <sofia.trinh@edi.works>
+Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+Cc: Barry Song <baohua@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Hillf Danton <hdanton@sina.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Jonathan Corbet <corbet@lwn.net>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Michael Larabel <Michael@MichaelLarabel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Mike Rapoport <rppt@kernel.org>
+Cc: Mike Rapoport <rppt@linux.ibm.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Qi Zheng <zhengqi.arch@bytedance.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+ mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-----
+ 1 file changed, 96 insertions(+), 9 deletions(-)
+
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index a7844c689522..b6f6fc2585e1 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -127,6 +127,12 @@ struct scan_control {
+ 	/* Always discard instead of demoting to lower tier memory */
+ 	unsigned int no_demotion:1;
+ 
++#ifdef CONFIG_LRU_GEN
++	/* help kswapd make better choices among multiple memcgs */
++	unsigned int memcgs_need_aging:1;
++	unsigned long last_reclaimed;
++#endif
++
+ 	/* Allocation order */
+ 	s8 order;
+ 
+@@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+ 
+ 	VM_WARN_ON_ONCE(!current_is_kswapd());
+ 
++	sc->last_reclaimed = sc->nr_reclaimed;
++
++	/*
++	 * To reduce the chance of going into the aging path, which can be
++	 * costly, optimistically skip it if the flag below was cleared in the
++	 * eviction path. This improves the overall performance when multiple
++	 * memcgs are available.
++	 */
++	if (!sc->memcgs_need_aging) {
++		sc->memcgs_need_aging = true;
++		return;
++	}
++
+ 	set_mm_walk(pgdat);
+ 
+ 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+@@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swa
+ 	return scanned;
+ }
+ 
+-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
++		       bool *need_swapping)
+ {
+ 	int type;
+ 	int scanned;
+@@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
+ 
+ 	sc->nr_reclaimed += reclaimed;
+ 
++	if (need_swapping && type == LRU_GEN_ANON)
++		*need_swapping = true;
++
+ 	return scanned;
+ }
+ 
+@@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
+  * reclaim.
+  */
+ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+-				    bool can_swap)
++				    bool can_swap, bool *need_aging)
+ {
+-	bool need_aging;
+ 	unsigned long nr_to_scan;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	DEFINE_MAX_SEQ(lruvec);
+@@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
+ 	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+ 		return 0;
+ 
+-	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+-	if (!need_aging)
++	*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
++	if (!*need_aging)
+ 		return nr_to_scan;
+ 
+ 	/* skip the aging path at the default priority */
+@@ -4715,10 +4737,68 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
+ 	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+ }
+ 
++static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
++			      struct scan_control *sc, bool need_swapping)
++{
++	int i;
++	DEFINE_MAX_SEQ(lruvec);
++
++	if (!current_is_kswapd()) {
++		/* age each memcg once to ensure fairness */
++		if (max_seq - seq > 1)
++			return true;
++
++		/* over-swapping can increase allocation latency */
++		if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
++			return true;
++
++		/* give this thread a chance to exit and free its memory */
++		if (fatal_signal_pending(current)) {
++			sc->nr_reclaimed += MIN_LRU_BATCH;
++			return true;
++		}
++
++		if (cgroup_reclaim(sc))
++			return false;
++	} else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
++		return false;
++
++	/* keep scanning at low priorities to ensure fairness */
++	if (sc->priority > DEF_PRIORITY - 2)
++		return false;
++
++	/*
++	 * A minimum amount of work was done under global memory pressure. For
++	 * kswapd, it may be overshooting. For direct reclaim, the target isn't
++	 * met, and yet the allocation may still succeed, since kswapd may have
++	 * caught up. In either case, it's better to stop now, and restart if
++	 * necessary.
++	 */
++	for (i = 0; i <= sc->reclaim_idx; i++) {
++		unsigned long wmark;
++		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
++
++		if (!managed_zone(zone))
++			continue;
++
++		wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
++		if (wmark > zone_page_state(zone, NR_FREE_PAGES))
++			return false;
++	}
++
++	sc->nr_reclaimed += MIN_LRU_BATCH;
++
++	return true;
++}
++
+ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ {
+ 	struct blk_plug plug;
++	bool need_aging = false;
++	bool need_swapping = false;
+ 	unsigned long scanned = 0;
++	unsigned long reclaimed = sc->nr_reclaimed;
++	DEFINE_MAX_SEQ(lruvec);
+ 
+ 	lru_add_drain();
+ 
+@@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
+ 		else
+ 			swappiness = 0;
+ 
+-		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
++		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
+ 		if (!nr_to_scan)
+-			break;
++			goto done;
+ 
+-		delta = evict_pages(lruvec, sc, swappiness);
++		delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
+ 		if (!delta)
+-			break;
++			goto done;
+ 
+ 		scanned += delta;
+ 		if (scanned >= nr_to_scan)
+ 			break;
+ 
++		if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
++			break;
++
+ 		cond_resched();
+ 	}
+ 
++	/* see the comment in lru_gen_age_node() */
++	if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
++		sc->memcgs_need_aging = false;
++done:
+ 	clear_mm_walk();
+ 
+ 	blk_finish_plug(&plug);
+-- 
+2.40.0
+
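
For readers of the backport, the core of the optimization is a one-bit handshake between kswapd's aging path and the eviction path: when an eviction pass meets its target from cold pages alone, the next aging walk is skipped once. The following is a minimal, userspace-compilable C sketch of that handshake; scan_control_model, MIN_LRU_BATCH_MODEL, and the *_model helpers are illustrative stand-ins, not the real mm/vmscan.c definitions.

#include <stdbool.h>

/* Mock stand-in for the fields this patch adds to struct scan_control. */
struct scan_control_model {
	bool memcgs_need_aging;		/* models the new 1-bit flag */
	unsigned long nr_reclaimed;
	unsigned long last_reclaimed;	/* models the new snapshot field */
};

#define MIN_LRU_BATCH_MODEL 64		/* stand-in for MIN_LRU_BATCH */

/* Aging side of the handshake (cf. lru_gen_age_node()). */
static void age_node_model(struct scan_control_model *sc)
{
	sc->last_reclaimed = sc->nr_reclaimed;

	/* Eviction cleared the flag: skip one aging walk and re-arm. */
	if (!sc->memcgs_need_aging) {
		sc->memcgs_need_aging = true;
		return;
	}
	/* ... otherwise walk page tables and age each memcg ... */
}

/* Eviction side of the handshake (cf. lru_gen_shrink_lruvec()). */
static void shrink_lruvec_model(struct scan_control_model *sc,
				unsigned long reclaimed_before,
				bool need_aging)
{
	/* ... evict pages, updating sc->nr_reclaimed ... */

	/* Enough progress without aging: let the next aging pass skip. */
	if (sc->nr_reclaimed - reclaimed_before >= MIN_LRU_BATCH_MODEL &&
	    !need_aging)
		sc->memcgs_need_aging = false;
}

Note that the skip is bounded to a single pass: the aging path re-arms the flag immediately, so a stale hint cannot suppress aging indefinitely.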
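The backoff half, should_abort_scan(), ends the scan once a minimum batch has been reclaimed under global memory pressure and every managed zone up to the reclaim index sits above its watermark (the high watermark for kswapd, the low watermark for direct reclaim). A sketch of that per-zone loop under the same mock-type caveat; zone_model and its fields are invented for illustration:

#include <stdbool.h>

/* Mock stand-in for struct zone and its watermarks. */
struct zone_model {
	bool managed;
	unsigned long free_pages;
	unsigned long high_wmark;	/* kswapd's backoff threshold */
	unsigned long low_wmark;	/* direct reclaim's threshold */
};

/* cf. the per-zone watermark loop in should_abort_scan(). */
static bool zones_above_wmark(const struct zone_model *zones, int nr_zones,
			      bool is_kswapd)
{
	for (int i = 0; i < nr_zones; i++) {
		unsigned long wmark;

		if (!zones[i].managed)
			continue;

		wmark = is_kswapd ? zones[i].high_wmark : zones[i].low_wmark;
		/* one zone still below its watermark keeps the scan going */
		if (wmark > zones[i].free_pages)
			return false;
	}
	return true;	/* enough free pages everywhere: safe to back off */
}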