Diffstat (limited to 'target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch')
-rw-r--r--  target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch  320
1 file changed, 320 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch b/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch
new file mode 100644
index 0000000000..e47bfc36d4
--- /dev/null
+++ b/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch
@@ -0,0 +1,320 @@
+From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <yuzhao@google.com>
+Date: Sun, 18 Sep 2022 02:00:06 -0600
+Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When multiple memcgs are available, it is possible to use generations as a
+frame of reference to make better choices and improve overall performance
+under global memory pressure. This patch adds a basic optimization to
+select memcgs that can drop single-use unmapped clean pages first. Doing
+so reduces the chance of going into the aging path or swapping, which can
+be costly.
+
+A typical example that benefits from this optimization is a server running
+mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
+buffered I/O workload in the other.
+
+Though this optimization can be applied to both kswapd and direct reclaim,
+it is only added to kswapd to keep the patchset manageable. Later
+improvements may cover the direct reclaim path.
+
+While ensuring certain fairness to all eligible memcgs, proportional scans
+of individual memcgs also require proper backoff to avoid overshooting
+their aggregate reclaim target by too much. Otherwise it can cause high
+direct reclaim latency. The conditions for backoff are:
+
+1. At low priorities, for direct reclaim, if aging fairness or direct
+   reclaim latency is at risk, i.e., aging one memcg multiple times or
+   swapping after the target is met.
+2. At high priorities, for global reclaim, if per-zone free pages are
+   above respective watermarks.
+
+Server benchmark results:
+  Mixed workloads:
+    fio (buffered I/O): +[19, 21]%
+                IOPS         BW
+      patch1-8: 1880k        7343MiB/s
+      patch1-9: 2252k        8796MiB/s
+
+    memcached (anon): +[119, 123]%
+                Ops/sec      KB/sec
+      patch1-8: 862768.65    33514.68
+      patch1-9: 1911022.12   74234.54
+
+  Mixed workloads:
+    fio (buffered I/O): +[75, 77]%
+                IOPS         BW
+      5.19-rc1: 1279k        4996MiB/s
+      patch1-9: 2252k        8796MiB/s
+
+    memcached (anon): +[13, 15]%
+                Ops/sec      KB/sec
+      5.19-rc1: 1673524.04   65008.87
+      patch1-9: 1911022.12   74234.54
+
+  Configurations:
+    (changes since patch 6)
+
+    cat mixed.sh
+    modprobe brd rd_nr=2 rd_size=56623104
+
+    swapoff -a
+    mkswap /dev/ram0
+    swapon /dev/ram0
+
+    mkfs.ext4 /dev/ram1
+    mount -t ext4 /dev/ram1 /mnt
+
+    memtier_benchmark -S /var/run/memcached/memcached.sock \
+      -P memcache_binary -n allkeys --key-minimum=1 \
+      --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
+      --ratio 1:0 --pipeline 8 -d 2000
+
+    fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
+      --buffered=1 --ioengine=io_uring --iodepth=128 \
+      --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
+      --rw=randread --random_distribution=random --norandommap \
+      --time_based --ramp_time=10m --runtime=90m --group_reporting &
+    pid=$!
+
+    sleep 200
+
+    memtier_benchmark -S /var/run/memcached/memcached.sock \
+      -P memcache_binary -n allkeys --key-minimum=1 \
+      --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
+      --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
+
+    kill -INT $pid
+    wait
+
+Client benchmark results:
+ no change (CONFIG_MEMCG=n)
+
+Link: https://lkml.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com
+Signed-off-by: Yu Zhao <yuzhao@google.com>
+Acked-by: Brian Geffon <bgeffon@google.com>
+Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
+Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
+Acked-by: Steven Barrett <steven@liquorix.net>
+Acked-by: Suleiman Souhlal <suleiman@google.com>
+Tested-by: Daniel Byrne <djbyrne@mtu.edu>
+Tested-by: Donald Carr <d@chaos-reins.com>
+Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
+Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
+Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
+Tested-by: Sofia Trinh <sofia.trinh@edi.works>
+Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+Cc: Barry Song <baohua@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Hillf Danton <hdanton@sina.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Jonathan Corbet <corbet@lwn.net>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Michael Larabel <Michael@MichaelLarabel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Mike Rapoport <rppt@kernel.org>
+Cc: Mike Rapoport <rppt@linux.ibm.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Qi Zheng <zhengqi.arch@bytedance.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+ mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-----
+ 1 file changed, 96 insertions(+), 9 deletions(-)
+
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index a7844c689522..b6f6fc2585e1 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -127,6 +127,12 @@ struct scan_control {
+ /* Always discard instead of demoting to lower tier memory */
+ unsigned int no_demotion:1;
+
++#ifdef CONFIG_LRU_GEN
++ /* help kswapd make better choices among multiple memcgs */
++ unsigned int memcgs_need_aging:1;
++ unsigned long last_reclaimed;
++#endif
++
+ /* Allocation order */
+ s8 order;
+
+@@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+
+ VM_WARN_ON_ONCE(!current_is_kswapd());
+
++ sc->last_reclaimed = sc->nr_reclaimed;
++
++ /*
++ * To reduce the chance of going into the aging path, which can be
++ * costly, optimistically skip it if the flag below was cleared in the
++ * eviction path. This improves the overall performance when multiple
++ * memcgs are available.
++ */
++ if (!sc->memcgs_need_aging) {
++ sc->memcgs_need_aging = true;
++ return;
++ }
++
+ set_mm_walk(pgdat);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+@@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swa
+ return scanned;
+ }
+
+-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
++ bool *need_swapping)
+ {
+ int type;
+ int scanned;
+@@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
+
+ sc->nr_reclaimed += reclaimed;
+
++ if (need_swapping && type == LRU_GEN_ANON)
++ *need_swapping = true;
++
+ return scanned;
+ }
+
+@@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
+ * reclaim.
+ */
+ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+- bool can_swap)
++ bool can_swap, bool *need_aging)
+ {
+- bool need_aging;
+ unsigned long nr_to_scan;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+@@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+ return 0;
+
+- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+- if (!need_aging)
++ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
++ if (!*need_aging)
+ return nr_to_scan;
+
+ /* skip the aging path at the default priority */
+@@ -4715,10 +4737,68 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
+ return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+ }
+
++static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
++ struct scan_control *sc, bool need_swapping)
++{
++ int i;
++ DEFINE_MAX_SEQ(lruvec);
++
++ if (!current_is_kswapd()) {
++ /* age each memcg once to ensure fairness */
++ if (max_seq - seq > 1)
++ return true;
++
++ /* over-swapping can increase allocation latency */
++ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
++ return true;
++
++ /* give this thread a chance to exit and free its memory */
++ if (fatal_signal_pending(current)) {
++ sc->nr_reclaimed += MIN_LRU_BATCH;
++ return true;
++ }
++
++ if (cgroup_reclaim(sc))
++ return false;
++ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
++ return false;
++
++ /* keep scanning at low priorities to ensure fairness */
++ if (sc->priority > DEF_PRIORITY - 2)
++ return false;
++
++ /*
++ * A minimum amount of work was done under global memory pressure. For
++ * kswapd, it may be overshooting. For direct reclaim, the target isn't
++ * met, and yet the allocation may still succeed, since kswapd may have
++ * caught up. In either case, it's better to stop now, and restart if
++ * necessary.
++ */
++ for (i = 0; i <= sc->reclaim_idx; i++) {
++ unsigned long wmark;
++ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
++
++ if (!managed_zone(zone))
++ continue;
++
++ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
++ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
++ return false;
++ }
++
++ sc->nr_reclaimed += MIN_LRU_BATCH;
++
++ return true;
++}
++
+ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ {
+ struct blk_plug plug;
++ bool need_aging = false;
++ bool need_swapping = false;
+ unsigned long scanned = 0;
++ unsigned long reclaimed = sc->nr_reclaimed;
++ DEFINE_MAX_SEQ(lruvec);
+
+ lru_add_drain();
+
+@@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
+ else
+ swappiness = 0;
+
+- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
+ if (!nr_to_scan)
+- break;
++ goto done;
+
+- delta = evict_pages(lruvec, sc, swappiness);
++ delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
+ if (!delta)
+- break;
++ goto done;
+
+ scanned += delta;
+ if (scanned >= nr_to_scan)
+ break;
+
++ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
++ break;
++
+ cond_resched();
+ }
+
++ /* see the comment in lru_gen_age_node() */
++ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
++ sc->memcgs_need_aging = false;
++done:
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+--
+2.40.0
+
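
Taken together, the scan_control, lru_gen_age_node() and lru_gen_shrink_lruvec() hunks above implement a one-bit handshake: eviction clears memcgs_need_aging when it met its target without needing a new generation, and the next kswapd pass skips the costly aging walk once before re-arming the flag. The standalone model below condenses that handshake. Names mirror the patch, but this is an illustrative sketch, not kernel code: reclaim is stubbed out, MIN_LRU_BATCH is hard-coded (the kernel derives it from BITS_PER_LONG), and the initial flag value is chosen purely for the demo.

    /* Standalone model of the memcgs_need_aging handshake; illustrative
     * only, not kernel code. */
    #include <stdbool.h>
    #include <stdio.h>

    #define MIN_LRU_BATCH 64  /* stand-in; the kernel uses BITS_PER_LONG */

    struct scan_control {
        bool memcgs_need_aging;       /* cleared when aging can be skipped */
        unsigned long nr_reclaimed;
        unsigned long last_reclaimed; /* snapshot feeding the kswapd
                                         progress check in should_abort_scan() */
    };

    /* models lru_gen_age_node(): skip one aging walk if eviction cleared
     * the flag on the previous pass, then re-arm it */
    static void age_node(struct scan_control *sc)
    {
        sc->last_reclaimed = sc->nr_reclaimed;

        if (!sc->memcgs_need_aging) {
            sc->memcgs_need_aging = true;
            printf("aging: skipped this pass\n");
            return;
        }
        printf("aging: walking memcgs\n");
    }

    /* models the tail of lru_gen_shrink_lruvec(): clear the flag only when
     * eviction made real progress (>= MIN_LRU_BATCH) without
     * should_run_aging() asking for a new generation */
    static void shrink_lruvec(struct scan_control *sc, unsigned long delta,
                              bool need_aging)
    {
        unsigned long reclaimed = sc->nr_reclaimed;

        sc->nr_reclaimed += delta;
        if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
            sc->memcgs_need_aging = false;
    }

    int main(void)
    {
        /* start with aging armed, purely for the demo */
        struct scan_control sc = { .memcgs_need_aging = true };

        age_node(&sc);                  /* pass 1: aging runs */
        shrink_lruvec(&sc, 128, false); /* eviction met its target unaided */
        age_node(&sc);                  /* pass 2: aging skipped once */
        age_node(&sc);                  /* pass 3: flag re-armed, aging runs */
        return 0;
    }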
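Similarly, the two backoff conditions listed in the commit message map onto should_abort_scan() above. The compilable restatement below follows the same control flow; struct abort_state and its fields are inventions of this sketch (the kernel reads the equivalent state from scan_control and the per-zone watermarks), and the fatal-signal and cgroup-reclaim escape hatches are omitted.

    /* Restatement of should_abort_scan() from the diff above; the
     * abort_state struct is an invention of this sketch. */
    #include <stdbool.h>
    #include <stdio.h>

    #define DEF_PRIORITY 12 /* as in the kernel */

    struct abort_state {
        bool is_kswapd;
        int priority;                 /* counts down; lower = more aggressive */
        unsigned long seq, max_seq;   /* generation at loop entry vs. now */
        unsigned long reclaimed;      /* progress since the last aging pass */
        unsigned long target;         /* sc->nr_to_reclaim */
        bool need_swapping;           /* eviction dipped into anon pages */
        bool watermarks_ok;           /* all eligible zones above their marks */
    };

    static bool should_abort_scan(const struct abort_state *s)
    {
        if (!s->is_kswapd) {
            /* condition 1a: age each memcg at most once, for fairness */
            if (s->max_seq - s->seq > 1)
                return true;
            /* condition 1b: don't keep swapping once the target is met */
            if (s->reclaimed >= s->target && s->need_swapping)
                return true;
        } else if (s->reclaimed < s->target) {
            return false;             /* kswapd hasn't met its target yet */
        }

        /* keep scanning at low priorities (high numbers) for fairness */
        if (s->priority > DEF_PRIORITY - 2)
            return false;

        /* condition 2: back off once free pages sit above the watermarks
         * (high watermark for kswapd, low watermark for direct reclaim) */
        return s->watermarks_ok;
    }

    int main(void)
    {
        struct abort_state s = {
            .is_kswapd = true, .priority = DEF_PRIORITY - 2,
            .reclaimed = 256, .target = 128, .watermarks_ok = true,
        };
        printf("abort: %d\n", should_abort_scan(&s)); /* kswapd overshoot: 1 */
        return 0;
    }

One detail the sketch drops: before returning true, the kernel also credits sc->nr_reclaimed with MIN_LRU_BATCH, so callers treat the early exit as forward progress rather than retrying immediately.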