Diffstat (limited to 'target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch')
-rw-r--r--  target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch  320
1 file changed, 320 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch b/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch
new file mode 100644
index 0000000000..e47bfc36d4
--- /dev/null
+++ b/target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch
@@ -0,0 +1,320 @@
+From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <yuzhao@google.com>
+Date: Sun, 18 Sep 2022 02:00:06 -0600
+Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When multiple memcgs are available, it is possible to use generations as a
+frame of reference to make better choices and improve overall performance
+under global memory pressure. This patch adds a basic optimization to
+select memcgs that can drop single-use unmapped clean pages first. Doing
+so reduces the chance of going into the aging path or swapping, which can
+be costly.
+
+A typical example that benefits from this optimization is a server running
+mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
+buffered I/O workload in the other.
+
+Though this optimization can be applied to both kswapd and direct reclaim,
+it is only added to kswapd to keep the patchset manageable. Later
+improvements may cover the direct reclaim path.
+
+While ensuring certain fairness to all eligible memcgs, proportional scans
+of individual memcgs also require proper backoff to avoid overshooting
+their aggregate reclaim target by too much. Otherwise it can cause high
+direct reclaim latency. The conditions for backoff are:
+
+1. At low priorities, for direct reclaim, if aging fairness or direct
+   reclaim latency is at risk, i.e., aging one memcg multiple times or
+   swapping after the target is met.
+2. At high priorities, for global reclaim, if per-zone free pages are
+   above respective watermarks.
+
+Server benchmark results:
+  Mixed workloads:
+    fio (buffered I/O): +[19, 21]%
+                IOPS         BW
+      patch1-8: 1880k        7343MiB/s
+      patch1-9: 2252k        8796MiB/s
+
+    memcached (anon): +[119, 123]%
+                Ops/sec      KB/sec
+      patch1-8: 862768.65    33514.68
+      patch1-9: 1911022.12   74234.54
+
+  Mixed workloads:
+    fio (buffered I/O): +[75, 77]%
+                IOPS         BW
+      5.19-rc1: 1279k        4996MiB/s
+      patch1-9: 2252k        8796MiB/s
+
+    memcached (anon): +[13, 15]%
+                Ops/sec      KB/sec
+      5.19-rc1: 1673524.04   65008.87
+      patch1-9: 1911022.12   74234.54
+
+  Configurations:
+    (changes since patch 6)
+
+    cat mixed.sh
+    modprobe brd rd_nr=2 rd_size=56623104
+
+    swapoff -a
+    mkswap /dev/ram0
+    swapon /dev/ram0
+
+    mkfs.ext4 /dev/ram1
+    mount -t ext4 /dev/ram1 /mnt
+
+    memtier_benchmark -S /var/run/memcached/memcached.sock \
+      -P memcache_binary -n allkeys --key-minimum=1 \
+      --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
+      --ratio 1:0 --pipeline 8 -d 2000
+
+    fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
+      --buffered=1 --ioengine=io_uring --iodepth=128 \
+      --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
+      --rw=randread --random_distribution=random --norandommap \
+      --time_based --ramp_time=10m --runtime=90m --group_reporting &
+    pid=$!
+
+    sleep 200
+
+    memtier_benchmark -S /var/run/memcached/memcached.sock \
+      -P memcache_binary -n allkeys --key-minimum=1 \
+      --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
+      --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
+
+    kill -INT $pid
+    wait
+
+Client benchmark results:
+ no change (CONFIG_MEMCG=n)
+
+Link: https://lkml.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com
+Signed-off-by: Yu Zhao <yuzhao@google.com>
+Acked-by: Brian Geffon <bgeffon@google.com>
+Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
+Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
+Acked-by: Steven Barrett <steven@liquorix.net>
+Acked-by: Suleiman Souhlal <suleiman@google.com>
+Tested-by: Daniel Byrne <djbyrne@mtu.edu>
+Tested-by: Donald Carr <d@chaos-reins.com>
+Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
+Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
+Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
+Tested-by: Sofia Trinh <sofia.trinh@edi.works>
+Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+Cc: Barry Song <baohua@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Hillf Danton <hdanton@sina.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Jonathan Corbet <corbet@lwn.net>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Michael Larabel <Michael@MichaelLarabel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Mike Rapoport <rppt@kernel.org>
+Cc: Mike Rapoport <rppt@linux.ibm.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Qi Zheng <zhengqi.arch@bytedance.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Will Deacon <will@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+ mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-----
+ 1 file changed, 96 insertions(+), 9 deletions(-)
+
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index a7844c689522..b6f6fc2585e1 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -127,6 +127,12 @@ struct scan_control {
+ /* Always discard instead of demoting to lower tier memory */
+ unsigned int no_demotion:1;
+
++#ifdef CONFIG_LRU_GEN
++ /* help kswapd make better choices among multiple memcgs */
++ unsigned int memcgs_need_aging:1;
++ unsigned long last_reclaimed;
++#endif
++
+ /* Allocation order */
+ s8 order;
+
+@@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+
+ VM_WARN_ON_ONCE(!current_is_kswapd());
+
++ sc->last_reclaimed = sc->nr_reclaimed;
++
++ /*
++ * To reduce the chance of going into the aging path, which can be
++ * costly, optimistically skip it if the flag below was cleared in the
++ * eviction path. This improves the overall performance when multiple
++ * memcgs are available.
++ */
++ if (!sc->memcgs_need_aging) {
++ sc->memcgs_need_aging = true;
++ return;
++ }
++
+ set_mm_walk(pgdat);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+@@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swa
+ return scanned;
+ }
+
+-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
++ bool *need_swapping)
+ {
+ int type;
+ int scanned;
+@@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
+
+ sc->nr_reclaimed += reclaimed;
+
++ if (need_swapping && type == LRU_GEN_ANON)
++ *need_swapping = true;
++
+ return scanned;
+ }
+
+@@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
+ * reclaim.
+ */
+ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+- bool can_swap)
++ bool can_swap, bool *need_aging)
+ {
+- bool need_aging;
+ unsigned long nr_to_scan;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+@@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+ return 0;
+
+- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+- if (!need_aging)
++ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
++ if (!*need_aging)
+ return nr_to_scan;
+
+ /* skip the aging path at the default priority */
+@@ -4715,10 +4737,68 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
+ return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+ }
+
++static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
++ struct scan_control *sc, bool need_swapping)
++{
++ int i;
++ DEFINE_MAX_SEQ(lruvec);
++
++ if (!current_is_kswapd()) {
++ /* age each memcg once to ensure fairness */
++ if (max_seq - seq > 1)
++ return true;
++
++ /* over-swapping can increase allocation latency */
++ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
++ return true;
++
++ /* give this thread a chance to exit and free its memory */
++ if (fatal_signal_pending(current)) {
++ sc->nr_reclaimed += MIN_LRU_BATCH;
++ return true;
++ }
++
++ if (cgroup_reclaim(sc))
++ return false;
++ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
++ return false;
++
++ /* keep scanning at low priorities to ensure fairness */
++ if (sc->priority > DEF_PRIORITY - 2)
++ return false;
++
++ /*
++ * A minimum amount of work was done under global memory pressure. For
++ * kswapd, it may be overshooting. For direct reclaim, the target isn't
++ * met, and yet the allocation may still succeed, since kswapd may have
++ * caught up. In either case, it's better to stop now, and restart if
++ * necessary.
++ */
++ for (i = 0; i <= sc->reclaim_idx; i++) {
++ unsigned long wmark;
++ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
++
++ if (!managed_zone(zone))
++ continue;
++
++ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
++ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
++ return false;
++ }
++
++ sc->nr_reclaimed += MIN_LRU_BATCH;
++
++ return true;
++}
++
+ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ {
+ struct blk_plug plug;
++ bool need_aging = false;
++ bool need_swapping = false;
+ unsigned long scanned = 0;
++ unsigned long reclaimed = sc->nr_reclaimed;
++ DEFINE_MAX_SEQ(lruvec);
+
+ lru_add_drain();
+
+@@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
+ else
+ swappiness = 0;
+
+- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
+ if (!nr_to_scan)
+- break;
++ goto done;
+
+- delta = evict_pages(lruvec, sc, swappiness);
++ delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
+ if (!delta)
+- break;
++ goto done;
+
+ scanned += delta;
+ if (scanned >= nr_to_scan)
+ break;
+
++ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
++ break;
++
+ cond_resched();
+ }
+
++ /* see the comment in lru_gen_age_node() */
++ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
++ sc->memcgs_need_aging = false;
++done:
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+--
+2.40.0
+
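
Taken together, the scan_control, lru_gen_age_node() and lru_gen_shrink_lruvec() hunks above implement a one-bit handshake: eviction clears memcgs_need_aging when it met its target without needing a new generation, and the next kswapd pass skips the costly aging walk once before re-arming the flag. The standalone model below condenses that handshake. Names mirror the patch, but this is an illustrative sketch, not kernel code: reclaim is stubbed out, MIN_LRU_BATCH is hard-coded (the kernel derives it from BITS_PER_LONG), and the initial flag value is chosen purely for the demo.

    /* Standalone model of the memcgs_need_aging handshake; illustrative
     * only, not kernel code. */
    #include <stdbool.h>
    #include <stdio.h>

    #define MIN_LRU_BATCH 64  /* stand-in; the kernel uses BITS_PER_LONG */

    struct scan_control {
        bool memcgs_need_aging;       /* cleared when aging can be skipped */
        unsigned long nr_reclaimed;
        unsigned long last_reclaimed; /* snapshot feeding the kswapd
                                         progress check in should_abort_scan() */
    };

    /* models lru_gen_age_node(): skip one aging walk if eviction cleared
     * the flag on the previous pass, then re-arm it */
    static void age_node(struct scan_control *sc)
    {
        sc->last_reclaimed = sc->nr_reclaimed;

        if (!sc->memcgs_need_aging) {
            sc->memcgs_need_aging = true;
            printf("aging: skipped this pass\n");
            return;
        }
        printf("aging: walking memcgs\n");
    }

    /* models the tail of lru_gen_shrink_lruvec(): clear the flag only when
     * eviction made real progress (>= MIN_LRU_BATCH) without
     * should_run_aging() asking for a new generation */
    static void shrink_lruvec(struct scan_control *sc, unsigned long delta,
                              bool need_aging)
    {
        unsigned long reclaimed = sc->nr_reclaimed;

        sc->nr_reclaimed += delta;
        if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
            sc->memcgs_need_aging = false;
    }

    int main(void)
    {
        /* start with aging armed, purely for the demo */
        struct scan_control sc = { .memcgs_need_aging = true };

        age_node(&sc);                  /* pass 1: aging runs */
        shrink_lruvec(&sc, 128, false); /* eviction met its target unaided */
        age_node(&sc);                  /* pass 2: aging skipped once */
        age_node(&sc);                  /* pass 3: flag re-armed, aging runs */
        return 0;
    }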
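Similarly, the two backoff conditions listed in the commit message map onto should_abort_scan() above. The compilable restatement below follows the same control flow; struct abort_state and its fields are inventions of this sketch (the kernel reads the equivalent state from scan_control and the per-zone watermarks), and the fatal-signal and cgroup-reclaim escape hatches are omitted.

    /* Restatement of should_abort_scan() from the diff above; the
     * abort_state struct is an invention of this sketch. */
    #include <stdbool.h>
    #include <stdio.h>

    #define DEF_PRIORITY 12 /* as in the kernel */

    struct abort_state {
        bool is_kswapd;
        int priority;                 /* counts down; lower = more aggressive */
        unsigned long seq, max_seq;   /* generation at loop entry vs. now */
        unsigned long reclaimed;      /* progress since the last aging pass */
        unsigned long target;         /* sc->nr_to_reclaim */
        bool need_swapping;           /* eviction dipped into anon pages */
        bool watermarks_ok;           /* all eligible zones above their marks */
    };

    static bool should_abort_scan(const struct abort_state *s)
    {
        if (!s->is_kswapd) {
            /* condition 1a: age each memcg at most once, for fairness */
            if (s->max_seq - s->seq > 1)
                return true;
            /* condition 1b: don't keep swapping once the target is met */
            if (s->reclaimed >= s->target && s->need_swapping)
                return true;
        } else if (s->reclaimed < s->target) {
            return false;             /* kswapd hasn't met its target yet */
        }

        /* keep scanning at low priorities (high numbers) for fairness */
        if (s->priority > DEF_PRIORITY - 2)
            return false;

        /* condition 2: back off once free pages sit above the watermarks
         * (high watermark for kswapd, low watermark for direct reclaim) */
        return s->watermarks_ok;
    }

    int main(void)
    {
        struct abort_state s = {
            .is_kswapd = true, .priority = DEF_PRIORITY - 2,
            .reclaimed = 256, .target = 128, .watermarks_ok = true,
        };
        printf("abort: %d\n", should_abort_scan(&s)); /* kswapd overshoot: 1 */
        return 0;
    }

One detail the sketch drops: before returning true, the kernel also credits sc->nr_reclaimed with MIN_LRU_BATCH, so callers treat the early exit as forward progress rather than retrying immediately.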