1 files changed, 339 insertions, 0 deletions
diff --git a/target/linux/generic/pending-4.19/690-net-add-support-for-threaded-NAPI-polling.patch b/target/linux/generic/pending-4.19/690-net-add-support-for-threaded-NAPI-polling.patch
new file mode 100644
index 0000000000..b5c701c80b
--- /dev/null
+++ b/target/linux/generic/pending-4.19/690-net-add-support-for-threaded-NAPI-polling.patch
@@ -0,0 +1,339 @@
+From: Felix Fietkau <nbd@nbd.name>
+Date: Sun, 26 Jul 2020 14:03:21 +0200
+Subject: [PATCH] net: add support for threaded NAPI polling
+
+For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
+poll function does not perform well. Since NAPI poll is bound to the CPU it
+was scheduled from, we can easily end up with a few very busy CPUs spending
+most of their time in softirq/ksoftirqd and some idle ones.
+
+Introduce threaded NAPI for such drivers based on a workqueue. The API is the
+same except for using netif_threaded_napi_add instead of netif_napi_add.
+
+In my tests with mt76 on MT7621, using threaded NAPI + a thread for tx scheduling
+improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
+NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
+thread.
+
+With threaded NAPI it seems stable and consistent (and higher than the best
+results I got without it).
+
+Based on a patch by Hillf Danton
+
+Cc: Hillf Danton <hdanton@sina.com>
+Signed-off-by: Felix Fietkau <nbd@nbd.name>
+---
+
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -339,6 +339,7 @@ struct napi_struct {
+ 	struct list_head	dev_list;
+ 	struct hlist_node	napi_hash_node;
+ 	unsigned int		napi_id;
++	struct work_struct	work;
+ };
+ 
+ enum {
+@@ -349,6 +350,7 @@ enum {
+ 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
+ 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
+ 	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
++	NAPI_STATE_THREADED,	/* Use threaded NAPI */
+ };
+ 
+ enum {
+@@ -359,6 +361,7 @@ enum {
+ 	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
+ 	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
+ 	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
++	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
+ };
+ 
+ enum gro_result {
+@@ -2230,6 +2233,26 @@ void netif_napi_add(struct net_device *d
+ 		    int (*poll)(struct napi_struct *, int), int weight);
+ 
+ /**
++ *	netif_threaded_napi_add - initialize a threaded NAPI context
++ *	@dev:  network device
++ *	@napi: NAPI context
++ *	@poll: polling function
++ *	@weight: default weight
++ *
++ * This variant of netif_napi_add() should be used by drivers whose NAPI
++ * poll functions are CPU intensive. It sets NAPI_STATE_THREADED on @napi,
++ * so polling is scheduled from a high priority workqueue.
++ */
++static inline void netif_threaded_napi_add(struct net_device *dev,
++					   struct napi_struct *napi,
++					   int (*poll)(struct napi_struct *, int),
++					   int weight)
++{
++	set_bit(NAPI_STATE_THREADED, &napi->state);
++	netif_napi_add(dev, napi, poll, weight);
++}
++
++/**
+  *	netif_tx_napi_add - initialize a NAPI context
+  *	@dev:  network device
+  *	@napi: NAPI context
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -160,6 +160,7 @@ static DEFINE_SPINLOCK(offload_lock);
+ struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+ struct list_head ptype_all __read_mostly;	/* Taps */
+ static struct list_head offload_base __read_mostly;
++static struct workqueue_struct *napi_workq __read_mostly;
+ 
+ static int netif_rx_internal(struct sk_buff *skb);
+ static int call_netdevice_notifiers_info(unsigned long val,
+@@ -5891,6 +5892,11 @@ void __napi_schedule(struct napi_struct
+ {
+ 	unsigned long flags;
+ 
++	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
++		queue_work(napi_workq, &n->work);
++		return;
++	}
++
+ 	local_irq_save(flags);
+ 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ 	local_irq_restore(flags);
+@@ -5938,6 +5944,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
+  */
+ void __napi_schedule_irqoff(struct napi_struct *n)
+ {
++	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
++		queue_work(napi_workq, &n->work);	/* runs napi_workfn() */
++		return;
++	}
++
+ 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ }
+ EXPORT_SYMBOL(__napi_schedule_irqoff);
+@@ -6186,6 +6197,82 @@ static void init_gro_hash(struct napi_st
+ 	napi->gro_bitmask = 0;
+ }
+ 
++static int __napi_poll(struct napi_struct *n, bool *repoll)
++{
++	int work, weight;
++
++	weight = n->weight;
++
++	/* This NAPI_STATE_SCHED test is for avoiding a race
++	 * with netpoll's poll_napi().  Only the entity which
++	 * obtains the lock and sees NAPI_STATE_SCHED set will
++	 * actually make the ->poll() call.  Therefore we avoid
++	 * accidentally calling ->poll() when NAPI is not scheduled.
++	 */
++	work = 0;
++	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
++		work = n->poll(n, weight);
++		trace_napi_poll(n, work, weight);
++	}
++
++	WARN_ON_ONCE(work > weight);
++
++	if (likely(work < weight))
++		return work;	/* under budget: *repoll untouched */
++
++	/* Drivers must not modify the NAPI state if they
++	 * consume the entire weight.  In such cases this code
++	 * still "owns" the NAPI instance and therefore can
++	 * move the instance around on the list at-will.
++	 */
++	if (unlikely(napi_disable_pending(n))) {
++		napi_complete(n);
++		return work;	/* disable pending: *repoll untouched */
++	}
++
++	if (n->gro_bitmask) {
++		/* flush too old packets
++		 * If HZ < 1000, flush all packets.
++		 */
++		napi_gro_flush(n, HZ >= 1000);
++	}
++
++	*repoll = true;	/* full weight consumed: tell the caller */
++
++	return work;
++}
++
++static void napi_workfn(struct work_struct *work)
++{
++	struct napi_struct *n = container_of(work, struct napi_struct, work);
++	void *have;
++
++	for (;;) {
++		bool repoll = false;
++
++		local_bh_disable();
++
++		have = netpoll_poll_lock(n);
++		__napi_poll(n, &repoll);
++		netpoll_poll_unlock(have);
++
++		local_bh_enable();
++
++		if (!repoll)
++			return;
++
++		if (!need_resched())
++			continue;
++
++		/*
++		 * need_resched() is set: requeue the work and return instead
++		 * of looping, at the cost of task switch latency.
++		 */
++		queue_work(napi_workq, work);
++		return;
++	}
++}
++
+ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+ 		    int (*poll)(struct napi_struct *, int), int weight)
+ {
+@@ -6204,6 +6291,7 @@ void netif_napi_add(struct net_device *d
+ #ifdef CONFIG_NETPOLL
+ 	napi->poll_owner = -1;
+ #endif
++	INIT_WORK(&napi->work, napi_workfn);
+ 	set_bit(NAPI_STATE_SCHED, &napi->state);
+ 	napi_hash_add(napi);
+ }
+@@ -6242,6 +6330,7 @@ static void flush_gro_hash(struct napi_s
+ void netif_napi_del(struct napi_struct *napi)
+ {
+ 	might_sleep();
++	cancel_work_sync(&napi->work);
+ 	if (napi_hash_del(napi))
+ 		synchronize_net();
+ 	list_del_init(&napi->dev_list);
+@@ -6254,48 +6343,18 @@ EXPORT_SYMBOL(netif_napi_del);
+ 
+ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+ {
++	bool do_repoll = false;
+ 	void *have;
+-	int work, weight;
++	int work;
+ 
+ 	list_del_init(&n->poll_list);
+ 
+ 	have = netpoll_poll_lock(n);
+ 
+-	weight = n->weight;
+-
+-	/* This NAPI_STATE_SCHED test is for avoiding a race
+-	 * with netpoll's poll_napi().  Only the entity which
+-	 * obtains the lock and sees NAPI_STATE_SCHED set will
+-	 * actually make the ->poll() call.  Therefore we avoid
+-	 * accidentally calling ->poll() when NAPI is not scheduled.
+-	 */
+-	work = 0;
+-	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+-		work = n->poll(n, weight);
+-		trace_napi_poll(n, work, weight);
+-	}
+-
+-	WARN_ON_ONCE(work > weight);
+-
+-	if (likely(work < weight))
+-		goto out_unlock;
++	work = __napi_poll(n, &do_repoll);
+ 
+-	/* Drivers must not modify the NAPI state if they
+-	 * consume the entire weight.  In such cases this code
+-	 * still "owns" the NAPI instance and therefore can
+-	 * move the instance around on the list at-will.
+-	 */
+-	if (unlikely(napi_disable_pending(n))) {
+-		napi_complete(n);
++	if (!do_repoll)
+ 		goto out_unlock;
+-	}
+-
+-	if (n->gro_bitmask) {
+-		/* flush too old packets
+-		 * If HZ < 1000, flush all packets.
+-		 */
+-		napi_gro_flush(n, HZ >= 1000);
+-	}
+ 
+ 	/* Some drivers may have called napi_schedule
+ 	 * prior to exhausting their budget.
+@@ -9895,6 +9954,10 @@ static int __init net_dev_init(void)
+ 		sd->backlog.weight = weight_p;
+ 	}
+ 
++	/* threaded NAPI workqueue; WQ_SYSFS exposes it in sysfs */
++	napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI |
++				     WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
++	BUG_ON(!napi_workq);
+ 	dev_boot_phase = 0;
+ 
+ 	/* The loopback device is special if any other network devices
+--- a/net/core/net-sysfs.c
++++ b/net/core/net-sysfs.c
+@@ -447,6 +447,52 @@ static ssize_t proto_down_store(struct d
+ }
+ NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+ 
++static int change_napi_threaded(struct net_device *dev, unsigned long val)
++{
++	struct napi_struct *napi;
++
++	if (list_empty(&dev->napi_list))
++		return -EOPNOTSUPP;	/* nothing on dev->napi_list */
++
++	list_for_each_entry(napi, &dev->napi_list, dev_list) {
++		if (val)	/* any non-zero value enables threaded mode */
++			set_bit(NAPI_STATE_THREADED, &napi->state);
++		else
++			clear_bit(NAPI_STATE_THREADED, &napi->state);
++	}
++
++	return 0;
++}
++
++static ssize_t napi_threaded_store(struct device *dev,
++				struct device_attribute *attr,
++				const char *buf, size_t len)
++{
++	return netdev_store(dev, attr, buf, len, change_napi_threaded);
++}
++
++static ssize_t napi_threaded_show(struct device *dev,
++				  struct device_attribute *attr,
++				  char *buf)
++{
++	struct net_device *netdev = to_net_dev(dev);
++	struct napi_struct *napi;
++	bool enabled = false;
++
++	if (!rtnl_trylock())
++		return restart_syscall();	/* RTNL is contended */
++
++	list_for_each_entry(napi, &netdev->napi_list, dev_list) {
++		if (test_bit(NAPI_STATE_THREADED, &napi->state))
++			enabled = true;	/* 1 if any NAPI is threaded */
++	}
++
++	rtnl_unlock();
++
++	return sprintf(buf, fmt_dec, enabled);
++}
++static DEVICE_ATTR_RW(napi_threaded);
++
+ static ssize_t phys_port_id_show(struct device *dev,
+ 				 struct device_attribute *attr, char *buf)
+ {
+@@ -542,6 +588,7 @@ static struct attribute *net_class_attrs
+ 	&dev_attr_flags.attr,
+ 	&dev_attr_tx_queue_len.attr,
+ 	&dev_attr_gro_flush_timeout.attr,
++	&dev_attr_napi_threaded.attr,
+ 	&dev_attr_phys_port_id.attr,
+ 	&dev_attr_phys_port_name.attr,
+ 	&dev_attr_phys_switch_id.attr,