1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
|
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 26 Jul 2020 14:03:21 +0200
Subject: [PATCH] net: add support for threaded NAPI polling
For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
poll function does not perform well. Since NAPI poll is bound to the CPU it
was scheduled from, we can easily end up with a few very busy CPUs spending
most of their time in softirq/ksoftirqd and some idle ones.
Introduce threaded NAPI for such drivers based on a workqueue. The API is the
same except for using netif_threaded_napi_add instead of netif_napi_add.
In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
thread.
With threaded NAPI it seems stable and consistent (and higher than the best
results I got without it).
Based on a patch by Hillf Danton
Cc: Hillf Danton <hdanton@sina.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -340,6 +340,7 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
unsigned int napi_id;
+ struct work_struct work;
};
enum {
@@ -350,6 +351,7 @@ enum {
NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_THREADED, /* Use threaded NAPI */
};
enum {
@@ -360,6 +362,7 @@ enum {
NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
};
enum gro_result {
@@ -2101,6 +2104,7 @@ struct net_device {
struct lock_class_key addr_list_lock_key;
bool proto_down;
unsigned wol_enabled:1;
+ unsigned threaded:1;
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
@@ -2281,6 +2285,26 @@ void netif_napi_add(struct net_device *d
int (*poll)(struct napi_struct *, int), int weight);
/**
+ * netif_threaded_napi_add - initialize a NAPI context
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @weight: default weight
+ *
+ * This variant of netif_napi_add() should be used from drivers using NAPI
+ * with CPU intensive poll functions.
+ * This will schedule polling from a high priority workqueue
+ */
+static inline void netif_threaded_napi_add(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
+{
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ netif_napi_add(dev, napi, poll, weight);
+}
+
+/**
* netif_tx_napi_add - initialize a NAPI context
* @dev: network device
* @napi: NAPI context
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -156,6 +156,7 @@ static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
static struct list_head offload_base __read_mostly;
+static struct workqueue_struct *napi_workq __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
@@ -5912,6 +5913,11 @@ void __napi_schedule(struct napi_struct
{
unsigned long flags;
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+ queue_work(napi_workq, &n->work);
+ return;
+ }
+
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
@@ -5959,6 +5965,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
*/
void __napi_schedule_irqoff(struct napi_struct *n)
{
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+ queue_work(napi_workq, &n->work);
+ return;
+ }
+
____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
@@ -6220,9 +6231,89 @@ static void init_gro_hash(struct napi_st
napi->gro_bitmask = 0;
}
+static int __napi_poll(struct napi_struct *n, bool *repoll)
+{
+ int work, weight;
+
+ weight = n->weight;
+
+ /* This NAPI_STATE_SCHED test is for avoiding a race
+ * with netpoll's poll_napi(). Only the entity which
+ * obtains the lock and sees NAPI_STATE_SCHED set will
+ * actually make the ->poll() call. Therefore we avoid
+ * accidentally calling ->poll() when NAPI is not scheduled.
+ */
+ work = 0;
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ work = n->poll(n, weight);
+ trace_napi_poll(n, work, weight);
+ }
+
+ WARN_ON_ONCE(work > weight);
+
+ if (likely(work < weight))
+ return work;
+
+ /* Drivers must not modify the NAPI state if they
+ * consume the entire weight. In such cases this code
+ * still "owns" the NAPI instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (unlikely(napi_disable_pending(n))) {
+ napi_complete(n);
+ return work;
+ }
+
+ if (n->gro_bitmask) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ napi_gro_flush(n, HZ >= 1000);
+ }
+
+ gro_normal_list(n);
+
+ *repoll = true;
+
+ return work;
+}
+
+static void napi_workfn(struct work_struct *work)
+{
+ struct napi_struct *n = container_of(work, struct napi_struct, work);
+ void *have;
+
+ for (;;) {
+ bool repoll = false;
+
+ local_bh_disable();
+
+ have = netpoll_poll_lock(n);
+ __napi_poll(n, &repoll);
+ netpoll_poll_unlock(have);
+
+ local_bh_enable();
+
+ if (!repoll)
+ return;
+
+ if (!need_resched())
+ continue;
+
+ /*
+ * have to pay for the latency of task switch even if
+ * napi is scheduled
+ */
+ queue_work(napi_workq, work);
+ return;
+ }
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
+ if (dev->threaded)
+ set_bit(NAPI_STATE_THREADED, &napi->state);
INIT_LIST_HEAD(&napi->poll_list);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
@@ -6239,6 +6330,7 @@ void netif_napi_add(struct net_device *d
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
+ INIT_WORK(&napi->work, napi_workfn);
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
@@ -6279,6 +6371,7 @@ static void flush_gro_hash(struct napi_s
void netif_napi_del(struct napi_struct *napi)
{
might_sleep();
+ cancel_work_sync(&napi->work);
if (napi_hash_del(napi))
synchronize_net();
list_del_init(&napi->dev_list);
@@ -6291,50 +6384,18 @@ EXPORT_SYMBOL(netif_napi_del);
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
+ bool do_repoll = false;
void *have;
- int work, weight;
+ int work;
list_del_init(&n->poll_list);
have = netpoll_poll_lock(n);
- weight = n->weight;
+ work = __napi_poll(n, &do_repoll);
- /* This NAPI_STATE_SCHED test is for avoiding a race
- * with netpoll's poll_napi(). Only the entity which
- * obtains the lock and sees NAPI_STATE_SCHED set will
- * actually make the ->poll() call. Therefore we avoid
- * accidentally calling ->poll() when NAPI is not scheduled.
- */
- work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
- work = n->poll(n, weight);
- trace_napi_poll(n, work, weight);
- }
-
- WARN_ON_ONCE(work > weight);
-
- if (likely(work < weight))
- goto out_unlock;
-
- /* Drivers must not modify the NAPI state if they
- * consume the entire weight. In such cases this code
- * still "owns" the NAPI instance and therefore can
- * move the instance around on the list at-will.
- */
- if (unlikely(napi_disable_pending(n))) {
- napi_complete(n);
+ if (!do_repoll)
goto out_unlock;
- }
-
- if (n->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(n, HZ >= 1000);
- }
-
- gro_normal_list(n);
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
@@ -10314,6 +10375,10 @@ static int __init net_dev_init(void)
sd->backlog.weight = weight_p;
}
+ napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
+ WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
+ BUG_ON(!napi_workq);
+
dev_boot_phase = 0;
/* The loopback device is special if any other network devices
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -442,6 +442,52 @@ static ssize_t proto_down_store(struct d
}
NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+static int change_napi_threaded(struct net_device *dev, unsigned long val)
+{
+ struct napi_struct *napi;
+
+ if (list_empty(&dev->napi_list))
+ return -EOPNOTSUPP;
+
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (val)
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ else
+ clear_bit(NAPI_STATE_THREADED, &napi->state);
+ }
+
+ return 0;
+}
+
+static ssize_t napi_threaded_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_napi_threaded);
+}
+
+static ssize_t napi_threaded_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct napi_struct *napi;
+ bool enabled = false;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+ if (test_bit(NAPI_STATE_THREADED, &napi->state))
+ enabled = true;
+ }
+
+ rtnl_unlock();
+
+ return sprintf(buf, fmt_dec, enabled);
+}
+static DEVICE_ATTR_RW(napi_threaded);
+
static ssize_t phys_port_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -532,6 +578,7 @@ static struct attribute *net_class_attrs
&dev_attr_flags.attr,
&dev_attr_tx_queue_len.attr,
&dev_attr_gro_flush_timeout.attr,
+ &dev_attr_napi_threaded.attr,
&dev_attr_phys_port_id.attr,
&dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,
|