From 7d4143234c4dfdd050ebc64ec8231f9d81ea65af Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 24 Feb 2021 13:15:08 +0100 Subject: kernel: 5.10: wireguard: backport 5.12-rc1 changes in net.git These will eventually make their way to 5.10, but it could be a while. https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/commit/?id=ee576c47db60432c37e54b1e2b43a8ca6d3a8dca https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/commit/?id=5a0598695634a6bb4126818902dd9140cd9df8b6 https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/commit/?id=99fff5264e7ab06f45b0ad60243475be0a8d0559 https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/commit/?id=8b5553ace83cced775eefd0f3f18b5c6214ccf7a https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/commit/?id=bce2473927af8de12ad131a743f55d69d358c0b9 Signed-off-by: Jason A. Donenfeld [Rename 082-wireguard-kconfig... to 083-wireguard-kconfig...] Signed-off-by: Ilya Lipnitskiy --- ...ss-zeroed-opts-from-icmp-v6-_ndo_send-bef.patch | 337 +++++++++++++ ...eer-put-frequently-used-members-above-cac.patch | 42 ++ ...evice-do-not-generate-ICMP-for-non-IP-pac.patch | 47 ++ ...queueing-get-rid-of-per-peer-ring-buffers.patch | 560 +++++++++++++++++++++ ...-kconfig-use-arm-chacha-even-with-no-neon.patch | 30 ++ 5 files changed, 1016 insertions(+) create mode 100644 target/linux/generic/backport-5.10/070-net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-bef.patch create mode 100644 target/linux/generic/backport-5.10/080-wireguard-peer-put-frequently-used-members-above-cac.patch create mode 100644 target/linux/generic/backport-5.10/081-wireguard-device-do-not-generate-ICMP-for-non-IP-pac.patch create mode 100644 target/linux/generic/backport-5.10/082-wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch create mode 100644 target/linux/generic/backport-5.10/083-wireguard-kconfig-use-arm-chacha-even-with-no-neon.patch (limited to 'target/linux/generic') diff --git a/target/linux/generic/backport-5.10/070-net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-bef.patch b/target/linux/generic/backport-5.10/070-net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-bef.patch new file mode 100644 index 0000000000..d2c084424f --- /dev/null +++ b/target/linux/generic/backport-5.10/070-net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-bef.patch @@ -0,0 +1,337 @@ +From 4a25324891a32d080589a6e3a4dec2be2d9e3d60 Mon Sep 17 00:00:00 2001 +From: "Jason A. Donenfeld" +Date: Tue, 23 Feb 2021 14:18:58 +0100 +Subject: [PATCH] net: icmp: pass zeroed opts from icmp{,v6}_ndo_send before + sending + +commit ee576c47db60432c37e54b1e2b43a8ca6d3a8dca upstream. + +The icmp{,v6}_send functions make all sorts of use of skb->cb, casting +it with IPCB or IP6CB, assuming the skb to have come directly from the +inet layer. But when the packet comes from the ndo layer, especially +when forwarded, there's no telling what might be in skb->cb at that +point. As a result, the icmp sending code risks reading bogus memory +contents, which can result in nasty stack overflows such as this one +reported by a user: + + panic+0x108/0x2ea + __stack_chk_fail+0x14/0x20 + __icmp_send+0x5bd/0x5c0 + icmp_ndo_send+0x148/0x160 + +In icmp_send, skb->cb is cast with IPCB and an ip_options struct is read +from it. The optlen parameter there is of particular note, as it can +induce writes beyond bounds. There are quite a few ways that can happen +in __ip_options_echo. For example: + + // sptr/skb are attacker-controlled skb bytes + sptr = skb_network_header(skb); + // dptr/dopt points to stack memory allocated by __icmp_send + dptr = dopt->__data; + // sopt is the corrupt skb->cb in question + if (sopt->rr) { + optlen = sptr[sopt->rr+1]; // corrupt skb->cb + skb->data + soffset = sptr[sopt->rr+2]; // corrupt skb->cb + skb->data + // this now writes potentially attacker-controlled data, over + // flowing the stack: + memcpy(dptr, sptr+sopt->rr, optlen); + } + +In the icmpv6_send case, the story is similar, but not as dire, as only +IP6CB(skb)->iif and IP6CB(skb)->dsthao are used. The dsthao case is +worse than the iif case, but it is passed to ipv6_find_tlv, which does +a bit of bounds checking on the value. + +This is easy to simulate by doing a `memset(skb->cb, 0x41, +sizeof(skb->cb));` before calling icmp{,v6}_ndo_send, and it's only by +good fortune and the rarity of icmp sending from that context that we've +avoided reports like this until now. For example, in KASAN: + + BUG: KASAN: stack-out-of-bounds in __ip_options_echo+0xa0e/0x12b0 + Write of size 38 at addr ffff888006f1f80e by task ping/89 + CPU: 2 PID: 89 Comm: ping Not tainted 5.10.0-rc7-debug+ #5 + Call Trace: + dump_stack+0x9a/0xcc + print_address_description.constprop.0+0x1a/0x160 + __kasan_report.cold+0x20/0x38 + kasan_report+0x32/0x40 + check_memory_region+0x145/0x1a0 + memcpy+0x39/0x60 + __ip_options_echo+0xa0e/0x12b0 + __icmp_send+0x744/0x1700 + +Actually, out of the 4 drivers that do this, only gtp zeroed the cb for +the v4 case, while the rest did not. So this commit actually removes the +gtp-specific zeroing, while putting the code where it belongs in the +shared infrastructure of icmp{,v6}_ndo_send. + +This commit fixes the issue by passing an empty IPCB or IP6CB along to +the functions that actually do the work. For the icmp_send, this was +already trivial, thanks to __icmp_send providing the plumbing function. +For icmpv6_send, this required a tiny bit of refactoring to make it +behave like the v4 case, after which it was straight forward. + +Fixes: a2b78e9b2cac ("sunvnet: generate ICMP PTMUD messages for smaller port MTUs") +Reported-by: SinYu +Reviewed-by: Willem de Bruijn +Link: https://lore.kernel.org/netdev/CAF=yD-LOF116aHub6RMe8vB8ZpnrrnoTdqhobEx+bvoA8AsP0w@mail.gmail.com/T/ +Signed-off-by: Jason A. Donenfeld +Link: https://lore.kernel.org/r/20210223131858.72082-1-Jason@zx2c4.com +Signed-off-by: Jakub Kicinski +[Jason: backported to 5.10] +Signed-off-by: Jason A. Donenfeld +--- + include/linux/icmpv6.h | 17 ++++++++++++++--- + include/linux/ipv6.h | 1 - + include/net/icmp.h | 6 +++++- + net/ipv4/icmp.c | 5 +++-- + net/ipv6/icmp.c | 16 ++++++++-------- + net/ipv6/ip6_icmp.c | 12 +++++++----- + 6 files changed, 37 insertions(+), 20 deletions(-) + +--- a/drivers/net/gtp.c ++++ b/drivers/net/gtp.c +@@ -539,7 +539,6 @@ static int gtp_build_skb_ip4(struct sk_b + if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) && + mtu < ntohs(iph->tot_len)) { + netdev_dbg(dev, "packet too big, fragmentation needed\n"); +- memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + goto err_rt; +--- a/include/linux/icmpv6.h ++++ b/include/linux/icmpv6.h +@@ -3,6 +3,7 @@ + #define _LINUX_ICMPV6_H + + #include ++#include + #include + + static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) +@@ -15,13 +16,16 @@ static inline struct icmp6hdr *icmp6_hdr + #if IS_ENABLED(CONFIG_IPV6) + + typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info, +- const struct in6_addr *force_saddr); ++ const struct in6_addr *force_saddr, ++ const struct inet6_skb_parm *parm); + #if IS_BUILTIN(CONFIG_IPV6) + void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, +- const struct in6_addr *force_saddr); +-static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) ++ const struct in6_addr *force_saddr, ++ const struct inet6_skb_parm *parm); ++static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, ++ const struct inet6_skb_parm *parm) + { +- icmp6_send(skb, type, code, info, NULL); ++ icmp6_send(skb, type, code, info, NULL, parm); + } + static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn) + { +@@ -34,18 +38,28 @@ static inline int inet6_unregister_icmp_ + return 0; + } + #else +-extern void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info); ++extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, ++ const struct inet6_skb_parm *parm); + extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn); + extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn); + #endif + ++static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) ++{ ++ __icmpv6_send(skb, type, code, info, IP6CB(skb)); ++} ++ + int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, + unsigned int data_len); + + #if IS_ENABLED(CONFIG_NF_NAT) + void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); + #else +-#define icmpv6_ndo_send icmpv6_send ++static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) ++{ ++ struct inet6_skb_parm parm = { 0 }; ++ __icmpv6_send(skb_in, type, code, info, &parm); ++} + #endif + + #else +--- a/include/linux/ipv6.h ++++ b/include/linux/ipv6.h +@@ -84,7 +84,6 @@ struct ipv6_params { + __s32 autoconf; + }; + extern struct ipv6_params ipv6_defaults; +-#include + #include + #include + +--- a/include/net/icmp.h ++++ b/include/net/icmp.h +@@ -46,7 +46,11 @@ static inline void icmp_send(struct sk_b + #if IS_ENABLED(CONFIG_NF_NAT) + void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); + #else +-#define icmp_ndo_send icmp_send ++static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) ++{ ++ struct ip_options opts = { 0 }; ++ __icmp_send(skb_in, type, code, info, &opts); ++} + #endif + + int icmp_rcv(struct sk_buff *skb); +--- a/net/ipv4/icmp.c ++++ b/net/ipv4/icmp.c +@@ -775,13 +775,14 @@ EXPORT_SYMBOL(__icmp_send); + void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) + { + struct sk_buff *cloned_skb = NULL; ++ struct ip_options opts = { 0 }; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + __be32 orig_ip; + + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(ct->status & IPS_SRC_NAT)) { +- icmp_send(skb_in, type, code, info); ++ __icmp_send(skb_in, type, code, info, &opts); + return; + } + +@@ -796,7 +797,7 @@ void icmp_ndo_send(struct sk_buff *skb_i + + orig_ip = ip_hdr(skb_in)->saddr; + ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; +- icmp_send(skb_in, type, code, info); ++ __icmp_send(skb_in, type, code, info, &opts); + ip_hdr(skb_in)->saddr = orig_ip; + out: + consume_skb(cloned_skb); +--- a/net/ipv6/icmp.c ++++ b/net/ipv6/icmp.c +@@ -331,10 +331,9 @@ static int icmpv6_getfrag(void *from, ch + } + + #if IS_ENABLED(CONFIG_IPV6_MIP6) +-static void mip6_addr_swap(struct sk_buff *skb) ++static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) + { + struct ipv6hdr *iph = ipv6_hdr(skb); +- struct inet6_skb_parm *opt = IP6CB(skb); + struct ipv6_destopt_hao *hao; + struct in6_addr tmp; + int off; +@@ -351,7 +350,7 @@ static void mip6_addr_swap(struct sk_buf + } + } + #else +-static inline void mip6_addr_swap(struct sk_buff *skb) {} ++static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {} + #endif + + static struct dst_entry *icmpv6_route_lookup(struct net *net, +@@ -446,7 +445,8 @@ static int icmp6_iif(const struct sk_buf + * Send an ICMP message in response to a packet in error + */ + void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, +- const struct in6_addr *force_saddr) ++ const struct in6_addr *force_saddr, ++ const struct inet6_skb_parm *parm) + { + struct inet6_dev *idev = NULL; + struct ipv6hdr *hdr = ipv6_hdr(skb); +@@ -542,7 +542,7 @@ void icmp6_send(struct sk_buff *skb, u8 + if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) + goto out_bh_enable; + +- mip6_addr_swap(skb); ++ mip6_addr_swap(skb, parm); + + sk = icmpv6_xmit_lock(net); + if (!sk) +@@ -559,7 +559,7 @@ void icmp6_send(struct sk_buff *skb, u8 + /* select a more meaningful saddr from input if */ + struct net_device *in_netdev; + +- in_netdev = dev_get_by_index(net, IP6CB(skb)->iif); ++ in_netdev = dev_get_by_index(net, parm->iif); + if (in_netdev) { + ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, + inet6_sk(sk)->srcprefs, +@@ -640,7 +640,7 @@ EXPORT_SYMBOL(icmp6_send); + */ + void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) + { +- icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL); ++ icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb)); + kfree_skb(skb); + } + +@@ -697,10 +697,10 @@ int ip6_err_gen_icmpv6_unreach(struct sk + } + if (type == ICMP_TIME_EXCEEDED) + icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, +- info, &temp_saddr); ++ info, &temp_saddr, IP6CB(skb2)); + else + icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, +- info, &temp_saddr); ++ info, &temp_saddr, IP6CB(skb2)); + if (rt) + ip6_rt_put(rt); + +--- a/net/ipv6/ip6_icmp.c ++++ b/net/ipv6/ip6_icmp.c +@@ -33,23 +33,25 @@ int inet6_unregister_icmp_sender(ip6_icm + } + EXPORT_SYMBOL(inet6_unregister_icmp_sender); + +-void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) ++void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, ++ const struct inet6_skb_parm *parm) + { + ip6_icmp_send_t *send; + + rcu_read_lock(); + send = rcu_dereference(ip6_icmp_send); + if (send) +- send(skb, type, code, info, NULL); ++ send(skb, type, code, info, NULL, parm); + rcu_read_unlock(); + } +-EXPORT_SYMBOL(icmpv6_send); ++EXPORT_SYMBOL(__icmpv6_send); + #endif + + #if IS_ENABLED(CONFIG_NF_NAT) + #include + void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) + { ++ struct inet6_skb_parm parm = { 0 }; + struct sk_buff *cloned_skb = NULL; + enum ip_conntrack_info ctinfo; + struct in6_addr orig_ip; +@@ -57,7 +59,7 @@ void icmpv6_ndo_send(struct sk_buff *skb + + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(ct->status & IPS_SRC_NAT)) { +- icmpv6_send(skb_in, type, code, info); ++ __icmpv6_send(skb_in, type, code, info, &parm); + return; + } + +@@ -72,7 +74,7 @@ void icmpv6_ndo_send(struct sk_buff *skb + + orig_ip = ipv6_hdr(skb_in)->saddr; + ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; +- icmpv6_send(skb_in, type, code, info); ++ __icmpv6_send(skb_in, type, code, info, &parm); + ipv6_hdr(skb_in)->saddr = orig_ip; + out: + consume_skb(cloned_skb); diff --git a/target/linux/generic/backport-5.10/080-wireguard-peer-put-frequently-used-members-above-cac.patch b/target/linux/generic/backport-5.10/080-wireguard-peer-put-frequently-used-members-above-cac.patch new file mode 100644 index 0000000000..1e0115d982 --- /dev/null +++ b/target/linux/generic/backport-5.10/080-wireguard-peer-put-frequently-used-members-above-cac.patch @@ -0,0 +1,42 @@ +From a13827e9091c07e25cdeec9a402d74a27e2a1111 Mon Sep 17 00:00:00 2001 +From: "Jason A. Donenfeld" +Date: Mon, 22 Feb 2021 17:25:46 +0100 +Subject: [PATCH] wireguard: peer: put frequently used members above cache + lines + +commit 5a0598695634a6bb4126818902dd9140cd9df8b6 upstream. + +The is_dead boolean is checked for every single packet, while the +internal_id member is used basically only for pr_debug messages. So it +makes sense to hoist up is_dead into some space formerly unused by a +struct hole, while demoting internal_api to below the lowest struct +cache line. + +Signed-off-by: Jason A. Donenfeld +Signed-off-by: Jakub Kicinski +Signed-off-by: Jason A. Donenfeld +--- + drivers/net/wireguard/peer.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/wireguard/peer.h ++++ b/drivers/net/wireguard/peer.h +@@ -39,6 +39,7 @@ struct wg_peer { + struct crypt_queue tx_queue, rx_queue; + struct sk_buff_head staged_packet_queue; + int serial_work_cpu; ++ bool is_dead; + struct noise_keypairs keypairs; + struct endpoint endpoint; + struct dst_cache endpoint_cache; +@@ -61,9 +62,8 @@ struct wg_peer { + struct rcu_head rcu; + struct list_head peer_list; + struct list_head allowedips_list; +- u64 internal_id; + struct napi_struct napi; +- bool is_dead; ++ u64 internal_id; + }; + + struct wg_peer *wg_peer_create(struct wg_device *wg, diff --git a/target/linux/generic/backport-5.10/081-wireguard-device-do-not-generate-ICMP-for-non-IP-pac.patch b/target/linux/generic/backport-5.10/081-wireguard-device-do-not-generate-ICMP-for-non-IP-pac.patch new file mode 100644 index 0000000000..9b1fee7290 --- /dev/null +++ b/target/linux/generic/backport-5.10/081-wireguard-device-do-not-generate-ICMP-for-non-IP-pac.patch @@ -0,0 +1,47 @@ +From 49da2a610d63cef849f0095e601821ad6edfbef7 Mon Sep 17 00:00:00 2001 +From: "Jason A. Donenfeld" +Date: Mon, 22 Feb 2021 17:25:47 +0100 +Subject: [PATCH] wireguard: device: do not generate ICMP for non-IP packets + +commit 99fff5264e7ab06f45b0ad60243475be0a8d0559 upstream. + +If skb->protocol doesn't match the actual skb->data header, it's +probably not a good idea to pass it off to icmp{,v6}_ndo_send, which is +expecting to reply to a valid IP packet. So this commit has that early +mismatch case jump to a later error label. + +Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") +Signed-off-by: Jason A. Donenfeld +Signed-off-by: Jakub Kicinski +Signed-off-by: Jason A. Donenfeld +--- + drivers/net/wireguard/device.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/drivers/net/wireguard/device.c ++++ b/drivers/net/wireguard/device.c +@@ -138,7 +138,7 @@ static netdev_tx_t wg_xmit(struct sk_buf + else if (skb->protocol == htons(ETH_P_IPV6)) + net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI6\n", + dev->name, &ipv6_hdr(skb)->daddr); +- goto err; ++ goto err_icmp; + } + + family = READ_ONCE(peer->endpoint.addr.sa_family); +@@ -201,12 +201,13 @@ static netdev_tx_t wg_xmit(struct sk_buf + + err_peer: + wg_peer_put(peer); +-err: +- ++dev->stats.tx_errors; ++err_icmp: + if (skb->protocol == htons(ETH_P_IP)) + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + else if (skb->protocol == htons(ETH_P_IPV6)) + icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); ++err: ++ ++dev->stats.tx_errors; + kfree_skb(skb); + return ret; + } diff --git a/target/linux/generic/backport-5.10/082-wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch b/target/linux/generic/backport-5.10/082-wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch new file mode 100644 index 0000000000..1e7c5e881f --- /dev/null +++ b/target/linux/generic/backport-5.10/082-wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch @@ -0,0 +1,560 @@ +From 1771bbcc5bc99f569dd82ec9e1b7c397a2fb50ac Mon Sep 17 00:00:00 2001 +From: "Jason A. Donenfeld" +Date: Mon, 22 Feb 2021 17:25:48 +0100 +Subject: [PATCH] wireguard: queueing: get rid of per-peer ring buffers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 8b5553ace83cced775eefd0f3f18b5c6214ccf7a upstream. + +Having two ring buffers per-peer means that every peer results in two +massive ring allocations. On an 8-core x86_64 machine, this commit +reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which +is an 90% reduction. Ninety percent! With some single-machine +deployments approaching 500,000 peers, we're talking about a reduction +from 7 gigs of memory down to 700 megs of memory. + +In order to get rid of these per-peer allocations, this commit switches +to using a list-based queueing approach. Currently GSO fragments are +chained together using the skb->next pointer (the skb_list_* singly +linked list approach), so we form the per-peer queue around the unused +skb->prev pointer (which sort of makes sense because the links are +pointing backwards). Use of skb_queue_* is not possible here, because +that is based on doubly linked lists and spinlocks. Multiple cores can +write into the queue at any given time, because its writes occur in the +start_xmit path or in the udp_recv path. But reads happen in a single +workqueue item per-peer, amounting to a multi-producer, single-consumer +paradigm. + +The MPSC queue is implemented locklessly and never blocks. However, it +is not linearizable (though it is serializable), with a very tight and +unlikely race on writes, which, when hit (some tiny fraction of the +0.15% of partial adds on a fully loaded 16-core x86_64 system), causes +the queue reader to terminate early. However, because every packet sent +queues up the same workqueue item after it is fully added, the worker +resumes again, and stopping early isn't actually a problem, since at +that point the packet wouldn't have yet been added to the encryption +queue. These properties allow us to avoid disabling interrupts or +spinning. The design is based on Dmitry Vyukov's algorithm [1]. + +Performance-wise, ordinarily list-based queues aren't preferable to +ringbuffers, because of cache misses when following pointers around. +However, we *already* have to follow the adjacent pointers when working +through fragments, so there shouldn't actually be any change there. A +potential downside is that dequeueing is a bit more complicated, but the +ptr_ring structure used prior had a spinlock when dequeueing, so all and +all the difference appears to be a wash. + +Actually, from profiling, the biggest performance hit, by far, of this +commit winds up being atomic_add_unless(count, 1, max) and atomic_ +dec(count), which account for the majority of CPU time, according to +perf. In that sense, the previous ring buffer was superior in that it +could check if it was full by head==tail, which the list-based approach +cannot do. + +But all and all, this enables us to get massive memory savings, allowing +WireGuard to scale for real world deployments, without taking much of a +performance hit. + +[1] http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue + +Reviewed-by: Dmitry Vyukov +Reviewed-by: Toke Høiland-Jørgensen +Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") +Signed-off-by: Jason A. Donenfeld +Signed-off-by: Jakub Kicinski +Signed-off-by: Jason A. Donenfeld +--- + drivers/net/wireguard/device.c | 12 ++--- + drivers/net/wireguard/device.h | 15 +++--- + drivers/net/wireguard/peer.c | 28 ++++------- + drivers/net/wireguard/peer.h | 4 +- + drivers/net/wireguard/queueing.c | 86 +++++++++++++++++++++++++------- + drivers/net/wireguard/queueing.h | 45 ++++++++++++----- + drivers/net/wireguard/receive.c | 16 +++--- + drivers/net/wireguard/send.c | 31 ++++-------- + 8 files changed, 144 insertions(+), 93 deletions(-) + +--- a/drivers/net/wireguard/device.c ++++ b/drivers/net/wireguard/device.c +@@ -235,8 +235,8 @@ static void wg_destruct(struct net_devic + destroy_workqueue(wg->handshake_receive_wq); + destroy_workqueue(wg->handshake_send_wq); + destroy_workqueue(wg->packet_crypt_wq); +- wg_packet_queue_free(&wg->decrypt_queue, true); +- wg_packet_queue_free(&wg->encrypt_queue, true); ++ wg_packet_queue_free(&wg->decrypt_queue); ++ wg_packet_queue_free(&wg->encrypt_queue); + rcu_barrier(); /* Wait for all the peers to be actually freed. */ + wg_ratelimiter_uninit(); + memzero_explicit(&wg->static_identity, sizeof(wg->static_identity)); +@@ -338,12 +338,12 @@ static int wg_newlink(struct net *src_ne + goto err_destroy_handshake_send; + + ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker, +- true, MAX_QUEUED_PACKETS); ++ MAX_QUEUED_PACKETS); + if (ret < 0) + goto err_destroy_packet_crypt; + + ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker, +- true, MAX_QUEUED_PACKETS); ++ MAX_QUEUED_PACKETS); + if (ret < 0) + goto err_free_encrypt_queue; + +@@ -368,9 +368,9 @@ static int wg_newlink(struct net *src_ne + err_uninit_ratelimiter: + wg_ratelimiter_uninit(); + err_free_decrypt_queue: +- wg_packet_queue_free(&wg->decrypt_queue, true); ++ wg_packet_queue_free(&wg->decrypt_queue); + err_free_encrypt_queue: +- wg_packet_queue_free(&wg->encrypt_queue, true); ++ wg_packet_queue_free(&wg->encrypt_queue); + err_destroy_packet_crypt: + destroy_workqueue(wg->packet_crypt_wq); + err_destroy_handshake_send: +--- a/drivers/net/wireguard/device.h ++++ b/drivers/net/wireguard/device.h +@@ -27,13 +27,14 @@ struct multicore_worker { + + struct crypt_queue { + struct ptr_ring ring; +- union { +- struct { +- struct multicore_worker __percpu *worker; +- int last_cpu; +- }; +- struct work_struct work; +- }; ++ struct multicore_worker __percpu *worker; ++ int last_cpu; ++}; ++ ++struct prev_queue { ++ struct sk_buff *head, *tail, *peeked; ++ struct { struct sk_buff *next, *prev; } empty; // Match first 2 members of struct sk_buff. ++ atomic_t count; + }; + + struct wg_device { +--- a/drivers/net/wireguard/peer.c ++++ b/drivers/net/wireguard/peer.c +@@ -32,27 +32,22 @@ struct wg_peer *wg_peer_create(struct wg + peer = kzalloc(sizeof(*peer), GFP_KERNEL); + if (unlikely(!peer)) + return ERR_PTR(ret); +- peer->device = wg; ++ if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL)) ++ goto err; + ++ peer->device = wg; + wg_noise_handshake_init(&peer->handshake, &wg->static_identity, + public_key, preshared_key, peer); +- if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL)) +- goto err_1; +- if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false, +- MAX_QUEUED_PACKETS)) +- goto err_2; +- if (wg_packet_queue_init(&peer->rx_queue, NULL, false, +- MAX_QUEUED_PACKETS)) +- goto err_3; +- + peer->internal_id = atomic64_inc_return(&peer_counter); + peer->serial_work_cpu = nr_cpumask_bits; + wg_cookie_init(&peer->latest_cookie); + wg_timers_init(peer); + wg_cookie_checker_precompute_peer_keys(peer); + spin_lock_init(&peer->keypairs.keypair_update_lock); +- INIT_WORK(&peer->transmit_handshake_work, +- wg_packet_handshake_send_worker); ++ INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker); ++ INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker); ++ wg_prev_queue_init(&peer->tx_queue); ++ wg_prev_queue_init(&peer->rx_queue); + rwlock_init(&peer->endpoint_lock); + kref_init(&peer->refcount); + skb_queue_head_init(&peer->staged_packet_queue); +@@ -68,11 +63,7 @@ struct wg_peer *wg_peer_create(struct wg + pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id); + return peer; + +-err_3: +- wg_packet_queue_free(&peer->tx_queue, false); +-err_2: +- dst_cache_destroy(&peer->endpoint_cache); +-err_1: ++err: + kfree(peer); + return ERR_PTR(ret); + } +@@ -197,8 +188,7 @@ static void rcu_release(struct rcu_head + struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu); + + dst_cache_destroy(&peer->endpoint_cache); +- wg_packet_queue_free(&peer->rx_queue, false); +- wg_packet_queue_free(&peer->tx_queue, false); ++ WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue)); + + /* The final zeroing takes care of clearing any remaining handshake key + * material and other potentially sensitive information. +--- a/drivers/net/wireguard/peer.h ++++ b/drivers/net/wireguard/peer.h +@@ -36,7 +36,7 @@ struct endpoint { + + struct wg_peer { + struct wg_device *device; +- struct crypt_queue tx_queue, rx_queue; ++ struct prev_queue tx_queue, rx_queue; + struct sk_buff_head staged_packet_queue; + int serial_work_cpu; + bool is_dead; +@@ -46,7 +46,7 @@ struct wg_peer { + rwlock_t endpoint_lock; + struct noise_handshake handshake; + atomic64_t last_sent_handshake; +- struct work_struct transmit_handshake_work, clear_peer_work; ++ struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work; + struct cookie latest_cookie; + struct hlist_node pubkey_hash; + u64 rx_bytes, tx_bytes; +--- a/drivers/net/wireguard/queueing.c ++++ b/drivers/net/wireguard/queueing.c +@@ -9,8 +9,7 @@ struct multicore_worker __percpu * + wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr) + { + int cpu; +- struct multicore_worker __percpu *worker = +- alloc_percpu(struct multicore_worker); ++ struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker); + + if (!worker) + return NULL; +@@ -23,7 +22,7 @@ wg_packet_percpu_multicore_worker_alloc( + } + + int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, +- bool multicore, unsigned int len) ++ unsigned int len) + { + int ret; + +@@ -31,25 +30,78 @@ int wg_packet_queue_init(struct crypt_qu + ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL); + if (ret) + return ret; +- if (function) { +- if (multicore) { +- queue->worker = wg_packet_percpu_multicore_worker_alloc( +- function, queue); +- if (!queue->worker) { +- ptr_ring_cleanup(&queue->ring, NULL); +- return -ENOMEM; +- } +- } else { +- INIT_WORK(&queue->work, function); +- } ++ queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue); ++ if (!queue->worker) { ++ ptr_ring_cleanup(&queue->ring, NULL); ++ return -ENOMEM; + } + return 0; + } + +-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore) ++void wg_packet_queue_free(struct crypt_queue *queue) + { +- if (multicore) +- free_percpu(queue->worker); ++ free_percpu(queue->worker); + WARN_ON(!__ptr_ring_empty(&queue->ring)); + ptr_ring_cleanup(&queue->ring, NULL); + } ++ ++#define NEXT(skb) ((skb)->prev) ++#define STUB(queue) ((struct sk_buff *)&queue->empty) ++ ++void wg_prev_queue_init(struct prev_queue *queue) ++{ ++ NEXT(STUB(queue)) = NULL; ++ queue->head = queue->tail = STUB(queue); ++ queue->peeked = NULL; ++ atomic_set(&queue->count, 0); ++ BUILD_BUG_ON( ++ offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) - ++ offsetof(struct prev_queue, empty) || ++ offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) - ++ offsetof(struct prev_queue, empty)); ++} ++ ++static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) ++{ ++ WRITE_ONCE(NEXT(skb), NULL); ++ WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb); ++} ++ ++bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) ++{ ++ if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS)) ++ return false; ++ __wg_prev_queue_enqueue(queue, skb); ++ return true; ++} ++ ++struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue) ++{ ++ struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail)); ++ ++ if (tail == STUB(queue)) { ++ if (!next) ++ return NULL; ++ queue->tail = next; ++ tail = next; ++ next = smp_load_acquire(&NEXT(next)); ++ } ++ if (next) { ++ queue->tail = next; ++ atomic_dec(&queue->count); ++ return tail; ++ } ++ if (tail != READ_ONCE(queue->head)) ++ return NULL; ++ __wg_prev_queue_enqueue(queue, STUB(queue)); ++ next = smp_load_acquire(&NEXT(tail)); ++ if (next) { ++ queue->tail = next; ++ atomic_dec(&queue->count); ++ return tail; ++ } ++ return NULL; ++} ++ ++#undef NEXT ++#undef STUB +--- a/drivers/net/wireguard/queueing.h ++++ b/drivers/net/wireguard/queueing.h +@@ -17,12 +17,13 @@ struct wg_device; + struct wg_peer; + struct multicore_worker; + struct crypt_queue; ++struct prev_queue; + struct sk_buff; + + /* queueing.c APIs: */ + int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, +- bool multicore, unsigned int len); +-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore); ++ unsigned int len); ++void wg_packet_queue_free(struct crypt_queue *queue); + struct multicore_worker __percpu * + wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr); + +@@ -135,8 +136,31 @@ static inline int wg_cpumask_next_online + return cpu; + } + ++void wg_prev_queue_init(struct prev_queue *queue); ++ ++/* Multi producer */ ++bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb); ++ ++/* Single consumer */ ++struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue); ++ ++/* Single consumer */ ++static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue) ++{ ++ if (queue->peeked) ++ return queue->peeked; ++ queue->peeked = wg_prev_queue_dequeue(queue); ++ return queue->peeked; ++} ++ ++/* Single consumer */ ++static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue) ++{ ++ queue->peeked = NULL; ++} ++ + static inline int wg_queue_enqueue_per_device_and_peer( +- struct crypt_queue *device_queue, struct crypt_queue *peer_queue, ++ struct crypt_queue *device_queue, struct prev_queue *peer_queue, + struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu) + { + int cpu; +@@ -145,8 +169,9 @@ static inline int wg_queue_enqueue_per_d + /* We first queue this up for the peer ingestion, but the consumer + * will wait for the state to change to CRYPTED or DEAD before. + */ +- if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb))) ++ if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb))) + return -ENOSPC; ++ + /* Then we queue it up in the device queue, which consumes the + * packet as soon as it can. + */ +@@ -157,9 +182,7 @@ static inline int wg_queue_enqueue_per_d + return 0; + } + +-static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue, +- struct sk_buff *skb, +- enum packet_state state) ++static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state) + { + /* We take a reference, because as soon as we call atomic_set, the + * peer can be freed from below us. +@@ -167,14 +190,12 @@ static inline void wg_queue_enqueue_per_ + struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); + + atomic_set_release(&PACKET_CB(skb)->state, state); +- queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, +- peer->internal_id), +- peer->device->packet_crypt_wq, &queue->work); ++ queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id), ++ peer->device->packet_crypt_wq, &peer->transmit_packet_work); + wg_peer_put(peer); + } + +-static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb, +- enum packet_state state) ++static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state) + { + /* We take a reference, because as soon as we call atomic_set, the + * peer can be freed from below us. +--- a/drivers/net/wireguard/receive.c ++++ b/drivers/net/wireguard/receive.c +@@ -444,7 +444,6 @@ packet_processed: + int wg_packet_rx_poll(struct napi_struct *napi, int budget) + { + struct wg_peer *peer = container_of(napi, struct wg_peer, napi); +- struct crypt_queue *queue = &peer->rx_queue; + struct noise_keypair *keypair; + struct endpoint endpoint; + enum packet_state state; +@@ -455,11 +454,10 @@ int wg_packet_rx_poll(struct napi_struct + if (unlikely(budget <= 0)) + return 0; + +- while ((skb = __ptr_ring_peek(&queue->ring)) != NULL && ++ while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL && + (state = atomic_read_acquire(&PACKET_CB(skb)->state)) != + PACKET_STATE_UNCRYPTED) { +- __ptr_ring_discard_one(&queue->ring); +- peer = PACKET_PEER(skb); ++ wg_prev_queue_drop_peeked(&peer->rx_queue); + keypair = PACKET_CB(skb)->keypair; + free = true; + +@@ -508,7 +506,7 @@ void wg_packet_decrypt_worker(struct wor + enum packet_state state = + likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ? + PACKET_STATE_CRYPTED : PACKET_STATE_DEAD; +- wg_queue_enqueue_per_peer_napi(skb, state); ++ wg_queue_enqueue_per_peer_rx(skb, state); + if (need_resched()) + cond_resched(); + } +@@ -531,12 +529,10 @@ static void wg_packet_consume_data(struc + if (unlikely(READ_ONCE(peer->is_dead))) + goto err; + +- ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, +- &peer->rx_queue, skb, +- wg->packet_crypt_wq, +- &wg->decrypt_queue.last_cpu); ++ ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb, ++ wg->packet_crypt_wq, &wg->decrypt_queue.last_cpu); + if (unlikely(ret == -EPIPE)) +- wg_queue_enqueue_per_peer_napi(skb, PACKET_STATE_DEAD); ++ wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD); + if (likely(!ret || ret == -EPIPE)) { + rcu_read_unlock_bh(); + return; +--- a/drivers/net/wireguard/send.c ++++ b/drivers/net/wireguard/send.c +@@ -239,8 +239,7 @@ void wg_packet_send_keepalive(struct wg_ + wg_packet_send_staged_packets(peer); + } + +-static void wg_packet_create_data_done(struct sk_buff *first, +- struct wg_peer *peer) ++static void wg_packet_create_data_done(struct wg_peer *peer, struct sk_buff *first) + { + struct sk_buff *skb, *next; + bool is_keepalive, data_sent = false; +@@ -262,22 +261,19 @@ static void wg_packet_create_data_done(s + + void wg_packet_tx_worker(struct work_struct *work) + { +- struct crypt_queue *queue = container_of(work, struct crypt_queue, +- work); ++ struct wg_peer *peer = container_of(work, struct wg_peer, transmit_packet_work); + struct noise_keypair *keypair; + enum packet_state state; + struct sk_buff *first; +- struct wg_peer *peer; + +- while ((first = __ptr_ring_peek(&queue->ring)) != NULL && ++ while ((first = wg_prev_queue_peek(&peer->tx_queue)) != NULL && + (state = atomic_read_acquire(&PACKET_CB(first)->state)) != + PACKET_STATE_UNCRYPTED) { +- __ptr_ring_discard_one(&queue->ring); +- peer = PACKET_PEER(first); ++ wg_prev_queue_drop_peeked(&peer->tx_queue); + keypair = PACKET_CB(first)->keypair; + + if (likely(state == PACKET_STATE_CRYPTED)) +- wg_packet_create_data_done(first, peer); ++ wg_packet_create_data_done(peer, first); + else + kfree_skb_list(first); + +@@ -306,16 +302,14 @@ void wg_packet_encrypt_worker(struct wor + break; + } + } +- wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first, +- state); ++ wg_queue_enqueue_per_peer_tx(first, state); + if (need_resched()) + cond_resched(); + } + } + +-static void wg_packet_create_data(struct sk_buff *first) ++static void wg_packet_create_data(struct wg_peer *peer, struct sk_buff *first) + { +- struct wg_peer *peer = PACKET_PEER(first); + struct wg_device *wg = peer->device; + int ret = -EINVAL; + +@@ -323,13 +317,10 @@ static void wg_packet_create_data(struct + if (unlikely(READ_ONCE(peer->is_dead))) + goto err; + +- ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, +- &peer->tx_queue, first, +- wg->packet_crypt_wq, +- &wg->encrypt_queue.last_cpu); ++ ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, &peer->tx_queue, first, ++ wg->packet_crypt_wq, &wg->encrypt_queue.last_cpu); + if (unlikely(ret == -EPIPE)) +- wg_queue_enqueue_per_peer(&peer->tx_queue, first, +- PACKET_STATE_DEAD); ++ wg_queue_enqueue_per_peer_tx(first, PACKET_STATE_DEAD); + err: + rcu_read_unlock_bh(); + if (likely(!ret || ret == -EPIPE)) +@@ -393,7 +384,7 @@ void wg_packet_send_staged_packets(struc + packets.prev->next = NULL; + wg_peer_get(keypair->entry.peer); + PACKET_CB(packets.next)->keypair = keypair; +- wg_packet_create_data(packets.next); ++ wg_packet_create_data(peer, packets.next); + return; + + out_invalid: diff --git a/target/linux/generic/backport-5.10/083-wireguard-kconfig-use-arm-chacha-even-with-no-neon.patch b/target/linux/generic/backport-5.10/083-wireguard-kconfig-use-arm-chacha-even-with-no-neon.patch new file mode 100644 index 0000000000..8a8c04e1a8 --- /dev/null +++ b/target/linux/generic/backport-5.10/083-wireguard-kconfig-use-arm-chacha-even-with-no-neon.patch @@ -0,0 +1,30 @@ +From 514091206bc055a159348ae8575276dc925aea24 Mon Sep 17 00:00:00 2001 +From: "Jason A. Donenfeld" +Date: Mon, 22 Feb 2021 17:25:49 +0100 +Subject: [PATCH] wireguard: kconfig: use arm chacha even with no neon + +commit bce2473927af8de12ad131a743f55d69d358c0b9 upstream. + +The condition here was incorrect: a non-neon fallback implementation is +available on arm32 when NEON is not supported. + +Reported-by: Ilya Lipnitskiy +Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") +Signed-off-by: Jason A. Donenfeld +Signed-off-by: Jakub Kicinski +Signed-off-by: Jason A. Donenfeld +--- + drivers/net/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/Kconfig ++++ b/drivers/net/Kconfig +@@ -87,7 +87,7 @@ config WIREGUARD + select CRYPTO_CURVE25519_X86 if X86 && 64BIT + select ARM_CRYPTO if ARM + select ARM64_CRYPTO if ARM64 +- select CRYPTO_CHACHA20_NEON if (ARM || ARM64) && KERNEL_MODE_NEON ++ select CRYPTO_CHACHA20_NEON if ARM || (ARM64 && KERNEL_MODE_NEON) + select CRYPTO_POLY1305_NEON if ARM64 && KERNEL_MODE_NEON + select CRYPTO_POLY1305_ARM if ARM + select CRYPTO_CURVE25519_NEON if ARM && KERNEL_MODE_NEON -- cgit v1.2.3