path: root/target/linux/generic/backport-5.4/080-wireguard-0133-wireguard-allowedips-remove-nodes-in-O-1.patch
Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0133-wireguard-allowedips-remove-nodes-in-O-1.patch')
-rw-r--r--  target/linux/generic/backport-5.4/080-wireguard-0133-wireguard-allowedips-remove-nodes-in-O-1.patch | 237
1 file changed, 237 insertions(+), 0 deletions(-)
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0133-wireguard-allowedips-remove-nodes-in-O-1.patch b/target/linux/generic/backport-5.4/080-wireguard-0133-wireguard-allowedips-remove-nodes-in-O-1.patch
new file mode 100644
index 0000000000..78da24ea46
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0133-wireguard-allowedips-remove-nodes-in-O-1.patch
@@ -0,0 +1,237 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 4 Jun 2021 17:17:36 +0200
+Subject: [PATCH] wireguard: allowedips: remove nodes in O(1)
+
+commit f634f418c227c912e7ea95a3299efdc9b10e4022 upstream.
+
+Previously, deleting peers would require traversing the entire trie in
+order to rebalance nodes and safely free them. This meant that removing
+1000 peers from a trie with a half million nodes would take an extremely
+long time, during which we're holding the rtnl lock. Large-scale users
+were reporting 200ms latencies added to the networking stack as a whole
+every time their userspace software would queue up significant removals.
+That's a serious situation.
+
+This commit fixes that by maintaining a double pointer to the parent's
+bit pointer for each node, and then using the already existing node list
+belonging to each peer to go directly to the node, fix up its pointers,
+and free it with RCU. This means removal is O(1) instead of O(n), and we
+don't use gobs of stack.
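+
+A minimal sketch of the unlink this enables, assuming the node has at
+most one child (the two-child case is skipped, as in the real code) and
+with the RCU annotations and lockdep-checked dereferences elided:
+
+	/* parent_bit holds the address of whichever pointer in the
+	 * parent node (or the trie root) currently refers to this node.
+	 */
+	child = node->bit[!node->bit[0]];   /* the lone child, if any */
+	if (child)
+		child->parent_bit = node->parent_bit;
+	*node->parent_bit = child;          /* splice the node out */
+	kfree_rcu(node, rcu);               /* free after a grace period */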
+
+The removal algorithm has the same downside as the code that it fixes:
+it won't collapse needlessly long runs of fillers. We can enhance that
+in the future if it ever becomes a problem. This commit documents that
+limitation with a TODO comment in code, a small but meaningful
+improvement over the prior situation.
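+
+As an illustrative example (not from the patch itself): if a peer owned
+both leaves hanging off a chain of internal filler nodes, removal
+splices out only the leaves; the now-childless fillers above them stay
+allocated until the whole trie is freed.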
+
+Currently the biggest flaw, which the next commit addresses, is that
+this increases the node size on 64-bit machines from 60 bytes to 68
+bytes. 60 rounds up to 64, but 68 rounds up to 128. So we wind up
+using twice as much memory per node, because of power-of-two
+allocations, which is a big bummer. We'll need to figure something out
+there.
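+
+For illustration only (a hypothetical helper, not kernel code): kmalloc
+serves objects from power-of-two size classes, so under the pure
+power-of-two model this message refers to, the bucket chosen for a
+given object size behaves like:
+
+	static size_t slab_bucket(size_t size)
+	{
+		size_t bucket = 8;
+
+		while (bucket < size)   /* round up to a power of two */
+			bucket <<= 1;
+		return bucket;          /* slab_bucket(60) == 64,
+					 * slab_bucket(68) == 128 */
+	}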
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/allowedips.c | 132 ++++++++++++-----------------
+ drivers/net/wireguard/allowedips.h | 9 +-
+ 2 files changed, 57 insertions(+), 84 deletions(-)
+
+--- a/drivers/net/wireguard/allowedips.c
++++ b/drivers/net/wireguard/allowedips.c
+@@ -66,60 +66,6 @@ static void root_remove_peer_lists(struc
+ }
+ }
+
+-static void walk_remove_by_peer(struct allowedips_node __rcu **top,
+- struct wg_peer *peer, struct mutex *lock)
+-{
+-#define REF(p) rcu_access_pointer(p)
+-#define DEREF(p) rcu_dereference_protected(*(p), lockdep_is_held(lock))
+-#define PUSH(p) ({ \
+- WARN_ON(IS_ENABLED(DEBUG) && len >= 128); \
+- stack[len++] = p; \
+- })
+-
+- struct allowedips_node __rcu **stack[128], **nptr;
+- struct allowedips_node *node, *prev;
+- unsigned int len;
+-
+- if (unlikely(!peer || !REF(*top)))
+- return;
+-
+- for (prev = NULL, len = 0, PUSH(top); len > 0; prev = node) {
+- nptr = stack[len - 1];
+- node = DEREF(nptr);
+- if (!node) {
+- --len;
+- continue;
+- }
+- if (!prev || REF(prev->bit[0]) == node ||
+- REF(prev->bit[1]) == node) {
+- if (REF(node->bit[0]))
+- PUSH(&node->bit[0]);
+- else if (REF(node->bit[1]))
+- PUSH(&node->bit[1]);
+- } else if (REF(node->bit[0]) == prev) {
+- if (REF(node->bit[1]))
+- PUSH(&node->bit[1]);
+- } else {
+- if (rcu_dereference_protected(node->peer,
+- lockdep_is_held(lock)) == peer) {
+- RCU_INIT_POINTER(node->peer, NULL);
+- list_del_init(&node->peer_list);
+- if (!node->bit[0] || !node->bit[1]) {
+- rcu_assign_pointer(*nptr, DEREF(
+- &node->bit[!REF(node->bit[0])]));
+- kfree_rcu(node, rcu);
+- node = DEREF(nptr);
+- }
+- }
+- --len;
+- }
+- }
+-
+-#undef REF
+-#undef DEREF
+-#undef PUSH
+-}
+-
+ static unsigned int fls128(u64 a, u64 b)
+ {
+ return a ? fls64(a) + 64U : fls64(b);
+@@ -224,6 +170,7 @@ static int add(struct allowedips_node __
+ RCU_INIT_POINTER(node->peer, peer);
+ list_add_tail(&node->peer_list, &peer->allowedips_list);
+ copy_and_assign_cidr(node, key, cidr, bits);
++ rcu_assign_pointer(node->parent_bit, trie);
+ rcu_assign_pointer(*trie, node);
+ return 0;
+ }
+@@ -243,9 +190,9 @@ static int add(struct allowedips_node __
+ if (!node) {
+ down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
+ } else {
+- down = rcu_dereference_protected(CHOOSE_NODE(node, key),
+- lockdep_is_held(lock));
++ down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
+ if (!down) {
++ rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
+ rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
+ return 0;
+ }
+@@ -254,29 +201,37 @@ static int add(struct allowedips_node __
+ parent = node;
+
+ if (newnode->cidr == cidr) {
++ rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
+ rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
+- if (!parent)
++ if (!parent) {
++ rcu_assign_pointer(newnode->parent_bit, trie);
+ rcu_assign_pointer(*trie, newnode);
+- else
+- rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits),
+- newnode);
+- } else {
+- node = kzalloc(sizeof(*node), GFP_KERNEL);
+- if (unlikely(!node)) {
+- list_del(&newnode->peer_list);
+- kfree(newnode);
+- return -ENOMEM;
++ } else {
++ rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
++ rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
+ }
+- INIT_LIST_HEAD(&node->peer_list);
+- copy_and_assign_cidr(node, newnode->bits, cidr, bits);
++ return 0;
++ }
++
++ node = kzalloc(sizeof(*node), GFP_KERNEL);
++ if (unlikely(!node)) {
++ list_del(&newnode->peer_list);
++ kfree(newnode);
++ return -ENOMEM;
++ }
++ INIT_LIST_HEAD(&node->peer_list);
++ copy_and_assign_cidr(node, newnode->bits, cidr, bits);
+
+- rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
+- rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
+- if (!parent)
+- rcu_assign_pointer(*trie, node);
+- else
+- rcu_assign_pointer(CHOOSE_NODE(parent, node->bits),
+- node);
++ rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
++ rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
++ rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
++ rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
++ if (!parent) {
++ rcu_assign_pointer(node->parent_bit, trie);
++ rcu_assign_pointer(*trie, node);
++ } else {
++ rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
++ rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
+ }
+ return 0;
+ }
+@@ -335,9 +290,30 @@ int wg_allowedips_insert_v6(struct allow
+ void wg_allowedips_remove_by_peer(struct allowedips *table,
+ struct wg_peer *peer, struct mutex *lock)
+ {
++ struct allowedips_node *node, *child, *tmp;
++
++ if (list_empty(&peer->allowedips_list))
++ return;
+ ++table->seq;
+- walk_remove_by_peer(&table->root4, peer, lock);
+- walk_remove_by_peer(&table->root6, peer, lock);
++ list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) {
++ list_del_init(&node->peer_list);
++ RCU_INIT_POINTER(node->peer, NULL);
++ if (node->bit[0] && node->bit[1])
++ continue;
++ child = rcu_dereference_protected(
++ node->bit[!rcu_access_pointer(node->bit[0])],
++ lockdep_is_held(lock));
++ if (child)
++ child->parent_bit = node->parent_bit;
++ *rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
++ kfree_rcu(node, rcu);
++
++ /* TODO: Note that we currently don't walk up and down in order to
++ * free any potential filler nodes. This means that this function
++ * doesn't free up as much as it could, which could be revisited
++ * at some point.
++ */
++ }
+ }
+
+ int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr)
+--- a/drivers/net/wireguard/allowedips.h
++++ b/drivers/net/wireguard/allowedips.h
+@@ -15,14 +15,11 @@ struct wg_peer;
+ struct allowedips_node {
+ struct wg_peer __rcu *peer;
+ struct allowedips_node __rcu *bit[2];
+- /* While it may seem scandalous that we waste space for v4,
+- * we're alloc'ing to the nearest power of 2 anyway, so this
+- * doesn't actually make a difference.
+- */
+- u8 bits[16] __aligned(__alignof(u64));
+ u8 cidr, bit_at_a, bit_at_b, bitlen;
++ u8 bits[16] __aligned(__alignof(u64));
+
+- /* Keep rarely used list at bottom to be beyond cache line. */
++ /* Keep rarely used members at bottom to be beyond cache line. */
++ struct allowedips_node *__rcu *parent_bit; /* XXX: this puts us at 68->128 bytes instead of 60->64 bytes!! */
+ union {
+ struct list_head peer_list;
+ struct rcu_head rcu;