Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0133-wireguard-allowedips-remove-nodes-in-O-1.patch')
-rw-r--r-- | target/linux/generic/backport-5.4/080-wireguard-0133-wireguard-allowedips-remove-nodes-in-O-1.patch | 237
1 file changed, 237 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0133-wireguard-allowedips-remove-nodes-in-O-1.patch b/target/linux/generic/backport-5.4/080-wireguard-0133-wireguard-allowedips-remove-nodes-in-O-1.patch
new file mode 100644
index 0000000000..78da24ea46
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0133-wireguard-allowedips-remove-nodes-in-O-1.patch
@@ -0,0 +1,237 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 4 Jun 2021 17:17:36 +0200
+Subject: [PATCH] wireguard: allowedips: remove nodes in O(1)
+
+commit f634f418c227c912e7ea95a3299efdc9b10e4022 upstream.
+
+Previously, deleting peers would require traversing the entire trie in
+order to rebalance nodes and safely free them. This meant that removing
+1000 peers from a trie with a half million nodes would take an extremely
+long time, during which we're holding the rtnl lock. Large-scale users
+were reporting 200ms latencies added to the networking stack as a whole
+every time their userspace software would queue up significant removals.
+That's a serious situation.
+
+This commit fixes that by maintaining a double pointer to the parent's
+bit pointer for each node, and then using the already existing node list
+belonging to each peer to go directly to the node, fix up its pointers,
+and free it with RCU. This means removal is O(1) instead of O(n), and we
+don't use gobs of stack.
+
+The removal algorithm has the same downside as the code that it fixes:
+it won't collapse needlessly long runs of fillers. We can enhance that
+in the future if it ever becomes a problem. This commit documents that
+limitation with a TODO comment in code, a small but meaningful
+improvement over the prior situation.
+
+Currently the biggest flaw, which the next commit addresses, is that
+because this increases the node size on 64-bit machines from 60 bytes to
+68 bytes. 60 rounds up to 64, but 68 rounds up to 128. So we wind up
+using twice as much memory per node, because of power-of-two
+allocations, which is a big bummer. We'll need to figure something out
+there.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/allowedips.c | 132 ++++++++++++-----------------
+ drivers/net/wireguard/allowedips.h |   9 +-
+ 2 files changed, 57 insertions(+), 84 deletions(-)
+
+--- a/drivers/net/wireguard/allowedips.c
++++ b/drivers/net/wireguard/allowedips.c
+@@ -66,60 +66,6 @@ static void root_remove_peer_lists(struc
+ 	}
+ }
+ 
+-static void walk_remove_by_peer(struct allowedips_node __rcu **top,
+-				struct wg_peer *peer, struct mutex *lock)
+-{
+-#define REF(p) rcu_access_pointer(p)
+-#define DEREF(p) rcu_dereference_protected(*(p), lockdep_is_held(lock))
+-#define PUSH(p) ({                                                             \
+-		WARN_ON(IS_ENABLED(DEBUG) && len >= 128);                      \
+-		stack[len++] = p;                                              \
+-	})
+-
+-	struct allowedips_node __rcu **stack[128], **nptr;
+-	struct allowedips_node *node, *prev;
+-	unsigned int len;
+-
+-	if (unlikely(!peer || !REF(*top)))
+-		return;
+-
+-	for (prev = NULL, len = 0, PUSH(top); len > 0; prev = node) {
+-		nptr = stack[len - 1];
+-		node = DEREF(nptr);
+-		if (!node) {
+-			--len;
+-			continue;
+-		}
+-		if (!prev || REF(prev->bit[0]) == node ||
+-		    REF(prev->bit[1]) == node) {
+-			if (REF(node->bit[0]))
+-				PUSH(&node->bit[0]);
+-			else if (REF(node->bit[1]))
+-				PUSH(&node->bit[1]);
+-		} else if (REF(node->bit[0]) == prev) {
+-			if (REF(node->bit[1]))
+-				PUSH(&node->bit[1]);
+-		} else {
+-			if (rcu_dereference_protected(node->peer,
+-				lockdep_is_held(lock)) == peer) {
+-				RCU_INIT_POINTER(node->peer, NULL);
+-				list_del_init(&node->peer_list);
+-				if (!node->bit[0] || !node->bit[1]) {
+-					rcu_assign_pointer(*nptr, DEREF(
+-					       &node->bit[!REF(node->bit[0])]));
+-					kfree_rcu(node, rcu);
+-					node = DEREF(nptr);
+-				}
+-			}
+-			--len;
+-		}
+-	}
+-
+-#undef REF
+-#undef DEREF
+-#undef PUSH
+-}
+-
+ static unsigned int fls128(u64 a, u64 b)
+ {
+ 	return a ? fls64(a) + 64U : fls64(b);
+@@ -224,6 +170,7 @@ static int add(struct allowedips_node __
+ 		RCU_INIT_POINTER(node->peer, peer);
+ 		list_add_tail(&node->peer_list, &peer->allowedips_list);
+ 		copy_and_assign_cidr(node, key, cidr, bits);
++		rcu_assign_pointer(node->parent_bit, trie);
+ 		rcu_assign_pointer(*trie, node);
+ 		return 0;
+ 	}
+@@ -243,9 +190,9 @@ static int add(struct allowedips_node __
+ 	if (!node) {
+ 		down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
+ 	} else {
+-		down = rcu_dereference_protected(CHOOSE_NODE(node, key),
+-						 lockdep_is_held(lock));
++		down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
+ 		if (!down) {
++			rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
+ 			rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
+ 			return 0;
+ 		}
+@@ -254,29 +201,37 @@ static int add(struct allowedips_node __
+ 	parent = node;
+ 
+ 	if (newnode->cidr == cidr) {
++		rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
+ 		rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
+-		if (!parent)
++		if (!parent) {
++			rcu_assign_pointer(newnode->parent_bit, trie);
+ 			rcu_assign_pointer(*trie, newnode);
+-		else
+-			rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits),
+-					   newnode);
+-	} else {
+-		node = kzalloc(sizeof(*node), GFP_KERNEL);
+-		if (unlikely(!node)) {
+-			list_del(&newnode->peer_list);
+-			kfree(newnode);
+-			return -ENOMEM;
++		} else {
++			rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
++			rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
+ 		}
+-		INIT_LIST_HEAD(&node->peer_list);
+-		copy_and_assign_cidr(node, newnode->bits, cidr, bits);
++		return 0;
++	}
++
++	node = kzalloc(sizeof(*node), GFP_KERNEL);
++	if (unlikely(!node)) {
++		list_del(&newnode->peer_list);
++		kfree(newnode);
++		return -ENOMEM;
++	}
++	INIT_LIST_HEAD(&node->peer_list);
++	copy_and_assign_cidr(node, newnode->bits, cidr, bits);
+ 
+-		rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
+-		rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
+-		if (!parent)
+-			rcu_assign_pointer(*trie, node);
+-		else
+-			rcu_assign_pointer(CHOOSE_NODE(parent, node->bits),
+-					   node);
++	rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
++	rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
++	rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
++	rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
++	if (!parent) {
++		rcu_assign_pointer(node->parent_bit, trie);
++		rcu_assign_pointer(*trie, node);
++	} else {
++		rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
++		rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
+ 	}
+ 	return 0;
+ }
+@@ -335,9 +290,30 @@ int wg_allowedips_insert_v6(struct allow
+ void wg_allowedips_remove_by_peer(struct allowedips *table,
+ 				  struct wg_peer *peer, struct mutex *lock)
+ {
++	struct allowedips_node *node, *child, *tmp;
++
++	if (list_empty(&peer->allowedips_list))
++		return;
+ 	++table->seq;
+-	walk_remove_by_peer(&table->root4, peer, lock);
+-	walk_remove_by_peer(&table->root6, peer, lock);
++	list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) {
++		list_del_init(&node->peer_list);
++		RCU_INIT_POINTER(node->peer, NULL);
++		if (node->bit[0] && node->bit[1])
++			continue;
++		child = rcu_dereference_protected(
++				node->bit[!rcu_access_pointer(node->bit[0])],
++				lockdep_is_held(lock));
++		if (child)
++			child->parent_bit = node->parent_bit;
++		*rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
++		kfree_rcu(node, rcu);
++
++		/* TODO: Note that we currently don't walk up and down in order to
++		 * free any potential filler nodes. This means that this function
++		 * doesn't free up as much as it could, which could be revisited
++		 * at some point.
++		 */
++	}
+ }
+ 
+ int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr)
+--- a/drivers/net/wireguard/allowedips.h
++++ b/drivers/net/wireguard/allowedips.h
+@@ -15,14 +15,11 @@ struct wg_peer;
+ struct allowedips_node {
+ 	struct wg_peer __rcu *peer;
+ 	struct allowedips_node __rcu *bit[2];
+-	/* While it may seem scandalous that we waste space for v4,
+-	 * we're alloc'ing to the nearest power of 2 anyway, so this
+-	 * doesn't actually make a difference.
+-	 */
+-	u8 bits[16] __aligned(__alignof(u64));
+ 	u8 cidr, bit_at_a, bit_at_b, bitlen;
++	u8 bits[16] __aligned(__alignof(u64));
+ 
+-	/* Keep rarely used list at bottom to be beyond cache line. */
++	/* Keep rarely used members at bottom to be beyond cache line. */
++	struct allowedips_node *__rcu *parent_bit; /* XXX: this puts us at 68->128 bytes instead of 60->64 bytes!! */
+ 	union {
+ 		struct list_head peer_list;
+ 		struct rcu_head rcu;
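
For readers skimming the backport, here is a minimal sketch of the parent_bit technique the commit message describes: each trie node remembers the address of the pointer that points at it, so a node can be unlinked without walking the trie. All RCU, locking, and peer bookkeeping are stripped out, and the names only loosely mirror the kernel code; this is an illustration under those simplifications, not the upstream implementation.

#include <stdlib.h>

/* Simplified trie node. parent_bit is the "double pointer" from the
 * commit message: the address of whichever pointer currently refers to
 * this node, i.e. &parent->bit[0], &parent->bit[1], or the trie's root
 * pointer itself.
 */
struct node {
	struct node *bit[2];      /* left/right children of the binary trie */
	struct node **parent_bit; /* slot in the parent (or root) that holds us */
	void *peer;               /* payload; NULL marks an internal "filler" */
};

/* Unlink one node in O(1): no traversal, no 128-entry stack. */
static void node_remove(struct node *node)
{
	struct node *child;

	node->peer = NULL;
	if (node->bit[0] && node->bit[1])
		return; /* two children: node must stay behind as a filler */

	/* At most one child: splice it into whatever pointed at us. */
	child = node->bit[0] ? node->bit[0] : node->bit[1];
	if (child)
		child->parent_bit = node->parent_bit;
	*node->parent_bit = child;
	free(node); /* the kernel defers this with kfree_rcu() instead */
}

This mirrors the shape of the new wg_allowedips_remove_by_peer() loop above: clear the peer, keep two-child nodes as fillers (the TODO in the patch), otherwise splice the lone child, or NULL, into *parent_bit and free the node. The price is one extra pointer per node, which is what the XXX comment in allowedips.h laments: 68 bytes rounds up to a 128-byte power-of-two allocation where 60 bytes fit in 64, and the next commit in the series deals with that.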