diff options
Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch')
-rw-r--r-- | target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch | 272 |
1 files changed, 0 insertions, 272 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch b/target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch deleted file mode 100644 index b31b8d9a0e..0000000000 --- a/target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch +++ /dev/null @@ -1,272 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ard Biesheuvel <ardb@kernel.org> -Date: Tue, 3 Nov 2020 17:28:09 +0100 -Subject: [PATCH] crypto: arm/chacha-neon - optimize for non-block size - multiples - -commit 86cd97ec4b943af35562a74688bc4e909b32c3d1 upstream. - -The current NEON based ChaCha implementation for ARM is optimized for -multiples of 4x the ChaCha block size (64 bytes). This makes sense for -block encryption, but given that ChaCha is also often used in the -context of networking, it makes sense to consider arbitrary length -inputs as well. - -For example, WireGuard typically uses 1420 byte packets, and performing -ChaCha encryption involves 5 invocations of chacha_4block_xor_neon() -and 3 invocations of chacha_block_xor_neon(), where the last one also -involves a memcpy() using a buffer on the stack to process the final -chunk of 1420 % 64 == 12 bytes. - -Let's optimize for this case as well, by letting chacha_4block_xor_neon() -deal with any input size between 64 and 256 bytes, using NEON permutation -instructions and overlapping loads and stores. This way, the 140 byte -tail of a 1420 byte input buffer can simply be processed in one go. - -This results in the following performance improvements for 1420 byte -blocks, without significant impact on power-of-2 input sizes. (Note -that Raspberry Pi is widely used in combination with a 32-bit kernel, -even though the core is 64-bit capable) - - Cortex-A8 (BeagleBone) : 7% - Cortex-A15 (Calxeda Midway) : 21% - Cortex-A53 (Raspberry Pi 3) : 3% - Cortex-A72 (Raspberry Pi 4) : 19% - -Cc: Eric Biggers <ebiggers@google.com> -Cc: "Jason A . Donenfeld" <Jason@zx2c4.com> -Signed-off-by: Ard Biesheuvel <ardb@kernel.org> -Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> -Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> ---- - arch/arm/crypto/chacha-glue.c | 34 +++++------ - arch/arm/crypto/chacha-neon-core.S | 97 +++++++++++++++++++++++++++--- - 2 files changed, 107 insertions(+), 24 deletions(-) - ---- a/arch/arm/crypto/chacha-glue.c -+++ b/arch/arm/crypto/chacha-glue.c -@@ -23,7 +23,7 @@ - asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src, - int nrounds); - asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src, -- int nrounds); -+ int nrounds, unsigned int nbytes); - asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds); - asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); - -@@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8 - { - u8 buf[CHACHA_BLOCK_SIZE]; - -- while (bytes >= CHACHA_BLOCK_SIZE * 4) { -- chacha_4block_xor_neon(state, dst, src, nrounds); -- bytes -= CHACHA_BLOCK_SIZE * 4; -- src += CHACHA_BLOCK_SIZE * 4; -- dst += CHACHA_BLOCK_SIZE * 4; -- state[12] += 4; -- } -- while (bytes >= CHACHA_BLOCK_SIZE) { -- chacha_block_xor_neon(state, dst, src, nrounds); -- bytes -= CHACHA_BLOCK_SIZE; -- src += CHACHA_BLOCK_SIZE; -- dst += CHACHA_BLOCK_SIZE; -- state[12]++; -+ while (bytes > CHACHA_BLOCK_SIZE) { -+ unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); -+ -+ chacha_4block_xor_neon(state, dst, src, nrounds, l); -+ bytes -= l; -+ src += l; -+ dst += l; -+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); - } - if (bytes) { -- memcpy(buf, src, bytes); -- chacha_block_xor_neon(state, buf, buf, nrounds); -- memcpy(dst, buf, bytes); -+ const u8 *s = src; -+ u8 *d = dst; -+ -+ if (bytes != CHACHA_BLOCK_SIZE) -+ s = d = memcpy(buf, src, bytes); -+ chacha_block_xor_neon(state, d, s, nrounds); -+ if (d != dst) -+ memcpy(dst, buf, bytes); - } - } - ---- a/arch/arm/crypto/chacha-neon-core.S -+++ b/arch/arm/crypto/chacha-neon-core.S -@@ -47,6 +47,7 @@ - */ - - #include <linux/linkage.h> -+#include <asm/cache.h> - - .text - .fpu neon -@@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon) - - .align 5 - ENTRY(chacha_4block_xor_neon) -- push {r4-r5} -+ push {r4, lr} - mov r4, sp // preserve the stack pointer - sub ip, sp, #0x20 // allocate a 32 byte buffer - bic ip, ip, #0x1f // aligned to 32 bytes -@@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon) - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [ip] - -- adr r5, .Lctrinc -+ adr lr, .Lctrinc - vdup.32 q15, d7[1] - vdup.32 q14, d7[0] -- vld1.32 {q4}, [r5, :128] -+ vld1.32 {q4}, [lr, :128] - vdup.32 q13, d6[1] - vdup.32 q12, d6[0] - vdup.32 q11, d5[1] -@@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon) - - // Re-interleave the words in the first two rows of each block (x0..7). - // Also add the counter values 0-3 to x12[0-3]. -- vld1.32 {q8}, [r5, :128] // load counter values 0-3 -+ vld1.32 {q8}, [lr, :128] // load counter values 0-3 - vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) - vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) - vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) -@@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon) - - // Re-interleave the words in the last two rows of each block (x8..15). - vld1.32 {q8-q9}, [sp, :256] -+ mov sp, r4 // restore original stack pointer -+ ldr r4, [r4, #8] // load number of bytes - vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) - vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) - vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) -@@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon) - // XOR the rest of the data with the keystream - - vld1.8 {q0-q1}, [r2]! -+ subs r4, r4, #96 - veor q0, q0, q8 - veor q1, q1, q12 -+ ble .Lle96 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! -+ subs r4, r4, #32 - veor q0, q0, q2 - veor q1, q1, q6 -+ ble .Lle128 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! -+ subs r4, r4, #32 - veor q0, q0, q10 - veor q1, q1, q14 -+ ble .Lle160 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! -+ subs r4, r4, #32 - veor q0, q0, q4 - veor q1, q1, q5 -+ ble .Lle192 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! -+ subs r4, r4, #32 - veor q0, q0, q9 - veor q1, q1, q13 -+ ble .Lle224 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! -+ subs r4, r4, #32 - veor q0, q0, q3 - veor q1, q1, q7 -+ blt .Llt256 -+.Lout: - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2] -- mov sp, r4 // restore original stack pointer - veor q0, q0, q11 - veor q1, q1, q15 - vst1.8 {q0-q1}, [r1] - -- pop {r4-r5} -- bx lr -+ pop {r4, pc} -+ -+.Lle192: -+ vmov q4, q9 -+ vmov q5, q13 -+ -+.Lle160: -+ // nothing to do -+ -+.Lfinalblock: -+ // Process the final block if processing less than 4 full blocks. -+ // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the -+ // previous 32 byte output block that still needs to be written at -+ // [r1] in q0-q1. -+ beq .Lfullblock -+ -+.Lpartialblock: -+ adr lr, .Lpermute + 32 -+ add r2, r2, r4 -+ add lr, lr, r4 -+ add r4, r4, r1 -+ -+ vld1.8 {q2-q3}, [lr] -+ vld1.8 {q6-q7}, [r2] -+ -+ add r4, r4, #32 -+ -+ vtbl.8 d4, {q4-q5}, d4 -+ vtbl.8 d5, {q4-q5}, d5 -+ vtbl.8 d6, {q4-q5}, d6 -+ vtbl.8 d7, {q4-q5}, d7 -+ -+ veor q6, q6, q2 -+ veor q7, q7, q3 -+ -+ vst1.8 {q6-q7}, [r4] // overlapping stores -+ vst1.8 {q0-q1}, [r1] -+ pop {r4, pc} -+ -+.Lfullblock: -+ vmov q11, q4 -+ vmov q15, q5 -+ b .Lout -+.Lle96: -+ vmov q4, q2 -+ vmov q5, q6 -+ b .Lfinalblock -+.Lle128: -+ vmov q4, q10 -+ vmov q5, q14 -+ b .Lfinalblock -+.Lle224: -+ vmov q4, q3 -+ vmov q5, q7 -+ b .Lfinalblock -+.Llt256: -+ vmov q4, q11 -+ vmov q5, q15 -+ b .Lpartialblock - ENDPROC(chacha_4block_xor_neon) -+ -+ .align L1_CACHE_SHIFT -+.Lpermute: -+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 -+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f -+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 -+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f -+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 -+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f -+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 -+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f |