aboutsummaryrefslogtreecommitdiffstats
path: root/target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch
diff options
context:
space:
mode:
Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch')
-rw-r--r--target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch272
1 files changed, 0 insertions, 272 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch b/target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch
deleted file mode 100644
index b31b8d9a0e..0000000000
--- a/target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch
+++ /dev/null
@@ -1,272 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Ard Biesheuvel <ardb@kernel.org>
-Date: Tue, 3 Nov 2020 17:28:09 +0100
-Subject: [PATCH] crypto: arm/chacha-neon - optimize for non-block size
- multiples
-
-commit 86cd97ec4b943af35562a74688bc4e909b32c3d1 upstream.
-
-The current NEON based ChaCha implementation for ARM is optimized for
-multiples of 4x the ChaCha block size (64 bytes). This makes sense for
-block encryption, but given that ChaCha is also often used in the
-context of networking, it makes sense to consider arbitrary length
-inputs as well.
-
-For example, WireGuard typically uses 1420 byte packets, and performing
-ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
-and 3 invocations of chacha_block_xor_neon(), where the last one also
-involves a memcpy() using a buffer on the stack to process the final
-chunk of 1420 % 64 == 12 bytes.
-
-Let's optimize for this case as well, by letting chacha_4block_xor_neon()
-deal with any input size between 64 and 256 bytes, using NEON permutation
-instructions and overlapping loads and stores. This way, the 140 byte
-tail of a 1420 byte input buffer can simply be processed in one go.
-
-This results in the following performance improvements for 1420 byte
-blocks, without significant impact on power-of-2 input sizes. (Note
-that Raspberry Pi is widely used in combination with a 32-bit kernel,
-even though the core is 64-bit capable)
-
- Cortex-A8 (BeagleBone) : 7%
- Cortex-A15 (Calxeda Midway) : 21%
- Cortex-A53 (Raspberry Pi 3) : 3%
- Cortex-A72 (Raspberry Pi 4) : 19%
-
-Cc: Eric Biggers <ebiggers@google.com>
-Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
-Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
-Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
----
- arch/arm/crypto/chacha-glue.c | 34 +++++------
- arch/arm/crypto/chacha-neon-core.S | 97 +++++++++++++++++++++++++++---
- 2 files changed, 107 insertions(+), 24 deletions(-)
-
---- a/arch/arm/crypto/chacha-glue.c
-+++ b/arch/arm/crypto/chacha-glue.c
-@@ -23,7 +23,7 @@
- asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
- int nrounds);
- asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
-- int nrounds);
-+ int nrounds, unsigned int nbytes);
- asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
- asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
-
-@@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8
- {
- u8 buf[CHACHA_BLOCK_SIZE];
-
-- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-- chacha_4block_xor_neon(state, dst, src, nrounds);
-- bytes -= CHACHA_BLOCK_SIZE * 4;
-- src += CHACHA_BLOCK_SIZE * 4;
-- dst += CHACHA_BLOCK_SIZE * 4;
-- state[12] += 4;
-- }
-- while (bytes >= CHACHA_BLOCK_SIZE) {
-- chacha_block_xor_neon(state, dst, src, nrounds);
-- bytes -= CHACHA_BLOCK_SIZE;
-- src += CHACHA_BLOCK_SIZE;
-- dst += CHACHA_BLOCK_SIZE;
-- state[12]++;
-+ while (bytes > CHACHA_BLOCK_SIZE) {
-+ unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
-+
-+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
-+ bytes -= l;
-+ src += l;
-+ dst += l;
-+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
- }
- if (bytes) {
-- memcpy(buf, src, bytes);
-- chacha_block_xor_neon(state, buf, buf, nrounds);
-- memcpy(dst, buf, bytes);
-+ const u8 *s = src;
-+ u8 *d = dst;
-+
-+ if (bytes != CHACHA_BLOCK_SIZE)
-+ s = d = memcpy(buf, src, bytes);
-+ chacha_block_xor_neon(state, d, s, nrounds);
-+ if (d != dst)
-+ memcpy(dst, buf, bytes);
- }
- }
-
---- a/arch/arm/crypto/chacha-neon-core.S
-+++ b/arch/arm/crypto/chacha-neon-core.S
-@@ -47,6 +47,7 @@
- */
-
- #include <linux/linkage.h>
-+#include <asm/cache.h>
-
- .text
- .fpu neon
-@@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon)
-
- .align 5
- ENTRY(chacha_4block_xor_neon)
-- push {r4-r5}
-+ push {r4, lr}
- mov r4, sp // preserve the stack pointer
- sub ip, sp, #0x20 // allocate a 32 byte buffer
- bic ip, ip, #0x1f // aligned to 32 bytes
-@@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon)
- vld1.32 {q0-q1}, [r0]
- vld1.32 {q2-q3}, [ip]
-
-- adr r5, .Lctrinc
-+ adr lr, .Lctrinc
- vdup.32 q15, d7[1]
- vdup.32 q14, d7[0]
-- vld1.32 {q4}, [r5, :128]
-+ vld1.32 {q4}, [lr, :128]
- vdup.32 q13, d6[1]
- vdup.32 q12, d6[0]
- vdup.32 q11, d5[1]
-@@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon)
-
- // Re-interleave the words in the first two rows of each block (x0..7).
- // Also add the counter values 0-3 to x12[0-3].
-- vld1.32 {q8}, [r5, :128] // load counter values 0-3
-+ vld1.32 {q8}, [lr, :128] // load counter values 0-3
- vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
- vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
- vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
-@@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon)
-
- // Re-interleave the words in the last two rows of each block (x8..15).
- vld1.32 {q8-q9}, [sp, :256]
-+ mov sp, r4 // restore original stack pointer
-+ ldr r4, [r4, #8] // load number of bytes
- vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
- vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
- vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
-@@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon)
- // XOR the rest of the data with the keystream
-
- vld1.8 {q0-q1}, [r2]!
-+ subs r4, r4, #96
- veor q0, q0, q8
- veor q1, q1, q12
-+ ble .Lle96
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
-+ subs r4, r4, #32
- veor q0, q0, q2
- veor q1, q1, q6
-+ ble .Lle128
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
-+ subs r4, r4, #32
- veor q0, q0, q10
- veor q1, q1, q14
-+ ble .Lle160
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
-+ subs r4, r4, #32
- veor q0, q0, q4
- veor q1, q1, q5
-+ ble .Lle192
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
-+ subs r4, r4, #32
- veor q0, q0, q9
- veor q1, q1, q13
-+ ble .Lle224
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
-+ subs r4, r4, #32
- veor q0, q0, q3
- veor q1, q1, q7
-+ blt .Llt256
-+.Lout:
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]
-- mov sp, r4 // restore original stack pointer
- veor q0, q0, q11
- veor q1, q1, q15
- vst1.8 {q0-q1}, [r1]
-
-- pop {r4-r5}
-- bx lr
-+ pop {r4, pc}
-+
-+.Lle192:
-+ vmov q4, q9
-+ vmov q5, q13
-+
-+.Lle160:
-+ // nothing to do
-+
-+.Lfinalblock:
-+ // Process the final block if processing less than 4 full blocks.
-+ // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
-+ // previous 32 byte output block that still needs to be written at
-+ // [r1] in q0-q1.
-+ beq .Lfullblock
-+
-+.Lpartialblock:
-+ adr lr, .Lpermute + 32
-+ add r2, r2, r4
-+ add lr, lr, r4
-+ add r4, r4, r1
-+
-+ vld1.8 {q2-q3}, [lr]
-+ vld1.8 {q6-q7}, [r2]
-+
-+ add r4, r4, #32
-+
-+ vtbl.8 d4, {q4-q5}, d4
-+ vtbl.8 d5, {q4-q5}, d5
-+ vtbl.8 d6, {q4-q5}, d6
-+ vtbl.8 d7, {q4-q5}, d7
-+
-+ veor q6, q6, q2
-+ veor q7, q7, q3
-+
-+ vst1.8 {q6-q7}, [r4] // overlapping stores
-+ vst1.8 {q0-q1}, [r1]
-+ pop {r4, pc}
-+
-+.Lfullblock:
-+ vmov q11, q4
-+ vmov q15, q5
-+ b .Lout
-+.Lle96:
-+ vmov q4, q2
-+ vmov q5, q6
-+ b .Lfinalblock
-+.Lle128:
-+ vmov q4, q10
-+ vmov q5, q14
-+ b .Lfinalblock
-+.Lle224:
-+ vmov q4, q3
-+ vmov q5, q7
-+ b .Lfinalblock
-+.Llt256:
-+ vmov q4, q11
-+ vmov q5, q15
-+ b .Lpartialblock
- ENDPROC(chacha_4block_xor_neon)
-+
-+ .align L1_CACHE_SHIFT
-+.Lpermute:
-+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
-+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
-+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
-+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
-+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
-+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f