From 1265dbafcdce30c6e6db914b70f2824851c0e0b7 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 2 Mar 2021 09:51:42 +0100 Subject: kernel-5.10: backport chacha non block size optimizations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These make a big difference when doing WireGuard with small armv7 routers, and the 5.4 backport already has it. Suggested-by: Ilya Lipnitskiy Cc: David Bauer Cc: Petr Štetiar Signed-off-by: Jason A. Donenfeld --- ...chacha-neon-optimize-for-non-block-size-m.patch | 272 +++++++++++++++++++++ ...chacha-neon-add-missing-counter-increment.patch | 38 +++ 2 files changed, 310 insertions(+) create mode 100644 target/linux/generic/backport-5.10/071-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch create mode 100644 target/linux/generic/backport-5.10/072-crypto-arm-chacha-neon-add-missing-counter-increment.patch diff --git a/target/linux/generic/backport-5.10/071-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch b/target/linux/generic/backport-5.10/071-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch new file mode 100644 index 0000000000..b1f46e9af8 --- /dev/null +++ b/target/linux/generic/backport-5.10/071-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch @@ -0,0 +1,272 @@ +From 03662fcd41f4b764857f17b95f9a2a63c24bddd4 Mon Sep 17 00:00:00 2001 +From: Ard Biesheuvel +Date: Tue, 3 Nov 2020 17:28:09 +0100 +Subject: [PATCH 1/2] crypto: arm/chacha-neon - optimize for non-block size + multiples + +commit 86cd97ec4b943af35562a74688bc4e909b32c3d1 upstream. + +The current NEON based ChaCha implementation for ARM is optimized for +multiples of 4x the ChaCha block size (64 bytes). This makes sense for +block encryption, but given that ChaCha is also often used in the +context of networking, it makes sense to consider arbitrary length +inputs as well. 
+ +For example, WireGuard typically uses 1420 byte packets, and performing +ChaCha encryption involves 5 invocations of chacha_4block_xor_neon() +and 3 invocations of chacha_block_xor_neon(), where the last one also +involves a memcpy() using a buffer on the stack to process the final +chunk of 1420 % 64 == 12 bytes. + +Let's optimize for this case as well, by letting chacha_4block_xor_neon() +deal with any input size between 64 and 256 bytes, using NEON permutation +instructions and overlapping loads and stores. This way, the 140 byte +tail of a 1420 byte input buffer can simply be processed in one go. + +This results in the following performance improvements for 1420 byte +blocks, without significant impact on power-of-2 input sizes. (Note +that Raspberry Pi is widely used in combination with a 32-bit kernel, +even though the core is 64-bit capable) + + Cortex-A8 (BeagleBone) : 7% + Cortex-A15 (Calxeda Midway) : 21% + Cortex-A53 (Raspberry Pi 3) : 3% + Cortex-A72 (Raspberry Pi 4) : 19% + +Cc: Eric Biggers +Cc: "Jason A . Donenfeld" +Signed-off-by: Ard Biesheuvel +Signed-off-by: Herbert Xu +Signed-off-by: Jason A. 
Donenfeld +--- + arch/arm/crypto/chacha-glue.c | 34 +++++------ + arch/arm/crypto/chacha-neon-core.S | 97 +++++++++++++++++++++++++++--- + 2 files changed, 107 insertions(+), 24 deletions(-) + +--- a/arch/arm/crypto/chacha-glue.c ++++ b/arch/arm/crypto/chacha-glue.c +@@ -23,7 +23,7 @@ + asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src, + int nrounds); + asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src, +- int nrounds); ++ int nrounds, unsigned int nbytes); + asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds); + asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); + +@@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8 + { + u8 buf[CHACHA_BLOCK_SIZE]; + +- while (bytes >= CHACHA_BLOCK_SIZE * 4) { +- chacha_4block_xor_neon(state, dst, src, nrounds); +- bytes -= CHACHA_BLOCK_SIZE * 4; +- src += CHACHA_BLOCK_SIZE * 4; +- dst += CHACHA_BLOCK_SIZE * 4; +- state[12] += 4; +- } +- while (bytes >= CHACHA_BLOCK_SIZE) { +- chacha_block_xor_neon(state, dst, src, nrounds); +- bytes -= CHACHA_BLOCK_SIZE; +- src += CHACHA_BLOCK_SIZE; +- dst += CHACHA_BLOCK_SIZE; +- state[12]++; ++ while (bytes > CHACHA_BLOCK_SIZE) { ++ unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); ++ ++ chacha_4block_xor_neon(state, dst, src, nrounds, l); ++ bytes -= l; ++ src += l; ++ dst += l; ++ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); + } + if (bytes) { +- memcpy(buf, src, bytes); +- chacha_block_xor_neon(state, buf, buf, nrounds); +- memcpy(dst, buf, bytes); ++ const u8 *s = src; ++ u8 *d = dst; ++ ++ if (bytes != CHACHA_BLOCK_SIZE) ++ s = d = memcpy(buf, src, bytes); ++ chacha_block_xor_neon(state, d, s, nrounds); ++ if (d != dst) ++ memcpy(dst, buf, bytes); + } + } + +--- a/arch/arm/crypto/chacha-neon-core.S ++++ b/arch/arm/crypto/chacha-neon-core.S +@@ -47,6 +47,7 @@ + */ + + #include <linux/linkage.h> ++#include <asm/cache.h> + + .text + .fpu neon +@@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon) + + .align 
5 + ENTRY(chacha_4block_xor_neon) +- push {r4-r5} ++ push {r4, lr} + mov r4, sp // preserve the stack pointer + sub ip, sp, #0x20 // allocate a 32 byte buffer + bic ip, ip, #0x1f // aligned to 32 bytes +@@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon) + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + +- adr r5, .Lctrinc ++ adr lr, .Lctrinc + vdup.32 q15, d7[1] + vdup.32 q14, d7[0] +- vld1.32 {q4}, [r5, :128] ++ vld1.32 {q4}, [lr, :128] + vdup.32 q13, d6[1] + vdup.32 q12, d6[0] + vdup.32 q11, d5[1] +@@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon) + + // Re-interleave the words in the first two rows of each block (x0..7). + // Also add the counter values 0-3 to x12[0-3]. +- vld1.32 {q8}, [r5, :128] // load counter values 0-3 ++ vld1.32 {q8}, [lr, :128] // load counter values 0-3 + vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) + vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) + vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) +@@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon) + + // Re-interleave the words in the last two rows of each block (x8..15). + vld1.32 {q8-q9}, [sp, :256] ++ mov sp, r4 // restore original stack pointer ++ ldr r4, [r4, #8] // load number of bytes + vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) + vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) + vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) +@@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon) + // XOR the rest of the data with the keystream + + vld1.8 {q0-q1}, [r2]! ++ subs r4, r4, #96 + veor q0, q0, q8 + veor q1, q1, q12 ++ ble .Lle96 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! ++ subs r4, r4, #32 + veor q0, q0, q2 + veor q1, q1, q6 ++ ble .Lle128 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! ++ subs r4, r4, #32 + veor q0, q0, q10 + veor q1, q1, q14 ++ ble .Lle160 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! ++ subs r4, r4, #32 + veor q0, q0, q4 + veor q1, q1, q5 ++ ble .Lle192 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! 
++ subs r4, r4, #32 + veor q0, q0, q9 + veor q1, q1, q13 ++ ble .Lle224 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! ++ subs r4, r4, #32 + veor q0, q0, q3 + veor q1, q1, q7 ++ blt .Llt256 ++.Lout: + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2] +- mov sp, r4 // restore original stack pointer + veor q0, q0, q11 + veor q1, q1, q15 + vst1.8 {q0-q1}, [r1] + +- pop {r4-r5} +- bx lr ++ pop {r4, pc} ++ ++.Lle192: ++ vmov q4, q9 ++ vmov q5, q13 ++ ++.Lle160: ++ // nothing to do ++ ++.Lfinalblock: ++ // Process the final block if processing less than 4 full blocks. ++ // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the ++ // previous 32 byte output block that still needs to be written at ++ // [r1] in q0-q1. ++ beq .Lfullblock ++ ++.Lpartialblock: ++ adr lr, .Lpermute + 32 ++ add r2, r2, r4 ++ add lr, lr, r4 ++ add r4, r4, r1 ++ ++ vld1.8 {q2-q3}, [lr] ++ vld1.8 {q6-q7}, [r2] ++ ++ add r4, r4, #32 ++ ++ vtbl.8 d4, {q4-q5}, d4 ++ vtbl.8 d5, {q4-q5}, d5 ++ vtbl.8 d6, {q4-q5}, d6 ++ vtbl.8 d7, {q4-q5}, d7 ++ ++ veor q6, q6, q2 ++ veor q7, q7, q3 ++ ++ vst1.8 {q6-q7}, [r4] // overlapping stores ++ vst1.8 {q0-q1}, [r1] ++ pop {r4, pc} ++ ++.Lfullblock: ++ vmov q11, q4 ++ vmov q15, q5 ++ b .Lout ++.Lle96: ++ vmov q4, q2 ++ vmov q5, q6 ++ b .Lfinalblock ++.Lle128: ++ vmov q4, q10 ++ vmov q5, q14 ++ b .Lfinalblock ++.Lle224: ++ vmov q4, q3 ++ vmov q5, q7 ++ b .Lfinalblock ++.Llt256: ++ vmov q4, q11 ++ vmov q5, q15 ++ b .Lpartialblock + ENDPROC(chacha_4block_xor_neon) ++ ++ .align L1_CACHE_SHIFT ++.Lpermute: ++ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ++ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f ++ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 ++ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f ++ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ++ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f ++ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 ++ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f diff --git 
a/target/linux/generic/backport-5.10/072-crypto-arm-chacha-neon-add-missing-counter-increment.patch b/target/linux/generic/backport-5.10/072-crypto-arm-chacha-neon-add-missing-counter-increment.patch new file mode 100644 index 0000000000..1e4d2041e5 --- /dev/null +++ b/target/linux/generic/backport-5.10/072-crypto-arm-chacha-neon-add-missing-counter-increment.patch @@ -0,0 +1,38 @@ +From 7f63462faf9eab69132bea9abd48c2c05a93145b Mon Sep 17 00:00:00 2001 +From: Ard Biesheuvel +Date: Sun, 13 Dec 2020 15:39:29 +0100 +Subject: [PATCH 2/2] crypto: arm/chacha-neon - add missing counter increment + +commit fd16931a2f518a32753920ff20895e5cf04c8ff1 upstream. + +Commit 86cd97ec4b943af3 ("crypto: arm/chacha-neon - optimize for non-block +size multiples") refactored the chacha block handling in the glue code in +a way that may result in the counter increment to be omitted when calling +chacha_block_xor_neon() to process a full block. This violates the skcipher +API, which requires that the output IV is suitable for handling more input +as long as the preceding input has been presented in round multiples of the +block size. Also, the same code is exposed via the chacha library interface +whose callers may actually rely on this increment to occur even for final +blocks that are smaller than the chacha block size. + +So increment the counter after calling chacha_block_xor_neon(). + +Fixes: 86cd97ec4b943af3 ("crypto: arm/chacha-neon - optimize for non-block size multiples") +Reported-by: Eric Biggers +Signed-off-by: Ard Biesheuvel +Signed-off-by: Herbert Xu +Signed-off-by: Jason A. Donenfeld +--- + arch/arm/crypto/chacha-glue.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/arm/crypto/chacha-glue.c ++++ b/arch/arm/crypto/chacha-glue.c +@@ -60,6 +60,7 @@ static void chacha_doneon(u32 *state, u8 + chacha_block_xor_neon(state, d, s, nrounds); + if (d != dst) + memcpy(dst, buf, bytes); ++ state[12]++; + } + } + -- cgit v1.2.3