Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch')
-rw-r--r-- | target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch | 147
1 file changed, 147 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch b/target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch
new file mode 100644
index 0000000000..52847877f6
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch
@@ -0,0 +1,147 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Wed, 8 Jul 2020 12:11:18 +0300
+Subject: [PATCH] crypto: x86/chacha-sse3 - use unaligned loads for state array
+
+commit e79a31715193686e92dadb4caedfbb1f5de3659c upstream.
+
+Due to the fact that the x86 port does not support allocating objects
+on the stack with an alignment that exceeds 8 bytes, we have a rather
+ugly hack in the x86 code for ChaCha to ensure that the state array is
+aligned to 16 bytes, allowing the SSE3 implementation of the algorithm
+to use aligned loads.
+
+Given that the performance benefit of using of aligned loads appears to
+be limited (~0.25% for 1k blocks using tcrypt on a Corei7-8650U), and
+the fact that this hack has leaked into generic ChaCha code, let's just
+remove it.
+
+Cc: Martin Willi <martin@strongswan.org>
+Cc: Herbert Xu <herbert@gondor.apana.org.au>
+Cc: Eric Biggers <ebiggers@kernel.org>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Reviewed-by: Martin Willi <martin@strongswan.org>
+Reviewed-by: Eric Biggers <ebiggers@google.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/chacha-ssse3-x86_64.S | 16 ++++++++--------
+ arch/x86/crypto/chacha_glue.c         | 17 ++---------------
+ include/crypto/chacha.h               |  4 ----
+ 3 files changed, 10 insertions(+), 27 deletions(-)
+
+--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
++++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
+@@ -120,10 +120,10 @@ ENTRY(chacha_block_xor_ssse3)
+ 	FRAME_BEGIN
+
+ 	# x0..3 = s0..3
+-	movdqa		0x00(%rdi),%xmm0
+-	movdqa		0x10(%rdi),%xmm1
+-	movdqa		0x20(%rdi),%xmm2
+-	movdqa		0x30(%rdi),%xmm3
++	movdqu		0x00(%rdi),%xmm0
++	movdqu		0x10(%rdi),%xmm1
++	movdqu		0x20(%rdi),%xmm2
++	movdqu		0x30(%rdi),%xmm3
+ 	movdqa		%xmm0,%xmm8
+ 	movdqa		%xmm1,%xmm9
+ 	movdqa		%xmm2,%xmm10
+@@ -205,10 +205,10 @@ ENTRY(hchacha_block_ssse3)
+ 	# %edx: nrounds
+ 	FRAME_BEGIN
+
+-	movdqa		0x00(%rdi),%xmm0
+-	movdqa		0x10(%rdi),%xmm1
+-	movdqa		0x20(%rdi),%xmm2
+-	movdqa		0x30(%rdi),%xmm3
++	movdqu		0x00(%rdi),%xmm0
++	movdqu		0x10(%rdi),%xmm1
++	movdqu		0x20(%rdi),%xmm2
++	movdqu		0x30(%rdi),%xmm3
+
+ 	mov		%edx,%r8d
+ 	call		chacha_permute
+--- a/arch/x86/crypto/chacha_glue.c
++++ b/arch/x86/crypto/chacha_glue.c
+@@ -14,8 +14,6 @@
+ #include <linux/module.h>
+ #include <asm/simd.h>
+
+-#define CHACHA_STATE_ALIGN 16
+-
+ asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ 					unsigned int len, int nrounds);
+ asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+@@ -125,8 +123,6 @@ static void chacha_dosimd(u32 *state, u8
+
+ void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+ {
+-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+-
+ 	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
+ 		hchacha_block_generic(state, stream, nrounds);
+ 	} else {
+@@ -139,8 +135,6 @@ EXPORT_SYMBOL(hchacha_block_arch);
+
+ void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+ {
+-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+-
+ 	chacha_init_generic(state, key, iv);
+ }
+ EXPORT_SYMBOL(chacha_init_arch);
+@@ -148,8 +142,6 @@ EXPORT_SYMBOL(chacha_init_arch);
+ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ 		       int nrounds)
+ {
+-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+-
+ 	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
+ 	    bytes <= CHACHA_BLOCK_SIZE)
+ 		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+@@ -171,15 +163,12 @@ EXPORT_SYMBOL(chacha_crypt_arch);
+ static int chacha_simd_stream_xor(struct skcipher_request *req,
+ 				  const struct chacha_ctx *ctx, const u8 *iv)
+ {
+-	u32 *state, state_buf[16 + 2] __aligned(8);
++	u32 state[CHACHA_STATE_WORDS] __aligned(8);
+ 	struct skcipher_walk walk;
+ 	int err;
+
+ 	err = skcipher_walk_virt(&walk, req, false);
+
+-	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+-	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+-
+ 	chacha_init_generic(state, ctx->key, iv);
+
+ 	while (walk.nbytes > 0) {
+@@ -218,12 +207,10 @@ static int xchacha_simd(struct skcipher_
+ {
+ 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ 	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+-	u32 *state, state_buf[16 + 2] __aligned(8);
++	u32 state[CHACHA_STATE_WORDS] __aligned(8);
+ 	struct chacha_ctx subctx;
+ 	u8 real_iv[16];
+
+-	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+-	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+ 	chacha_init_generic(state, ctx->key, req->iv);
+
+ 	if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
+--- a/include/crypto/chacha.h
++++ b/include/crypto/chacha.h
+@@ -25,11 +25,7 @@
+ #define CHACHA_BLOCK_SIZE	64
+ #define CHACHAPOLY_IV_SIZE	12
+
+-#ifdef CONFIG_X86_64
+-#define CHACHA_STATE_WORDS	((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
+-#else
+ #define CHACHA_STATE_WORDS	(CHACHA_BLOCK_SIZE / sizeof(u32))
+-#endif
+
+ /* 192-bit nonce, then 64-bit stream position */
+ #define XCHACHA_IV_SIZE	32
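
For readers skimming the backport, the change boils down to two ways of placing the ChaCha state on the stack. The removed hack over-allocates the state by two words and rounds the pointer up to a 16-byte boundary with PTR_ALIGN(); the new code declares a plain CHACHA_STATE_WORDS array with the compiler's natural 8-byte alignment and lets the SSE3 routines use unaligned loads (movdqu instead of movdqa). The following is a minimal userspace C sketch of both patterns, not kernel code: PTR_ALIGN is re-implemented here purely for illustration, and the constants mirror the values visible in the diff above (CHACHA_STATE_WORDS = 64 / 4 = 16, old alignment 16 bytes).

/* align_sketch.c - illustration only; PTR_ALIGN re-implemented for userspace. */
#include <stdint.h>
#include <stdio.h>

#define CHACHA_STATE_WORDS 16   /* CHACHA_BLOCK_SIZE / sizeof(u32) */
#define CHACHA_STATE_ALIGN 16   /* alignment the removed hack enforced */

/* Userspace stand-in for the kernel's PTR_ALIGN(): round p up to a multiple of a. */
#define PTR_ALIGN(p, a) \
	((void *)(((uintptr_t)(p) + ((a) - 1)) & ~(uintptr_t)((a) - 1)))

int main(void)
{
	/* Old scheme: over-allocate by two words so a 16-byte-aligned window always fits. */
	uint32_t state_buf[CHACHA_STATE_WORDS + 2] __attribute__((aligned(8)));
	uint32_t *state_old = PTR_ALIGN(state_buf, CHACHA_STATE_ALIGN);

	/* New scheme: plain array, natural 8-byte alignment; the SIMD code uses movdqu. */
	uint32_t state_new[CHACHA_STATE_WORDS] __attribute__((aligned(8)));

	printf("old: buf=%p aligned=%p (offset %zu bytes)\n",
	       (void *)state_buf, (void *)state_old,
	       (size_t)((uintptr_t)state_old - (uintptr_t)state_buf));
	printf("new: state=%p (8-byte alignment is enough with unaligned loads)\n",
	       (void *)state_new);
	return 0;
}

With the hack gone, chacha_simd_stream_xor() and xchacha_simd() in the diff keep only the second form, and CHACHA_STATE_WORDS no longer needs the x86-specific padding in include/crypto/chacha.h.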