diff options
Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0057-crypto-arch-lib-limit-simd-usage-to-4k-chunks.patch')
-rw-r--r-- | target/linux/generic/backport-5.4/080-wireguard-0057-crypto-arch-lib-limit-simd-usage-to-4k-chunks.patch | 243 |
1 files changed, 243 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0057-crypto-arch-lib-limit-simd-usage-to-4k-chunks.patch b/target/linux/generic/backport-5.4/080-wireguard-0057-crypto-arch-lib-limit-simd-usage-to-4k-chunks.patch new file mode 100644 index 0000000000..f8828f243e --- /dev/null +++ b/target/linux/generic/backport-5.4/080-wireguard-0057-crypto-arch-lib-limit-simd-usage-to-4k-chunks.patch @@ -0,0 +1,243 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: "Jason A. Donenfeld" <Jason@zx2c4.com> +Date: Thu, 23 Apr 2020 15:54:04 -0600 +Subject: [PATCH] crypto: arch/lib - limit simd usage to 4k chunks + +commit 706024a52c614b478b63f7728d202532ce6591a9 upstream. + +The initial Zinc patchset, after some mailing list discussion, contained +code to ensure that kernel_fpu_enable would not be kept on for more than +a 4k chunk, since it disables preemption. The choice of 4k isn't totally +scientific, but it's not a bad guess either, and it's what's used in +both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form +of PAGE_SIZE, which this commit corrects to be explicitly 4k for the +former two). + +Ard did some back of the envelope calculations and found that +at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k +means we have a maximum preemption disabling of 20us, which Sebastian +confirmed was probably a good limit. + +Unfortunately the chunking appears to have been left out of the final +patchset that added the glue code. So, this commit adds it back in. + +Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function") +Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function") +Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function") +Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel") +Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation") +Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation") +Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation") +Cc: Eric Biggers <ebiggers@google.com> +Cc: Ard Biesheuvel <ardb@kernel.org> +Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Cc: stable@vger.kernel.org +Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> +Reviewed-by: Ard Biesheuvel <ardb@kernel.org> +Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> +Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> +--- + arch/arm/crypto/chacha-glue.c | 14 +++++++++++--- + arch/arm/crypto/poly1305-glue.c | 15 +++++++++++---- + arch/arm64/crypto/chacha-neon-glue.c | 14 +++++++++++--- + arch/arm64/crypto/poly1305-glue.c | 15 +++++++++++---- + arch/x86/crypto/blake2s-glue.c | 10 ++++------ + arch/x86/crypto/chacha_glue.c | 14 +++++++++++--- + arch/x86/crypto/poly1305_glue.c | 13 ++++++------- + 7 files changed, 65 insertions(+), 30 deletions(-) + +--- a/arch/arm/crypto/chacha-glue.c ++++ b/arch/arm/crypto/chacha-glue.c +@@ -91,9 +91,17 @@ void chacha_crypt_arch(u32 *state, u8 *d + return; + } + +- kernel_neon_begin(); +- chacha_doneon(state, dst, src, bytes, nrounds); +- kernel_neon_end(); ++ do { ++ unsigned int todo = min_t(unsigned int, bytes, SZ_4K); ++ ++ kernel_neon_begin(); ++ chacha_doneon(state, dst, src, todo, nrounds); ++ kernel_neon_end(); ++ ++ bytes -= todo; ++ src += todo; ++ dst += todo; ++ } while (bytes); + } + EXPORT_SYMBOL(chacha_crypt_arch); + +--- a/arch/arm/crypto/poly1305-glue.c ++++ b/arch/arm/crypto/poly1305-glue.c +@@ -160,13 +160,20 @@ void poly1305_update_arch(struct poly130 + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); + + if (static_branch_likely(&have_neon) && do_neon) { +- kernel_neon_begin(); +- poly1305_blocks_neon(&dctx->h, src, len, 1); +- kernel_neon_end(); ++ do { ++ unsigned int todo = min_t(unsigned int, len, SZ_4K); ++ ++ kernel_neon_begin(); ++ poly1305_blocks_neon(&dctx->h, src, todo, 1); ++ kernel_neon_end(); ++ ++ len -= todo; ++ src += todo; ++ } while (len); + } else { + poly1305_blocks_arm(&dctx->h, src, len, 1); ++ src += len; + } +- src += len; + nbytes %= POLY1305_BLOCK_SIZE; + } + +--- a/arch/arm64/crypto/chacha-neon-glue.c ++++ b/arch/arm64/crypto/chacha-neon-glue.c +@@ -87,9 +87,17 @@ void chacha_crypt_arch(u32 *state, u8 *d + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + +- kernel_neon_begin(); +- chacha_doneon(state, dst, src, bytes, nrounds); +- kernel_neon_end(); ++ do { ++ unsigned int todo = min_t(unsigned int, bytes, SZ_4K); ++ ++ kernel_neon_begin(); ++ chacha_doneon(state, dst, src, todo, nrounds); ++ kernel_neon_end(); ++ ++ bytes -= todo; ++ src += todo; ++ dst += todo; ++ } while (bytes); + } + EXPORT_SYMBOL(chacha_crypt_arch); + +--- a/arch/arm64/crypto/poly1305-glue.c ++++ b/arch/arm64/crypto/poly1305-glue.c +@@ -143,13 +143,20 @@ void poly1305_update_arch(struct poly130 + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); + + if (static_branch_likely(&have_neon) && crypto_simd_usable()) { +- kernel_neon_begin(); +- poly1305_blocks_neon(&dctx->h, src, len, 1); +- kernel_neon_end(); ++ do { ++ unsigned int todo = min_t(unsigned int, len, SZ_4K); ++ ++ kernel_neon_begin(); ++ poly1305_blocks_neon(&dctx->h, src, todo, 1); ++ kernel_neon_end(); ++ ++ len -= todo; ++ src += todo; ++ } while (len); + } else { + poly1305_blocks(&dctx->h, src, len, 1); ++ src += len; + } +- src += len; + nbytes %= POLY1305_BLOCK_SIZE; + } + +--- a/arch/x86/crypto/blake2s-glue.c ++++ b/arch/x86/crypto/blake2s-glue.c +@@ -32,16 +32,16 @@ void blake2s_compress_arch(struct blake2 + const u32 inc) + { + /* SIMD disables preemption, so relax after processing each page. */ +- BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8); ++ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); + + if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { + blake2s_compress_generic(state, block, nblocks, inc); + return; + } + +- for (;;) { ++ do { + const size_t blocks = min_t(size_t, nblocks, +- PAGE_SIZE / BLAKE2S_BLOCK_SIZE); ++ SZ_4K / BLAKE2S_BLOCK_SIZE); + + kernel_fpu_begin(); + if (IS_ENABLED(CONFIG_AS_AVX512) && +@@ -52,10 +52,8 @@ void blake2s_compress_arch(struct blake2 + kernel_fpu_end(); + + nblocks -= blocks; +- if (!nblocks) +- break; + block += blocks * BLAKE2S_BLOCK_SIZE; +- } ++ } while (nblocks); + } + EXPORT_SYMBOL(blake2s_compress_arch); + +--- a/arch/x86/crypto/chacha_glue.c ++++ b/arch/x86/crypto/chacha_glue.c +@@ -154,9 +154,17 @@ void chacha_crypt_arch(u32 *state, u8 *d + bytes <= CHACHA_BLOCK_SIZE) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + +- kernel_fpu_begin(); +- chacha_dosimd(state, dst, src, bytes, nrounds); +- kernel_fpu_end(); ++ do { ++ unsigned int todo = min_t(unsigned int, bytes, SZ_4K); ++ ++ kernel_fpu_begin(); ++ chacha_dosimd(state, dst, src, todo, nrounds); ++ kernel_fpu_end(); ++ ++ bytes -= todo; ++ src += todo; ++ dst += todo; ++ } while (bytes); + } + EXPORT_SYMBOL(chacha_crypt_arch); + +--- a/arch/x86/crypto/poly1305_glue.c ++++ b/arch/x86/crypto/poly1305_glue.c +@@ -91,8 +91,8 @@ static void poly1305_simd_blocks(void *c + struct poly1305_arch_internal *state = ctx; + + /* SIMD disables preemption, so relax after processing each page. */ +- BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || +- PAGE_SIZE % POLY1305_BLOCK_SIZE); ++ BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || ++ SZ_4K % POLY1305_BLOCK_SIZE); + + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || + (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || +@@ -102,8 +102,8 @@ static void poly1305_simd_blocks(void *c + return; + } + +- for (;;) { +- const size_t bytes = min_t(size_t, len, PAGE_SIZE); ++ do { ++ const size_t bytes = min_t(size_t, len, SZ_4K); + + kernel_fpu_begin(); + if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) +@@ -113,11 +113,10 @@ static void poly1305_simd_blocks(void *c + else + poly1305_blocks_avx(ctx, inp, bytes, padbit); + kernel_fpu_end(); ++ + len -= bytes; +- if (!len) +- break; + inp += bytes; +- } ++ } while (len); + } + + static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], |