aboutsummaryrefslogtreecommitdiffstats
path: root/target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch
diff options
context:
space:
mode:
authorJason A. Donenfeld <Jason@zx2c4.com>2021-02-19 14:29:04 +0100
committerDavid Bauer <mail@david-bauer.net>2021-02-26 20:41:01 +0100
commit3888fa78802354ab7bbd19b7d061fd80a16ce06b (patch)
tree2225a6313cb6482f0cb9c09df662a0d44197350e /target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch
parent7d4143234c4dfdd050ebc64ec8231f9d81ea65af (diff)
downloadupstream-3888fa78802354ab7bbd19b7d061fd80a16ce06b.tar.gz
upstream-3888fa78802354ab7bbd19b7d061fd80a16ce06b.tar.bz2
upstream-3888fa78802354ab7bbd19b7d061fd80a16ce06b.zip
kernel: 5.4: import wireguard backport
Rather than using the clunky, old, slower wireguard-linux-compat out of tree module, this commit does a patch-by-patch backport of upstream's wireguard to 5.4. This specific backport is in widespread use, being part of SUSE's enterprise kernel, Oracle's enterprise kernel, Google's Android kernel, Gentoo's distro kernel, and probably more I've forgotten about. It's definitely the "more proper" way of adding wireguard to a kernel than the ugly compat.h hell of the wireguard-linux-compat repo. And most importantly for OpenWRT, it allows using the same module configuration code for 5.10 as for 5.4, with no need for bifurcation. These patches are from the backport tree which is maintained in the open here: https://git.zx2c4.com/wireguard-linux/log/?h=backport-5.4.y I'll be sending PRs to update this as needed. Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch')
-rw-r--r--target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch557
1 files changed, 557 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch b/target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch
new file mode 100644
index 0000000000..80bf831f81
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch
@@ -0,0 +1,557 @@
+From 7960239adcaf7b56b081426ea3aa0ebf17398375 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:31 +0100
+Subject: [PATCH 024/124] crypto: blake2s - x86_64 SIMD implementation
+
+commit ed0356eda153f6a95649e11feb7b07083caf9e20 upstream.
+
+These implementations from Samuel Neves support AVX and AVX-512VL.
+Originally this used AVX-512F, but Skylake thermal throttling made
+AVX-512VL more attractive and possible to do with negligible difference.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
+Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
+[ardb: move to arch/x86/crypto, wire into lib/crypto framework]
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/Makefile | 2 +
+ arch/x86/crypto/blake2s-core.S | 258 +++++++++++++++++++++++++++++++++
+ arch/x86/crypto/blake2s-glue.c | 233 +++++++++++++++++++++++++++++
+ crypto/Kconfig | 6 +
+ 4 files changed, 499 insertions(+)
+ create mode 100644 arch/x86/crypto/blake2s-core.S
+ create mode 100644 arch/x86/crypto/blake2s-glue.c
+
+--- a/arch/x86/crypto/Makefile
++++ b/arch/x86/crypto/Makefile
+@@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
+ obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
++ obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
+ endif
+
+ # These modules require assembler to support AVX2.
+@@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x8
+ aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+
+ nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
++blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+
+ ifeq ($(avx_supported),yes)
+ camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+--- /dev/null
++++ b/arch/x86/crypto/blake2s-core.S
+@@ -0,0 +1,258 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
++ */
++
++#include <linux/linkage.h>
++
++.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
++.align 32
++IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 /* first 16 bytes of the IV: four 32-bit words, lowest word 0x6A09E667 */
++ .octa 0x5BE0CD191F83D9AB9B05688C510E527F /* second 16 bytes of the IV, addressed as IV+0x10 / IV+16 below */
++.section .rodata.cst16.ROT16, "aM", @progbits, 16
++.align 16
++ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 /* pshufb mask: swaps the 16-bit halves of each 32-bit lane (rotate by 16) */
++.section .rodata.cst16.ROR328, "aM", @progbits, 16
++.align 16
++ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 /* pshufb mask: rotates each 32-bit lane right by 8 bits */
++.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
++.align 64
++SIGMA: /* 10 rows of 16 byte-sized message word indices; the SSSE3 round loop consumes one row per iteration */
++.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
++.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
++.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
++.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
++.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
++.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
++.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
++.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
++.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
++.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
++#ifdef CONFIG_AS_AVX512
++.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
++.align 64
++SIGMA2: /* 10 rows of 16 dword indices for vpermi2d; each row is applied to the message as already permuted by the previous round */
++.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
++.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
++.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
++.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
++.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
++.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
++.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
++.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
++.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
++.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
++#endif /* CONFIG_AS_AVX512 */
++
++.text
++#ifdef CONFIG_AS_SSSE3
++ENTRY(blake2s_compress_ssse3) /* C prototype: (state, block, nblocks, inc), arriving in %rdi, %rsi, %rdx, %rcx under the x86-64 calling convention */
++ testq %rdx,%rdx
++ je .Lendofloop /* nblocks == 0: return without reading or writing *state */
++ movdqu (%rdi),%xmm0 /* xmm0 = state bytes 0x00-0x0f */
++ movdqu 0x10(%rdi),%xmm1 /* xmm1 = state bytes 0x10-0x1f */
++ movdqa ROT16(%rip),%xmm12 /* xmm12/xmm13 hold the two pshufb rotate masks for the whole call */
++ movdqa ROR328(%rip),%xmm13
++ movdqu 0x20(%rdi),%xmm14 /* xmm14 = state bytes 0x20-0x2f; presumably the counter/flag words - confirm against struct blake2s_state */
++ movq %rcx,%xmm15 /* xmm15 = inc in the low quadword, upper quadword zero */
++ leaq SIGMA+0xa0(%rip),%r8 /* r8 = one past the end of the 160-byte SIGMA table */
++ jmp .Lbeginofloop
++ .align 32
++.Lbeginofloop: /* one pass per 64-byte input block */
++ movdqa %xmm0,%xmm10 /* keep the incoming xmm0/xmm1 for the xor after the rounds */
++ movdqa %xmm1,%xmm11
++ paddq %xmm15,%xmm14 /* add inc to the low 64 bits of xmm14 (64-bit lanes, so the carry crosses the 32-bit boundary) */
++ movdqa IV(%rip),%xmm2 /* third working row starts as the first IV half */
++ movdqa %xmm14,%xmm3
++ pxor IV+0x10(%rip),%xmm3 /* fourth working row = xmm14 xor the second IV half */
++ leaq SIGMA(%rip),%rcx /* rcx is reused as the SIGMA row pointer; inc survives in xmm15 */
++.Lroundloop: /* one pass per 16-byte SIGMA row */
++ movzbl (%rcx),%eax
++ movd (%rsi,%rax,4),%xmm4 /* load the 32-bit message word selected by the SIGMA byte */
++ movzbl 0x1(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm5
++ movzbl 0x2(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm6
++ movzbl 0x3(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm7
++ punpckldq %xmm5,%xmm4
++ punpckldq %xmm7,%xmm6
++ punpcklqdq %xmm6,%xmm4 /* xmm4 = the four selected words packed in index order */
++ paddd %xmm4,%xmm0
++ paddd %xmm1,%xmm0
++ pxor %xmm0,%xmm3
++ pshufb %xmm12,%xmm3 /* rotate each lane of xmm3 by 16 via the ROT16 mask */
++ paddd %xmm3,%xmm2
++ pxor %xmm2,%xmm1
++ movdqa %xmm1,%xmm8
++ psrld $0xc,%xmm1
++ pslld $0x14,%xmm8
++ por %xmm8,%xmm1 /* shift pair + or = rotate each lane of xmm1 right by 12 */
++ movzbl 0x4(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm5
++ movzbl 0x5(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm6
++ movzbl 0x6(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm7
++ movzbl 0x7(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm4
++ punpckldq %xmm6,%xmm5
++ punpckldq %xmm4,%xmm7
++ punpcklqdq %xmm7,%xmm5
++ paddd %xmm5,%xmm0
++ paddd %xmm1,%xmm0
++ pxor %xmm0,%xmm3
++ pshufb %xmm13,%xmm3 /* rotate each lane of xmm3 right by 8 via the ROR328 mask */
++ paddd %xmm3,%xmm2
++ pxor %xmm2,%xmm1
++ movdqa %xmm1,%xmm8
++ psrld $0x7,%xmm1
++ pslld $0x19,%xmm8
++ por %xmm8,%xmm1 /* rotate each lane of xmm1 right by 7 */
++ pshufd $0x93,%xmm0,%xmm0 /* rotate the lanes of xmm0/xmm3/xmm2 by different amounts; undone after the second half below */
++ pshufd $0x4e,%xmm3,%xmm3
++ pshufd $0x39,%xmm2,%xmm2
++ movzbl 0x8(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm6
++ movzbl 0x9(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm7
++ movzbl 0xa(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm4
++ movzbl 0xb(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm5
++ punpckldq %xmm7,%xmm6
++ punpckldq %xmm5,%xmm4
++ punpcklqdq %xmm4,%xmm6
++ paddd %xmm6,%xmm0
++ paddd %xmm1,%xmm0
++ pxor %xmm0,%xmm3
++ pshufb %xmm12,%xmm3
++ paddd %xmm3,%xmm2
++ pxor %xmm2,%xmm1
++ movdqa %xmm1,%xmm8
++ psrld $0xc,%xmm1
++ pslld $0x14,%xmm8
++ por %xmm8,%xmm1
++ movzbl 0xc(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm7
++ movzbl 0xd(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm4
++ movzbl 0xe(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm5
++ movzbl 0xf(%rcx),%eax
++ movd (%rsi,%rax,4),%xmm6
++ punpckldq %xmm4,%xmm7
++ punpckldq %xmm6,%xmm5
++ punpcklqdq %xmm5,%xmm7
++ paddd %xmm7,%xmm0
++ paddd %xmm1,%xmm0
++ pxor %xmm0,%xmm3
++ pshufb %xmm13,%xmm3
++ paddd %xmm3,%xmm2
++ pxor %xmm2,%xmm1
++ movdqa %xmm1,%xmm8
++ psrld $0x7,%xmm1
++ pslld $0x19,%xmm8
++ por %xmm8,%xmm1
++ pshufd $0x39,%xmm0,%xmm0 /* inverse lane rotations: restore the original lane order */
++ pshufd $0x4e,%xmm3,%xmm3
++ pshufd $0x93,%xmm2,%xmm2
++ addq $0x10,%rcx /* next SIGMA row */
++ cmpq %r8,%rcx
++ jnz .Lroundloop /* loop until all 10 rows are consumed */
++ pxor %xmm2,%xmm0 /* fold the working rows together and into the saved input state */
++ pxor %xmm3,%xmm1
++ pxor %xmm10,%xmm0
++ pxor %xmm11,%xmm1
++ addq $0x40,%rsi /* advance to the next 64-byte block */
++ decq %rdx
++ jnz .Lbeginofloop
++ movdqu %xmm0,(%rdi) /* write the updated 0x30 bytes back to *state only after the last block */
++ movdqu %xmm1,0x10(%rdi)
++ movdqu %xmm14,0x20(%rdi)
++.Lendofloop:
++ ret
++ENDPROC(blake2s_compress_ssse3)
++#endif /* CONFIG_AS_SSSE3 */
++
++#ifdef CONFIG_AS_AVX512
++ENTRY(blake2s_compress_avx512) /* same C prototype as the SSSE3 routine: (state, block, nblocks, inc) in %rdi, %rsi, %rdx, %rcx */
++ vmovdqu (%rdi),%xmm0 /* NOTE(review): no nblocks == 0 check here, unlike the SSSE3 routine; the caller must pass nblocks >= 1 */
++ vmovdqu 0x10(%rdi),%xmm1
++ vmovdqu 0x20(%rdi),%xmm4 /* state bytes 0x20-0x2f; presumably the counter/flag words - confirm against struct blake2s_state */
++ vmovq %rcx,%xmm5 /* xmm5 = inc in the low quadword */
++ vmovdqa IV(%rip),%xmm14 /* both IV halves stay in registers across all blocks */
++ vmovdqa IV+16(%rip),%xmm15
++ jmp .Lblake2s_compress_avx512_mainloop
++.align 32
++.Lblake2s_compress_avx512_mainloop: /* one pass per 64-byte input block */
++ vmovdqa %xmm0,%xmm10 /* keep the incoming xmm0/xmm1 for the xor after the rounds */
++ vmovdqa %xmm1,%xmm11
++ vpaddq %xmm5,%xmm4,%xmm4 /* add inc to the low 64 bits of xmm4 */
++ vmovdqa %xmm14,%xmm2
++ vpxor %xmm15,%xmm4,%xmm3
++ vmovdqu (%rsi),%ymm6 /* ymm6:ymm7 = the whole 64-byte message block */
++ vmovdqu 0x20(%rsi),%ymm7
++ addq $0x40,%rsi
++ leaq SIGMA2(%rip),%rax
++ movb $0xa,%cl /* 10 rounds; rcx is clobbered here, inc survives in xmm5 */
++.Lblake2s_compress_avx512_roundloop:
++ addq $0x40,%rax /* each SIGMA2 row is 16 dwords = 0x40 bytes */
++ vmovdqa -0x40(%rax),%ymm8 /* ymm8/ymm9 = the two halves of the current index row */
++ vmovdqa -0x20(%rax),%ymm9
++ vpermi2d %ymm7,%ymm6,%ymm8 /* indices in ymm8 are overwritten by the words they select from ymm6:ymm7 */
++ vpermi2d %ymm7,%ymm6,%ymm9
++ vmovdqa %ymm8,%ymm6 /* the permuted message becomes the input for the next round's permutation */
++ vmovdqa %ymm9,%ymm7
++ vpaddd %xmm8,%xmm0,%xmm0
++ vpaddd %xmm1,%xmm0,%xmm0
++ vpxor %xmm0,%xmm3,%xmm3
++ vprord $0x10,%xmm3,%xmm3 /* native 32-bit rotates replace the pshufb / shift-or sequences of the SSSE3 code */
++ vpaddd %xmm3,%xmm2,%xmm2
++ vpxor %xmm2,%xmm1,%xmm1
++ vprord $0xc,%xmm1,%xmm1
++ vextracti128 $0x1,%ymm8,%xmm8 /* switch to the upper four words of ymm8 */
++ vpaddd %xmm8,%xmm0,%xmm0
++ vpaddd %xmm1,%xmm0,%xmm0
++ vpxor %xmm0,%xmm3,%xmm3
++ vprord $0x8,%xmm3,%xmm3
++ vpaddd %xmm3,%xmm2,%xmm2
++ vpxor %xmm2,%xmm1,%xmm1
++ vprord $0x7,%xmm1,%xmm1
++ vpshufd $0x93,%xmm0,%xmm0 /* rotate the lanes of xmm0/xmm3/xmm2; undone at the end of the round */
++ vpshufd $0x4e,%xmm3,%xmm3
++ vpshufd $0x39,%xmm2,%xmm2
++ vpaddd %xmm9,%xmm0,%xmm0
++ vpaddd %xmm1,%xmm0,%xmm0
++ vpxor %xmm0,%xmm3,%xmm3
++ vprord $0x10,%xmm3,%xmm3
++ vpaddd %xmm3,%xmm2,%xmm2
++ vpxor %xmm2,%xmm1,%xmm1
++ vprord $0xc,%xmm1,%xmm1
++ vextracti128 $0x1,%ymm9,%xmm9 /* switch to the upper four words of ymm9 */
++ vpaddd %xmm9,%xmm0,%xmm0
++ vpaddd %xmm1,%xmm0,%xmm0
++ vpxor %xmm0,%xmm3,%xmm3
++ vprord $0x8,%xmm3,%xmm3
++ vpaddd %xmm3,%xmm2,%xmm2
++ vpxor %xmm2,%xmm1,%xmm1
++ vprord $0x7,%xmm1,%xmm1
++ vpshufd $0x39,%xmm0,%xmm0 /* inverse lane rotations: restore the original lane order */
++ vpshufd $0x4e,%xmm3,%xmm3
++ vpshufd $0x93,%xmm2,%xmm2
++ decb %cl
++ jne .Lblake2s_compress_avx512_roundloop
++ vpxor %xmm10,%xmm0,%xmm0 /* fold the saved input state and the working rows together */
++ vpxor %xmm11,%xmm1,%xmm1
++ vpxor %xmm2,%xmm0,%xmm0
++ vpxor %xmm3,%xmm1,%xmm1
++ decq %rdx /* NOTE(review): with nblocks == 0 on entry this wraps and the loop keeps running */
++ jne .Lblake2s_compress_avx512_mainloop
++ vmovdqu %xmm0,(%rdi) /* write the updated 0x30 bytes back to *state only after the last block */
++ vmovdqu %xmm1,0x10(%rdi)
++ vmovdqu %xmm4,0x20(%rdi)
++ vzeroupper /* ymm registers were used above; clear their upper halves before returning */
++ retq
++ENDPROC(blake2s_compress_avx512)
++#endif /* CONFIG_AS_AVX512 */
+--- /dev/null
++++ b/arch/x86/crypto/blake2s-glue.c
+@@ -0,0 +1,233 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include <crypto/internal/blake2s.h>
++#include <crypto/internal/simd.h>
++#include <crypto/internal/hash.h>
++
++#include <linux/types.h>
++#include <linux/jump_label.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++#include <asm/cpufeature.h>
++#include <asm/fpu/api.h>
++#include <asm/processor.h>
++#include <asm/simd.h>
++
++asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
++ const u8 *block, const size_t nblocks,
++ const u32 inc); /* SSSE3 routine, implemented in blake2s-core.S */
++asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
++ const u8 *block, const size_t nblocks,
++ const u32 inc); /* AVX-512 routine, implemented in blake2s-core.S under CONFIG_AS_AVX512 */
++
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); /* off by default; enabled in blake2s_mod_init() when the CPU has SSSE3 */
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); /* off by default; enabled in blake2s_mod_init() when all AVX-512 checks pass */
++
++void blake2s_compress_arch(struct blake2s_state *state,
++ const u8 *block, size_t nblocks,
++ const u32 inc) /* compress nblocks blocks starting at block into state; inc is passed unchanged to the chosen backend */
++{
++ /* SIMD disables preemption, so relax after processing each page. */
++ BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
++
++ if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { /* no SSSE3, or SIMD not usable right now: generic fallback */
++ blake2s_compress_generic(state, block, nblocks, inc);
++ return;
++ }
++
++ for (;;) {
++ const size_t blocks = min_t(size_t, nblocks,
++ PAGE_SIZE / BLAKE2S_BLOCK_SIZE); /* at most one page worth of blocks per kernel_fpu_begin()/end() section */
++
++ kernel_fpu_begin();
++ if (IS_ENABLED(CONFIG_AS_AVX512) &&
++ static_branch_likely(&blake2s_use_avx512)) /* AVX-512 is preferred over SSSE3 whenever its key is on */
++ blake2s_compress_avx512(state, block, blocks, inc);
++ else
++ blake2s_compress_ssse3(state, block, blocks, inc);
++ kernel_fpu_end();
++
++ nblocks -= blocks;
++ if (!nblocks)
++ break;
++ block += blocks * BLAKE2S_BLOCK_SIZE; /* step past the blocks just processed */
++ }
++}
++EXPORT_SYMBOL(blake2s_compress_arch);
++
++static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
++ unsigned int keylen) /* shash .setkey: validate the key length and copy the key into the tfm context */
++{
++ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
++
++ if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) { /* both an empty and an oversized key are rejected with -EINVAL */
++ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
++ return -EINVAL;
++ }
++
++ memcpy(tctx->key, key, keylen);
++ tctx->keylen = keylen; /* a non-zero keylen makes crypto_blake2s_init() take the keyed path */
++
++ return 0;
++}
++
++static int crypto_blake2s_init(struct shash_desc *desc) /* shash .init: (re)initialise the per-request hash state; always returns 0 */
++{
++ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
++ struct blake2s_state *state = shash_desc_ctx(desc);
++ const int outlen = crypto_shash_digestsize(desc->tfm); /* output length is the digest size of the algorithm this tfm was allocated for */
++
++ if (tctx->keylen) /* a key was stored by crypto_blake2s_setkey() */
++ blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
++ else
++ blake2s_init(state, outlen);
++
++ return 0;
++}
++
++static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
++ unsigned int inlen) /* shash .update: absorb inlen bytes from in; always returns 0 */
++{
++ struct blake2s_state *state = shash_desc_ctx(desc);
++ const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; /* free space remaining in state->buf */
++
++ if (unlikely(!inlen))
++ return 0;
++ if (inlen > fill) { /* strictly greater: a buffer that would be exactly full is kept, not compressed yet */
++ memcpy(state->buf + state->buflen, in, fill);
++ blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
++ state->buflen = 0;
++ in += fill;
++ inlen -= fill;
++ }
++ if (inlen > BLAKE2S_BLOCK_SIZE) { /* bulk path: compress directly from the caller's buffer */
++ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
++ /* Hash one less (full) block than strictly possible */
++ blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
++ in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++ inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++ }
++ memcpy(state->buf + state->buflen, in, inlen); /* buffer the non-empty tail; crypto_blake2s_final() compresses it as the last block */
++ state->buflen += inlen;
++
++ return 0;
++}
++
++static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) /* shash .final: compress the buffered last block and write the digest to out; always returns 0 */
++{
++ struct blake2s_state *state = shash_desc_ctx(desc);
++
++ blake2s_set_lastblock(state);
++ memset(state->buf + state->buflen, 0,
++ BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
++ blake2s_compress_arch(state, state->buf, 1, state->buflen); /* inc is the count of real bytes in the block, not the padded block size */
++ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); /* convert the hash words to little-endian before copying them out as bytes */
++ memcpy(out, state->h, state->outlen);
++ memzero_explicit(state, sizeof(*state)); /* wipe the whole state once the digest has been copied out */
++
++ return 0;
++}
++
++static struct shash_alg blake2s_algs[] = {{ /* four entries that differ only in cra_name, cra_driver_name and digestsize */
++ .base.cra_name = "blake2s-128",
++ .base.cra_driver_name = "blake2s-128-x86",
++ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
++ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
++ .base.cra_priority = 200,
++ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
++ .base.cra_module = THIS_MODULE,
++
++ .digestsize = BLAKE2S_128_HASH_SIZE,
++ .setkey = crypto_blake2s_setkey,
++ .init = crypto_blake2s_init,
++ .update = crypto_blake2s_update,
++ .final = crypto_blake2s_final,
++ .descsize = sizeof(struct blake2s_state), /* per-request state is the blake2s_state that init/update/final operate on */
++}, {
++ .base.cra_name = "blake2s-160",
++ .base.cra_driver_name = "blake2s-160-x86",
++ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
++ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
++ .base.cra_priority = 200,
++ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
++ .base.cra_module = THIS_MODULE,
++
++ .digestsize = BLAKE2S_160_HASH_SIZE,
++ .setkey = crypto_blake2s_setkey,
++ .init = crypto_blake2s_init,
++ .update = crypto_blake2s_update,
++ .final = crypto_blake2s_final,
++ .descsize = sizeof(struct blake2s_state),
++}, {
++ .base.cra_name = "blake2s-224",
++ .base.cra_driver_name = "blake2s-224-x86",
++ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
++ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
++ .base.cra_priority = 200,
++ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
++ .base.cra_module = THIS_MODULE,
++
++ .digestsize = BLAKE2S_224_HASH_SIZE,
++ .setkey = crypto_blake2s_setkey,
++ .init = crypto_blake2s_init,
++ .update = crypto_blake2s_update,
++ .final = crypto_blake2s_final,
++ .descsize = sizeof(struct blake2s_state),
++}, {
++ .base.cra_name = "blake2s-256",
++ .base.cra_driver_name = "blake2s-256-x86",
++ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
++ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
++ .base.cra_priority = 200,
++ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
++ .base.cra_module = THIS_MODULE,
++
++ .digestsize = BLAKE2S_256_HASH_SIZE,
++ .setkey = crypto_blake2s_setkey,
++ .init = crypto_blake2s_init,
++ .update = crypto_blake2s_update,
++ .final = crypto_blake2s_final,
++ .descsize = sizeof(struct blake2s_state),
++}};
++
++static int __init blake2s_mod_init(void) /* turn on the static keys the CPU supports, then register the shash algorithms */
++{
++ if (!boot_cpu_has(X86_FEATURE_SSSE3)) /* no SSSE3: report success, but leave both keys off and register nothing */
++ return 0;
++
++ static_branch_enable(&blake2s_use_ssse3);
++
++ if (IS_ENABLED(CONFIG_AS_AVX512) && /* the AVX-512 routine is only assembled when the assembler supports it */
++ boot_cpu_has(X86_FEATURE_AVX) &&
++ boot_cpu_has(X86_FEATURE_AVX2) &&
++ boot_cpu_has(X86_FEATURE_AVX512F) &&
++ boot_cpu_has(X86_FEATURE_AVX512VL) &&
++ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
++ XFEATURE_MASK_AVX512, NULL)) /* CPU feature bits alone are not enough; the xfeature mask is checked as well */
++ static_branch_enable(&blake2s_use_avx512);
++
++ return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); /* a registration failure is returned as the init result */
++}
++
++static void __exit blake2s_mod_exit(void) /* undo the registration done by blake2s_mod_init() */
++{
++ if (boot_cpu_has(X86_FEATURE_SSSE3)) /* same condition under which blake2s_mod_init() registered the algorithms */
++ crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
++}
++
++module_init(blake2s_mod_init);
++module_exit(blake2s_mod_exit);
++
++MODULE_ALIAS_CRYPTO("blake2s-128");
++MODULE_ALIAS_CRYPTO("blake2s-128-x86");
++MODULE_ALIAS_CRYPTO("blake2s-160");
++MODULE_ALIAS_CRYPTO("blake2s-160-x86");
++MODULE_ALIAS_CRYPTO("blake2s-224");
++MODULE_ALIAS_CRYPTO("blake2s-224-x86");
++MODULE_ALIAS_CRYPTO("blake2s-256");
++MODULE_ALIAS_CRYPTO("blake2s-256-x86");
++MODULE_LICENSE("GPL v2");
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -657,6 +657,12 @@ config CRYPTO_BLAKE2S
+
+ See https://blake2.net for further information.
+
++config CRYPTO_BLAKE2S_X86
++ tristate "BLAKE2s digest algorithm (x86 accelerated version)"
++ depends on X86 && 64BIT
++ select CRYPTO_LIB_BLAKE2S_GENERIC
++ select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
++
+ config CRYPTO_CRCT10DIF
+ tristate "CRCT10DIF algorithm"
+ select CRYPTO_HASH