Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0043-crypto-x86-poly1305-wire-up-faster-implementations-f.patch')
-rw-r--r--  target/linux/generic/backport-5.4/080-wireguard-0043-crypto-x86-poly1305-wire-up-faster-implementations-f.patch  2927
1 file changed, 2927 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0043-crypto-x86-poly1305-wire-up-faster-implementations-f.patch b/target/linux/generic/backport-5.4/080-wireguard-0043-crypto-x86-poly1305-wire-up-faster-implementations-f.patch
new file mode 100644
index 0000000000..307c9b6ef3
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0043-crypto-x86-poly1305-wire-up-faster-implementations-f.patch
@@ -0,0 +1,2927 @@
+From a81b2f8bd42fe51705d7102e9d9a2a40c2a9d624 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Sun, 5 Jan 2020 22:40:48 -0500
+Subject: [PATCH 043/124] crypto: x86/poly1305 - wire up faster implementations
+ for kernel
+
+commit d7d7b853566254648df59f7ea27ea05952a6cfa8 upstream.
+
+These x86_64 vectorized implementations support AVX, AVX-2, and AVX-512F.
+The AVX-512F implementation is disabled on Skylake due to throttling, but
+it is quite fast on >= Cannonlake.
+
+On the left are cycle counts on a Core i7 6700HQ using the AVX-2
+codepath, comparing this implementation ("new") to the implementation in
+the current crypto api ("old"). On the right are benchmarks on a Xeon
+Gold 5120 using the AVX-512 codepath. The new implementation is faster
+on all benchmarks.
+
+ AVX-2 AVX-512
+ --------- -----------
+
+ size old new size old new
+ ---- ---- ---- ---- ---- ----
+ 0 70 68 0 74 70
+ 16 92 90 16 96 92
+ 32 134 104 32 136 106
+ 48 172 120 48 184 124
+ 64 218 136 64 218 138
+ 80 254 158 80 260 160
+ 96 298 174 96 300 176
+ 112 342 192 112 342 194
+ 128 388 212 128 384 212
+ 144 428 228 144 420 226
+ 160 466 246 160 464 248
+ 176 510 264 176 504 264
+ 192 550 282 192 544 282
+ 208 594 302 208 582 300
+ 224 628 316 224 624 318
+ 240 676 334 240 662 338
+ 256 716 354 256 708 358
+ 272 764 374 272 748 372
+ 288 802 352 288 788 358
+ 304 420 366 304 422 370
+ 320 428 360 320 432 364
+ 336 484 378 336 486 380
+ 352 426 384 352 434 390
+ 368 478 400 368 480 408
+ 384 488 394 384 490 398
+ 400 542 408 400 542 412
+ 416 486 416 416 492 426
+ 432 534 430 432 538 436
+ 448 544 422 448 546 432
+ 464 600 438 464 600 448
+ 480 540 448 480 548 456
+ 496 594 464 496 594 476
+ 512 602 456 512 606 470
+ 528 656 476 528 656 480
+ 544 600 480 544 606 498
+ 560 650 494 560 652 512
+ 576 664 490 576 662 508
+ 592 714 508 592 716 522
+ 608 656 514 608 664 538
+ 624 708 532 624 710 552
+ 640 716 524 640 720 516
+ 656 770 536 656 772 526
+ 672 716 548 672 722 544
+ 688 770 562 688 768 556
+ 704 774 552 704 778 556
+ 720 826 568 720 832 568
+ 736 768 574 736 780 584
+ 752 822 592 752 826 600
+ 768 830 584 768 836 560
+ 784 884 602 784 888 572
+ 800 828 610 800 838 588
+ 816 884 628 816 884 604
+ 832 888 618 832 894 598
+ 848 942 632 848 946 612
+ 864 884 644 864 896 628
+ 880 936 660 880 942 644
+ 896 948 652 896 952 608
+ 912 1000 664 912 1004 616
+ 928 942 676 928 954 634
+ 944 994 690 944 1000 646
+ 960 1002 680 960 1008 646
+ 976 1054 694 976 1062 658
+ 992 1002 706 992 1012 674
+ 1008 1052 720 1008 1058 690
+
+This commit wires in the prior implementation from Andy, and makes the
+following changes so that it is suitable for kernel land.
+
+  - Some cosmetic and structural changes, like renaming labels to
+    .Lname, using constants, and following other Linux conventions, as
+    well as making the code easier for us to maintain moving forward.
+
+  - CPU feature checking is done in C by the glue code (see the sketch
+    below).
+
+ - We avoid jumping into the middle of functions, to appease objtool,
+ and instead parameterize shared code.
+
+ - We maintain frame pointers so that stack traces make sense.
+
+  - We remove the dependency on the perl xlate code, which transforms
+    the output into formats used by assemblers we don't care about.
+
+Importantly, none of our changes affect the arithmetic or core code, but
+just involve the differing environment of kernel space.
+
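+To illustrate the CPU feature checking mentioned above: the glue code can
+gate each implementation behind a static key at module init, roughly along
+the lines of the minimal sketch below. The sketch is based on the
+poly1305_use_avx/avx2/avx512 static keys and the <asm/intel-family.h>
+include visible later in this diff; the exact feature conditions in the
+real glue code may differ.
+
+    static int __init poly1305_simd_mod_init(void)
+    {
+            if (boot_cpu_has(X86_FEATURE_AVX) &&
+                cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+                    static_branch_enable(&poly1305_use_avx);
+            if (boot_cpu_has(X86_FEATURE_AVX) &&
+                boot_cpu_has(X86_FEATURE_AVX2) &&
+                cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+                    static_branch_enable(&poly1305_use_avx2);
+            if (IS_ENABLED(CONFIG_AS_AVX512) &&
+                boot_cpu_has(X86_FEATURE_AVX512F) &&
+                cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+                                  XFEATURE_MASK_AVX512, NULL) &&
+                /* AVX-512F downclocks on Skylake-X, so it is skipped there. */
+                boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
+                    static_branch_enable(&poly1305_use_avx512);
+            return 0;
+    }
+
+With static keys, the per-call dispatch costs a patched jump instead of a
+runtime CPU feature test once the keys are set at init.
+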
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
+Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/.gitignore | 1 +
+ arch/x86/crypto/Makefile | 11 +-
+ arch/x86/crypto/poly1305-avx2-x86_64.S | 390 ----------
+ arch/x86/crypto/poly1305-sse2-x86_64.S | 590 ---------------
+ arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 ++++++++++--------
+ arch/x86/crypto/poly1305_glue.c | 473 +++++-------
+ lib/crypto/Kconfig | 2 +-
+ 7 files changed, 572 insertions(+), 1577 deletions(-)
+ create mode 100644 arch/x86/crypto/.gitignore
+ delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S
+ delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S
+
+--- /dev/null
++++ b/arch/x86/crypto/.gitignore
+@@ -0,0 +1 @@
++poly1305-x86_64.S
+--- a/arch/x86/crypto/Makefile
++++ b/arch/x86/crypto/Makefile
+@@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o
+
+ nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+ blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
++poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
++ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
++targets += poly1305-x86_64-cryptogams.S
++endif
+
+ ifeq ($(avx_supported),yes)
+ camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+@@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni
+ aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
+ ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
+ sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+-poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
+ ifeq ($(avx2_supported),yes)
+ sha1-ssse3-y += sha1_avx2_x86_64_asm.o
+-poly1305-x86_64-y += poly1305-avx2-x86_64.o
+ endif
+ ifeq ($(sha1_ni_supported),yes)
+ sha1-ssse3-y += sha1_ni_asm.o
+@@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o
+ endif
+ sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+ crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
++
++quiet_cmd_perlasm = PERLASM $@
++ cmd_perlasm = $(PERL) $< > $@
++$(obj)/%.S: $(src)/%.pl FORCE
++ $(call if_changed,perlasm)
+--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
++++ /dev/null
+@@ -1,390 +0,0 @@
+-/* SPDX-License-Identifier: GPL-2.0-or-later */
+-/*
+- * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
+- *
+- * Copyright (C) 2015 Martin Willi
+- */
+-
+-#include <linux/linkage.h>
+-
+-.section .rodata.cst32.ANMASK, "aM", @progbits, 32
+-.align 32
+-ANMASK: .octa 0x0000000003ffffff0000000003ffffff
+- .octa 0x0000000003ffffff0000000003ffffff
+-
+-.section .rodata.cst32.ORMASK, "aM", @progbits, 32
+-.align 32
+-ORMASK: .octa 0x00000000010000000000000001000000
+- .octa 0x00000000010000000000000001000000
+-
+-.text
+-
+-#define h0 0x00(%rdi)
+-#define h1 0x04(%rdi)
+-#define h2 0x08(%rdi)
+-#define h3 0x0c(%rdi)
+-#define h4 0x10(%rdi)
+-#define r0 0x00(%rdx)
+-#define r1 0x04(%rdx)
+-#define r2 0x08(%rdx)
+-#define r3 0x0c(%rdx)
+-#define r4 0x10(%rdx)
+-#define u0 0x00(%r8)
+-#define u1 0x04(%r8)
+-#define u2 0x08(%r8)
+-#define u3 0x0c(%r8)
+-#define u4 0x10(%r8)
+-#define w0 0x18(%r8)
+-#define w1 0x1c(%r8)
+-#define w2 0x20(%r8)
+-#define w3 0x24(%r8)
+-#define w4 0x28(%r8)
+-#define y0 0x30(%r8)
+-#define y1 0x34(%r8)
+-#define y2 0x38(%r8)
+-#define y3 0x3c(%r8)
+-#define y4 0x40(%r8)
+-#define m %rsi
+-#define hc0 %ymm0
+-#define hc1 %ymm1
+-#define hc2 %ymm2
+-#define hc3 %ymm3
+-#define hc4 %ymm4
+-#define hc0x %xmm0
+-#define hc1x %xmm1
+-#define hc2x %xmm2
+-#define hc3x %xmm3
+-#define hc4x %xmm4
+-#define t1 %ymm5
+-#define t2 %ymm6
+-#define t1x %xmm5
+-#define t2x %xmm6
+-#define ruwy0 %ymm7
+-#define ruwy1 %ymm8
+-#define ruwy2 %ymm9
+-#define ruwy3 %ymm10
+-#define ruwy4 %ymm11
+-#define ruwy0x %xmm7
+-#define ruwy1x %xmm8
+-#define ruwy2x %xmm9
+-#define ruwy3x %xmm10
+-#define ruwy4x %xmm11
+-#define svxz1 %ymm12
+-#define svxz2 %ymm13
+-#define svxz3 %ymm14
+-#define svxz4 %ymm15
+-#define d0 %r9
+-#define d1 %r10
+-#define d2 %r11
+-#define d3 %r12
+-#define d4 %r13
+-
+-ENTRY(poly1305_4block_avx2)
+- # %rdi: Accumulator h[5]
+- # %rsi: 64 byte input block m
+- # %rdx: Poly1305 key r[5]
+- # %rcx: Quadblock count
+- # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
+-
+- # This four-block variant uses loop unrolled block processing. It
+- # requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
+- # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
+-
+- vzeroupper
+- push %rbx
+- push %r12
+- push %r13
+-
+- # combine r0,u0,w0,y0
+- vmovd y0,ruwy0x
+- vmovd w0,t1x
+- vpunpcklqdq t1,ruwy0,ruwy0
+- vmovd u0,t1x
+- vmovd r0,t2x
+- vpunpcklqdq t2,t1,t1
+- vperm2i128 $0x20,t1,ruwy0,ruwy0
+-
+- # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
+- vmovd y1,ruwy1x
+- vmovd w1,t1x
+- vpunpcklqdq t1,ruwy1,ruwy1
+- vmovd u1,t1x
+- vmovd r1,t2x
+- vpunpcklqdq t2,t1,t1
+- vperm2i128 $0x20,t1,ruwy1,ruwy1
+- vpslld $2,ruwy1,svxz1
+- vpaddd ruwy1,svxz1,svxz1
+-
+- # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
+- vmovd y2,ruwy2x
+- vmovd w2,t1x
+- vpunpcklqdq t1,ruwy2,ruwy2
+- vmovd u2,t1x
+- vmovd r2,t2x
+- vpunpcklqdq t2,t1,t1
+- vperm2i128 $0x20,t1,ruwy2,ruwy2
+- vpslld $2,ruwy2,svxz2
+- vpaddd ruwy2,svxz2,svxz2
+-
+- # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
+- vmovd y3,ruwy3x
+- vmovd w3,t1x
+- vpunpcklqdq t1,ruwy3,ruwy3
+- vmovd u3,t1x
+- vmovd r3,t2x
+- vpunpcklqdq t2,t1,t1
+- vperm2i128 $0x20,t1,ruwy3,ruwy3
+- vpslld $2,ruwy3,svxz3
+- vpaddd ruwy3,svxz3,svxz3
+-
+- # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
+- vmovd y4,ruwy4x
+- vmovd w4,t1x
+- vpunpcklqdq t1,ruwy4,ruwy4
+- vmovd u4,t1x
+- vmovd r4,t2x
+- vpunpcklqdq t2,t1,t1
+- vperm2i128 $0x20,t1,ruwy4,ruwy4
+- vpslld $2,ruwy4,svxz4
+- vpaddd ruwy4,svxz4,svxz4
+-
+-.Ldoblock4:
+- # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
+- # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
+- vmovd 0x00(m),hc0x
+- vmovd 0x10(m),t1x
+- vpunpcklqdq t1,hc0,hc0
+- vmovd 0x20(m),t1x
+- vmovd 0x30(m),t2x
+- vpunpcklqdq t2,t1,t1
+- vperm2i128 $0x20,t1,hc0,hc0
+- vpand ANMASK(%rip),hc0,hc0
+- vmovd h0,t1x
+- vpaddd t1,hc0,hc0
+- # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
+- # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
+- vmovd 0x03(m),hc1x
+- vmovd 0x13(m),t1x
+- vpunpcklqdq t1,hc1,hc1
+- vmovd 0x23(m),t1x
+- vmovd 0x33(m),t2x
+- vpunpcklqdq t2,t1,t1
+- vperm2i128 $0x20,t1,hc1,hc1
+- vpsrld $2,hc1,hc1
+- vpand ANMASK(%rip),hc1,hc1
+- vmovd h1,t1x
+- vpaddd t1,hc1,hc1
+- # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
+- # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
+- vmovd 0x06(m),hc2x
+- vmovd 0x16(m),t1x
+- vpunpcklqdq t1,hc2,hc2
+- vmovd 0x26(m),t1x
+- vmovd 0x36(m),t2x
+- vpunpcklqdq t2,t1,t1
+- vperm2i128 $0x20,t1,hc2,hc2
+- vpsrld $4,hc2,hc2
+- vpand ANMASK(%rip),hc2,hc2
+- vmovd h2,t1x
+- vpaddd t1,hc2,hc2
+- # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
+- # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
+- vmovd 0x09(m),hc3x
+- vmovd 0x19(m),t1x
+- vpunpcklqdq t1,hc3,hc3
+- vmovd 0x29(m),t1x
+- vmovd 0x39(m),t2x
+- vpunpcklqdq t2,t1,t1
+- vperm2i128 $0x20,t1,hc3,hc3
+- vpsrld $6,hc3,hc3
+- vpand ANMASK(%rip),hc3,hc3
+- vmovd h3,t1x
+- vpaddd t1,hc3,hc3
+- # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
+- # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
+- vmovd 0x0c(m),hc4x
+- vmovd 0x1c(m),t1x
+- vpunpcklqdq t1,hc4,hc4
+- vmovd 0x2c(m),t1x
+- vmovd 0x3c(m),t2x
+- vpunpcklqdq t2,t1,t1
+- vperm2i128 $0x20,t1,hc4,hc4
+- vpsrld $8,hc4,hc4
+- vpor ORMASK(%rip),hc4,hc4
+- vmovd h4,t1x
+- vpaddd t1,hc4,hc4
+-
+- # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
+- vpmuludq hc0,ruwy0,t1
+- # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
+- vpmuludq hc1,svxz4,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
+- vpmuludq hc2,svxz3,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
+- vpmuludq hc3,svxz2,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
+- vpmuludq hc4,svxz1,t2
+- vpaddq t2,t1,t1
+- # d0 = t1[0] + t1[1] + t[2] + t[3]
+- vpermq $0xee,t1,t2
+- vpaddq t2,t1,t1
+- vpsrldq $8,t1,t2
+- vpaddq t2,t1,t1
+- vmovq t1x,d0
+-
+- # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
+- vpmuludq hc0,ruwy1,t1
+- # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
+- vpmuludq hc1,ruwy0,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
+- vpmuludq hc2,svxz4,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
+- vpmuludq hc3,svxz3,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
+- vpmuludq hc4,svxz2,t2
+- vpaddq t2,t1,t1
+- # d1 = t1[0] + t1[1] + t1[3] + t1[4]
+- vpermq $0xee,t1,t2
+- vpaddq t2,t1,t1
+- vpsrldq $8,t1,t2
+- vpaddq t2,t1,t1
+- vmovq t1x,d1
+-
+- # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
+- vpmuludq hc0,ruwy2,t1
+- # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
+- vpmuludq hc1,ruwy1,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
+- vpmuludq hc2,ruwy0,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
+- vpmuludq hc3,svxz4,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
+- vpmuludq hc4,svxz3,t2
+- vpaddq t2,t1,t1
+- # d2 = t1[0] + t1[1] + t1[2] + t1[3]
+- vpermq $0xee,t1,t2
+- vpaddq t2,t1,t1
+- vpsrldq $8,t1,t2
+- vpaddq t2,t1,t1
+- vmovq t1x,d2
+-
+- # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
+- vpmuludq hc0,ruwy3,t1
+- # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
+- vpmuludq hc1,ruwy2,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
+- vpmuludq hc2,ruwy1,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
+- vpmuludq hc3,ruwy0,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
+- vpmuludq hc4,svxz4,t2
+- vpaddq t2,t1,t1
+- # d3 = t1[0] + t1[1] + t1[2] + t1[3]
+- vpermq $0xee,t1,t2
+- vpaddq t2,t1,t1
+- vpsrldq $8,t1,t2
+- vpaddq t2,t1,t1
+- vmovq t1x,d3
+-
+- # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
+- vpmuludq hc0,ruwy4,t1
+- # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
+- vpmuludq hc1,ruwy3,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
+- vpmuludq hc2,ruwy2,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
+- vpmuludq hc3,ruwy1,t2
+- vpaddq t2,t1,t1
+- # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
+- vpmuludq hc4,ruwy0,t2
+- vpaddq t2,t1,t1
+- # d4 = t1[0] + t1[1] + t1[2] + t1[3]
+- vpermq $0xee,t1,t2
+- vpaddq t2,t1,t1
+- vpsrldq $8,t1,t2
+- vpaddq t2,t1,t1
+- vmovq t1x,d4
+-
+- # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+- # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+- # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+- # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+- # integers. It's true in a single-block implementation, but not here.
+-
+- # d1 += d0 >> 26
+- mov d0,%rax
+- shr $26,%rax
+- add %rax,d1
+- # h0 = d0 & 0x3ffffff
+- mov d0,%rbx
+- and $0x3ffffff,%ebx
+-
+- # d2 += d1 >> 26
+- mov d1,%rax
+- shr $26,%rax
+- add %rax,d2
+- # h1 = d1 & 0x3ffffff
+- mov d1,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h1
+-
+- # d3 += d2 >> 26
+- mov d2,%rax
+- shr $26,%rax
+- add %rax,d3
+- # h2 = d2 & 0x3ffffff
+- mov d2,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h2
+-
+- # d4 += d3 >> 26
+- mov d3,%rax
+- shr $26,%rax
+- add %rax,d4
+- # h3 = d3 & 0x3ffffff
+- mov d3,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h3
+-
+- # h0 += (d4 >> 26) * 5
+- mov d4,%rax
+- shr $26,%rax
+- lea (%rax,%rax,4),%rax
+- add %rax,%rbx
+- # h4 = d4 & 0x3ffffff
+- mov d4,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h4
+-
+- # h1 += h0 >> 26
+- mov %rbx,%rax
+- shr $26,%rax
+- add %eax,h1
+- # h0 = h0 & 0x3ffffff
+- andl $0x3ffffff,%ebx
+- mov %ebx,h0
+-
+- add $0x40,m
+- dec %rcx
+- jnz .Ldoblock4
+-
+- vzeroupper
+- pop %r13
+- pop %r12
+- pop %rbx
+- ret
+-ENDPROC(poly1305_4block_avx2)
+--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
++++ /dev/null
+@@ -1,590 +0,0 @@
+-/* SPDX-License-Identifier: GPL-2.0-or-later */
+-/*
+- * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
+- *
+- * Copyright (C) 2015 Martin Willi
+- */
+-
+-#include <linux/linkage.h>
+-
+-.section .rodata.cst16.ANMASK, "aM", @progbits, 16
+-.align 16
+-ANMASK: .octa 0x0000000003ffffff0000000003ffffff
+-
+-.section .rodata.cst16.ORMASK, "aM", @progbits, 16
+-.align 16
+-ORMASK: .octa 0x00000000010000000000000001000000
+-
+-.text
+-
+-#define h0 0x00(%rdi)
+-#define h1 0x04(%rdi)
+-#define h2 0x08(%rdi)
+-#define h3 0x0c(%rdi)
+-#define h4 0x10(%rdi)
+-#define r0 0x00(%rdx)
+-#define r1 0x04(%rdx)
+-#define r2 0x08(%rdx)
+-#define r3 0x0c(%rdx)
+-#define r4 0x10(%rdx)
+-#define s1 0x00(%rsp)
+-#define s2 0x04(%rsp)
+-#define s3 0x08(%rsp)
+-#define s4 0x0c(%rsp)
+-#define m %rsi
+-#define h01 %xmm0
+-#define h23 %xmm1
+-#define h44 %xmm2
+-#define t1 %xmm3
+-#define t2 %xmm4
+-#define t3 %xmm5
+-#define t4 %xmm6
+-#define mask %xmm7
+-#define d0 %r8
+-#define d1 %r9
+-#define d2 %r10
+-#define d3 %r11
+-#define d4 %r12
+-
+-ENTRY(poly1305_block_sse2)
+- # %rdi: Accumulator h[5]
+- # %rsi: 16 byte input block m
+- # %rdx: Poly1305 key r[5]
+- # %rcx: Block count
+-
+- # This single block variant tries to improve performance by doing two
+- # multiplications in parallel using SSE instructions. There is quite
+- # some quardword packing involved, hence the speedup is marginal.
+-
+- push %rbx
+- push %r12
+- sub $0x10,%rsp
+-
+- # s1..s4 = r1..r4 * 5
+- mov r1,%eax
+- lea (%eax,%eax,4),%eax
+- mov %eax,s1
+- mov r2,%eax
+- lea (%eax,%eax,4),%eax
+- mov %eax,s2
+- mov r3,%eax
+- lea (%eax,%eax,4),%eax
+- mov %eax,s3
+- mov r4,%eax
+- lea (%eax,%eax,4),%eax
+- mov %eax,s4
+-
+- movdqa ANMASK(%rip),mask
+-
+-.Ldoblock:
+- # h01 = [0, h1, 0, h0]
+- # h23 = [0, h3, 0, h2]
+- # h44 = [0, h4, 0, h4]
+- movd h0,h01
+- movd h1,t1
+- movd h2,h23
+- movd h3,t2
+- movd h4,h44
+- punpcklqdq t1,h01
+- punpcklqdq t2,h23
+- punpcklqdq h44,h44
+-
+- # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
+- movd 0x00(m),t1
+- movd 0x03(m),t2
+- psrld $2,t2
+- punpcklqdq t2,t1
+- pand mask,t1
+- paddd t1,h01
+- # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
+- movd 0x06(m),t1
+- movd 0x09(m),t2
+- psrld $4,t1
+- psrld $6,t2
+- punpcklqdq t2,t1
+- pand mask,t1
+- paddd t1,h23
+- # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
+- mov 0x0c(m),%eax
+- shr $8,%eax
+- or $0x01000000,%eax
+- movd %eax,t1
+- pshufd $0xc4,t1,t1
+- paddd t1,h44
+-
+- # t1[0] = h0 * r0 + h2 * s3
+- # t1[1] = h1 * s4 + h3 * s2
+- movd r0,t1
+- movd s4,t2
+- punpcklqdq t2,t1
+- pmuludq h01,t1
+- movd s3,t2
+- movd s2,t3
+- punpcklqdq t3,t2
+- pmuludq h23,t2
+- paddq t2,t1
+- # t2[0] = h0 * r1 + h2 * s4
+- # t2[1] = h1 * r0 + h3 * s3
+- movd r1,t2
+- movd r0,t3
+- punpcklqdq t3,t2
+- pmuludq h01,t2
+- movd s4,t3
+- movd s3,t4
+- punpcklqdq t4,t3
+- pmuludq h23,t3
+- paddq t3,t2
+- # t3[0] = h4 * s1
+- # t3[1] = h4 * s2
+- movd s1,t3
+- movd s2,t4
+- punpcklqdq t4,t3
+- pmuludq h44,t3
+- # d0 = t1[0] + t1[1] + t3[0]
+- # d1 = t2[0] + t2[1] + t3[1]
+- movdqa t1,t4
+- punpcklqdq t2,t4
+- punpckhqdq t2,t1
+- paddq t4,t1
+- paddq t3,t1
+- movq t1,d0
+- psrldq $8,t1
+- movq t1,d1
+-
+- # t1[0] = h0 * r2 + h2 * r0
+- # t1[1] = h1 * r1 + h3 * s4
+- movd r2,t1
+- movd r1,t2
+- punpcklqdq t2,t1
+- pmuludq h01,t1
+- movd r0,t2
+- movd s4,t3
+- punpcklqdq t3,t2
+- pmuludq h23,t2
+- paddq t2,t1
+- # t2[0] = h0 * r3 + h2 * r1
+- # t2[1] = h1 * r2 + h3 * r0
+- movd r3,t2
+- movd r2,t3
+- punpcklqdq t3,t2
+- pmuludq h01,t2
+- movd r1,t3
+- movd r0,t4
+- punpcklqdq t4,t3
+- pmuludq h23,t3
+- paddq t3,t2
+- # t3[0] = h4 * s3
+- # t3[1] = h4 * s4
+- movd s3,t3
+- movd s4,t4
+- punpcklqdq t4,t3
+- pmuludq h44,t3
+- # d2 = t1[0] + t1[1] + t3[0]
+- # d3 = t2[0] + t2[1] + t3[1]
+- movdqa t1,t4
+- punpcklqdq t2,t4
+- punpckhqdq t2,t1
+- paddq t4,t1
+- paddq t3,t1
+- movq t1,d2
+- psrldq $8,t1
+- movq t1,d3
+-
+- # t1[0] = h0 * r4 + h2 * r2
+- # t1[1] = h1 * r3 + h3 * r1
+- movd r4,t1
+- movd r3,t2
+- punpcklqdq t2,t1
+- pmuludq h01,t1
+- movd r2,t2
+- movd r1,t3
+- punpcklqdq t3,t2
+- pmuludq h23,t2
+- paddq t2,t1
+- # t3[0] = h4 * r0
+- movd r0,t3
+- pmuludq h44,t3
+- # d4 = t1[0] + t1[1] + t3[0]
+- movdqa t1,t4
+- psrldq $8,t4
+- paddq t4,t1
+- paddq t3,t1
+- movq t1,d4
+-
+- # d1 += d0 >> 26
+- mov d0,%rax
+- shr $26,%rax
+- add %rax,d1
+- # h0 = d0 & 0x3ffffff
+- mov d0,%rbx
+- and $0x3ffffff,%ebx
+-
+- # d2 += d1 >> 26
+- mov d1,%rax
+- shr $26,%rax
+- add %rax,d2
+- # h1 = d1 & 0x3ffffff
+- mov d1,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h1
+-
+- # d3 += d2 >> 26
+- mov d2,%rax
+- shr $26,%rax
+- add %rax,d3
+- # h2 = d2 & 0x3ffffff
+- mov d2,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h2
+-
+- # d4 += d3 >> 26
+- mov d3,%rax
+- shr $26,%rax
+- add %rax,d4
+- # h3 = d3 & 0x3ffffff
+- mov d3,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h3
+-
+- # h0 += (d4 >> 26) * 5
+- mov d4,%rax
+- shr $26,%rax
+- lea (%rax,%rax,4),%rax
+- add %rax,%rbx
+- # h4 = d4 & 0x3ffffff
+- mov d4,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h4
+-
+- # h1 += h0 >> 26
+- mov %rbx,%rax
+- shr $26,%rax
+- add %eax,h1
+- # h0 = h0 & 0x3ffffff
+- andl $0x3ffffff,%ebx
+- mov %ebx,h0
+-
+- add $0x10,m
+- dec %rcx
+- jnz .Ldoblock
+-
+- # Zeroing of key material
+- mov %rcx,0x00(%rsp)
+- mov %rcx,0x08(%rsp)
+-
+- add $0x10,%rsp
+- pop %r12
+- pop %rbx
+- ret
+-ENDPROC(poly1305_block_sse2)
+-
+-
+-#define u0 0x00(%r8)
+-#define u1 0x04(%r8)
+-#define u2 0x08(%r8)
+-#define u3 0x0c(%r8)
+-#define u4 0x10(%r8)
+-#define hc0 %xmm0
+-#define hc1 %xmm1
+-#define hc2 %xmm2
+-#define hc3 %xmm5
+-#define hc4 %xmm6
+-#define ru0 %xmm7
+-#define ru1 %xmm8
+-#define ru2 %xmm9
+-#define ru3 %xmm10
+-#define ru4 %xmm11
+-#define sv1 %xmm12
+-#define sv2 %xmm13
+-#define sv3 %xmm14
+-#define sv4 %xmm15
+-#undef d0
+-#define d0 %r13
+-
+-ENTRY(poly1305_2block_sse2)
+- # %rdi: Accumulator h[5]
+- # %rsi: 16 byte input block m
+- # %rdx: Poly1305 key r[5]
+- # %rcx: Doubleblock count
+- # %r8: Poly1305 derived key r^2 u[5]
+-
+- # This two-block variant further improves performance by using loop
+- # unrolled block processing. This is more straight forward and does
+- # less byte shuffling, but requires a second Poly1305 key r^2:
+- # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
+-
+- push %rbx
+- push %r12
+- push %r13
+-
+- # combine r0,u0
+- movd u0,ru0
+- movd r0,t1
+- punpcklqdq t1,ru0
+-
+- # combine r1,u1 and s1=r1*5,v1=u1*5
+- movd u1,ru1
+- movd r1,t1
+- punpcklqdq t1,ru1
+- movdqa ru1,sv1
+- pslld $2,sv1
+- paddd ru1,sv1
+-
+- # combine r2,u2 and s2=r2*5,v2=u2*5
+- movd u2,ru2
+- movd r2,t1
+- punpcklqdq t1,ru2
+- movdqa ru2,sv2
+- pslld $2,sv2
+- paddd ru2,sv2
+-
+- # combine r3,u3 and s3=r3*5,v3=u3*5
+- movd u3,ru3
+- movd r3,t1
+- punpcklqdq t1,ru3
+- movdqa ru3,sv3
+- pslld $2,sv3
+- paddd ru3,sv3
+-
+- # combine r4,u4 and s4=r4*5,v4=u4*5
+- movd u4,ru4
+- movd r4,t1
+- punpcklqdq t1,ru4
+- movdqa ru4,sv4
+- pslld $2,sv4
+- paddd ru4,sv4
+-
+-.Ldoblock2:
+- # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
+- movd 0x00(m),hc0
+- movd 0x10(m),t1
+- punpcklqdq t1,hc0
+- pand ANMASK(%rip),hc0
+- movd h0,t1
+- paddd t1,hc0
+- # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
+- movd 0x03(m),hc1
+- movd 0x13(m),t1
+- punpcklqdq t1,hc1
+- psrld $2,hc1
+- pand ANMASK(%rip),hc1
+- movd h1,t1
+- paddd t1,hc1
+- # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
+- movd 0x06(m),hc2
+- movd 0x16(m),t1
+- punpcklqdq t1,hc2
+- psrld $4,hc2
+- pand ANMASK(%rip),hc2
+- movd h2,t1
+- paddd t1,hc2
+- # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
+- movd 0x09(m),hc3
+- movd 0x19(m),t1
+- punpcklqdq t1,hc3
+- psrld $6,hc3
+- pand ANMASK(%rip),hc3
+- movd h3,t1
+- paddd t1,hc3
+- # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
+- movd 0x0c(m),hc4
+- movd 0x1c(m),t1
+- punpcklqdq t1,hc4
+- psrld $8,hc4
+- por ORMASK(%rip),hc4
+- movd h4,t1
+- paddd t1,hc4
+-
+- # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
+- movdqa ru0,t1
+- pmuludq hc0,t1
+- # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
+- movdqa sv4,t2
+- pmuludq hc1,t2
+- paddq t2,t1
+- # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
+- movdqa sv3,t2
+- pmuludq hc2,t2
+- paddq t2,t1
+- # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
+- movdqa sv2,t2
+- pmuludq hc3,t2
+- paddq t2,t1
+- # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
+- movdqa sv1,t2
+- pmuludq hc4,t2
+- paddq t2,t1
+- # d0 = t1[0] + t1[1]
+- movdqa t1,t2
+- psrldq $8,t2
+- paddq t2,t1
+- movq t1,d0
+-
+- # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
+- movdqa ru1,t1
+- pmuludq hc0,t1
+- # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
+- movdqa ru0,t2
+- pmuludq hc1,t2
+- paddq t2,t1
+- # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
+- movdqa sv4,t2
+- pmuludq hc2,t2
+- paddq t2,t1
+- # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
+- movdqa sv3,t2
+- pmuludq hc3,t2
+- paddq t2,t1
+- # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
+- movdqa sv2,t2
+- pmuludq hc4,t2
+- paddq t2,t1
+- # d1 = t1[0] + t1[1]
+- movdqa t1,t2
+- psrldq $8,t2
+- paddq t2,t1
+- movq t1,d1
+-
+- # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
+- movdqa ru2,t1
+- pmuludq hc0,t1
+- # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
+- movdqa ru1,t2
+- pmuludq hc1,t2
+- paddq t2,t1
+- # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
+- movdqa ru0,t2
+- pmuludq hc2,t2
+- paddq t2,t1
+- # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
+- movdqa sv4,t2
+- pmuludq hc3,t2
+- paddq t2,t1
+- # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
+- movdqa sv3,t2
+- pmuludq hc4,t2
+- paddq t2,t1
+- # d2 = t1[0] + t1[1]
+- movdqa t1,t2
+- psrldq $8,t2
+- paddq t2,t1
+- movq t1,d2
+-
+- # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
+- movdqa ru3,t1
+- pmuludq hc0,t1
+- # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
+- movdqa ru2,t2
+- pmuludq hc1,t2
+- paddq t2,t1
+- # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
+- movdqa ru1,t2
+- pmuludq hc2,t2
+- paddq t2,t1
+- # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
+- movdqa ru0,t2
+- pmuludq hc3,t2
+- paddq t2,t1
+- # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
+- movdqa sv4,t2
+- pmuludq hc4,t2
+- paddq t2,t1
+- # d3 = t1[0] + t1[1]
+- movdqa t1,t2
+- psrldq $8,t2
+- paddq t2,t1
+- movq t1,d3
+-
+- # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
+- movdqa ru4,t1
+- pmuludq hc0,t1
+- # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
+- movdqa ru3,t2
+- pmuludq hc1,t2
+- paddq t2,t1
+- # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
+- movdqa ru2,t2
+- pmuludq hc2,t2
+- paddq t2,t1
+- # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
+- movdqa ru1,t2
+- pmuludq hc3,t2
+- paddq t2,t1
+- # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
+- movdqa ru0,t2
+- pmuludq hc4,t2
+- paddq t2,t1
+- # d4 = t1[0] + t1[1]
+- movdqa t1,t2
+- psrldq $8,t2
+- paddq t2,t1
+- movq t1,d4
+-
+- # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+- # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+- # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+- # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+- # integers. It's true in a single-block implementation, but not here.
+-
+- # d1 += d0 >> 26
+- mov d0,%rax
+- shr $26,%rax
+- add %rax,d1
+- # h0 = d0 & 0x3ffffff
+- mov d0,%rbx
+- and $0x3ffffff,%ebx
+-
+- # d2 += d1 >> 26
+- mov d1,%rax
+- shr $26,%rax
+- add %rax,d2
+- # h1 = d1 & 0x3ffffff
+- mov d1,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h1
+-
+- # d3 += d2 >> 26
+- mov d2,%rax
+- shr $26,%rax
+- add %rax,d3
+- # h2 = d2 & 0x3ffffff
+- mov d2,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h2
+-
+- # d4 += d3 >> 26
+- mov d3,%rax
+- shr $26,%rax
+- add %rax,d4
+- # h3 = d3 & 0x3ffffff
+- mov d3,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h3
+-
+- # h0 += (d4 >> 26) * 5
+- mov d4,%rax
+- shr $26,%rax
+- lea (%rax,%rax,4),%rax
+- add %rax,%rbx
+- # h4 = d4 & 0x3ffffff
+- mov d4,%rax
+- and $0x3ffffff,%eax
+- mov %eax,h4
+-
+- # h1 += h0 >> 26
+- mov %rbx,%rax
+- shr $26,%rax
+- add %eax,h1
+- # h0 = h0 & 0x3ffffff
+- andl $0x3ffffff,%ebx
+- mov %ebx,h0
+-
+- add $0x20,m
+- dec %rcx
+- jnz .Ldoblock2
+-
+- pop %r13
+- pop %r12
+- pop %rbx
+- ret
+-ENDPROC(poly1305_2block_sse2)
+--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
++++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+@@ -1,11 +1,14 @@
+-#! /usr/bin/env perl
+-# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
++#!/usr/bin/env perl
++# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+ #
+-# Licensed under the OpenSSL license (the "License"). You may not use
+-# this file except in compliance with the License. You can obtain a copy
+-# in the file LICENSE in the source distribution or at
+-# https://www.openssl.org/source/license.html
+-
++# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
++# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
++#
++# This code is taken from the OpenSSL project but the author, Andy Polyakov,
++# has relicensed it under the licenses specified in the SPDX header above.
++# The original headers, including the original license headers, are
++# included below for completeness.
+ #
+ # ====================================================================
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@@ -32,7 +35,7 @@
+ # Skylake-X system performance. Since we are likely to suppress
+ # AVX512F capability flag [at least on Skylake-X], conversion serves
+ # as kind of "investment protection". Note that next *lake processor,
+-# Cannolake, has AVX512IFMA code path to execute...
++# Cannonlake, has AVX512IFMA code path to execute...
+ #
+ # Numbers are cycles per processed byte with poly1305_blocks alone,
+ # measured with rdtsc at fixed clock frequency.
+@@ -68,39 +71,114 @@ $output = shift;
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
++$kernel=0; $kernel=1 if (!$flavour && !$output);
+
+-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+-die "can't locate x86_64-xlate.pl";
+-
+-if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+- $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
++if (!$kernel) {
++ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
++ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
++ die "can't locate x86_64-xlate.pl";
++
++ open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
++ *STDOUT=*OUT;
++
++ if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
++ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
++ $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
++ }
++
++ if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
++ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
++ $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
++ $avx += 1 if ($1==2.11 && $2>=8);
++ }
++
++ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
++ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
++ $avx = ($1>=10) + ($1>=11);
++ }
++
++ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
++ $avx = ($2>=3.0) + ($2>3.0);
++ }
++} else {
++ $avx = 4; # The kernel uses ifdefs for this.
+ }
+
+-if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
+- $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
+- $avx += 2 if ($1==2.11 && $2>=8);
++sub declare_function() {
++ my ($name, $align, $nargs) = @_;
++ if($kernel) {
++ $code .= ".align $align\n";
++ $code .= "ENTRY($name)\n";
++ $code .= ".L$name:\n";
++ } else {
++ $code .= ".globl $name\n";
++ $code .= ".type $name,\@function,$nargs\n";
++ $code .= ".align $align\n";
++ $code .= "$name:\n";
++ }
+ }
+
+-if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+- $avx = ($1>=10) + ($1>=12);
++sub end_function() {
++ my ($name) = @_;
++ if($kernel) {
++ $code .= "ENDPROC($name)\n";
++ } else {
++ $code .= ".size $name,.-$name\n";
++ }
+ }
+
+-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+- $avx = ($2>=3.0) + ($2>3.0);
+-}
++$code.=<<___ if $kernel;
++#include <linux/linkage.h>
++___
+
+-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+-*STDOUT=*OUT;
++if ($avx) {
++$code.=<<___ if $kernel;
++.section .rodata
++___
++$code.=<<___;
++.align 64
++.Lconst:
++.Lmask24:
++.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
++.L129:
++.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
++.Lmask26:
++.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
++.Lpermd_avx2:
++.long 2,2,2,3,2,0,2,1
++.Lpermd_avx512:
++.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
++
++.L2_44_inp_permd:
++.long 0,1,1,2,2,3,7,7
++.L2_44_inp_shift:
++.quad 0,12,24,64
++.L2_44_mask:
++.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
++.L2_44_shift_rgt:
++.quad 44,44,42,64
++.L2_44_shift_lft:
++.quad 8,8,10,64
++
++.align 64
++.Lx_mask44:
++.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
++.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
++.Lx_mask42:
++.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
++.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
++___
++}
++$code.=<<___ if (!$kernel);
++.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
++.align 16
++___
+
+ my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
+ my ($mac,$nonce)=($inp,$len); # *_emit arguments
+-my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
+-my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
++my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
++my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
+
+ sub poly1305_iteration {
+ # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
+@@ -155,19 +233,19 @@ ___
+
+ $code.=<<___;
+ .text
+-
++___
++$code.=<<___ if (!$kernel);
+ .extern OPENSSL_ia32cap_P
+
+-.globl poly1305_init
+-.hidden poly1305_init
+-.globl poly1305_blocks
+-.hidden poly1305_blocks
+-.globl poly1305_emit
+-.hidden poly1305_emit
+-
+-.type poly1305_init,\@function,3
+-.align 32
+-poly1305_init:
++.globl poly1305_init_x86_64
++.hidden poly1305_init_x86_64
++.globl poly1305_blocks_x86_64
++.hidden poly1305_blocks_x86_64
++.globl poly1305_emit_x86_64
++.hidden poly1305_emit_x86_64
++___
++&declare_function("poly1305_init_x86_64", 32, 3);
++$code.=<<___;
+ xor %rax,%rax
+ mov %rax,0($ctx) # initialize hash value
+ mov %rax,8($ctx)
+@@ -175,11 +253,12 @@ poly1305_init:
+
+ cmp \$0,$inp
+ je .Lno_key
+-
+- lea poly1305_blocks(%rip),%r10
+- lea poly1305_emit(%rip),%r11
+ ___
+-$code.=<<___ if ($avx);
++$code.=<<___ if (!$kernel);
++ lea poly1305_blocks_x86_64(%rip),%r10
++ lea poly1305_emit_x86_64(%rip),%r11
++___
++$code.=<<___ if (!$kernel && $avx);
+ mov OPENSSL_ia32cap_P+4(%rip),%r9
+ lea poly1305_blocks_avx(%rip),%rax
+ lea poly1305_emit_avx(%rip),%rcx
+@@ -187,12 +266,12 @@ $code.=<<___ if ($avx);
+ cmovc %rax,%r10
+ cmovc %rcx,%r11
+ ___
+-$code.=<<___ if ($avx>1);
++$code.=<<___ if (!$kernel && $avx>1);
+ lea poly1305_blocks_avx2(%rip),%rax
+ bt \$`5+32`,%r9 # AVX2?
+ cmovc %rax,%r10
+ ___
+-$code.=<<___ if ($avx>3);
++$code.=<<___ if (!$kernel && $avx>3);
+ mov \$`(1<<31|1<<21|1<<16)`,%rax
+ shr \$32,%r9
+ and %rax,%r9
+@@ -207,11 +286,11 @@ $code.=<<___;
+ mov %rax,24($ctx)
+ mov %rcx,32($ctx)
+ ___
+-$code.=<<___ if ($flavour !~ /elf32/);
++$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
+ mov %r10,0(%rdx)
+ mov %r11,8(%rdx)
+ ___
+-$code.=<<___ if ($flavour =~ /elf32/);
++$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+ ___
+@@ -219,11 +298,11 @@ $code.=<<___;
+ mov \$1,%eax
+ .Lno_key:
+ ret
+-.size poly1305_init,.-poly1305_init
++___
++&end_function("poly1305_init_x86_64");
+
+-.type poly1305_blocks,\@function,4
+-.align 32
+-poly1305_blocks:
++&declare_function("poly1305_blocks_x86_64", 32, 4);
++$code.=<<___;
+ .cfi_startproc
+ .Lblocks:
+ shr \$4,$len
+@@ -231,8 +310,6 @@ poly1305_blocks:
+
+ push %rbx
+ .cfi_push %rbx
+- push %rbp
+-.cfi_push %rbp
+ push %r12
+ .cfi_push %r12
+ push %r13
+@@ -241,6 +318,8 @@ poly1305_blocks:
+ .cfi_push %r14
+ push %r15
+ .cfi_push %r15
++ push $ctx
++.cfi_push $ctx
+ .Lblocks_body:
+
+ mov $len,%r15 # reassign $len
+@@ -265,26 +344,29 @@ poly1305_blocks:
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ ___
++
+ &poly1305_iteration();
++
+ $code.=<<___;
+ mov $r1,%rax
+ dec %r15 # len-=16
+ jnz .Loop
+
++ mov 0(%rsp),$ctx
++.cfi_restore $ctx
++
+ mov $h0,0($ctx) # store hash value
+ mov $h1,8($ctx)
+ mov $h2,16($ctx)
+
+- mov 0(%rsp),%r15
++ mov 8(%rsp),%r15
+ .cfi_restore %r15
+- mov 8(%rsp),%r14
++ mov 16(%rsp),%r14
+ .cfi_restore %r14
+- mov 16(%rsp),%r13
++ mov 24(%rsp),%r13
+ .cfi_restore %r13
+- mov 24(%rsp),%r12
++ mov 32(%rsp),%r12
+ .cfi_restore %r12
+- mov 32(%rsp),%rbp
+-.cfi_restore %rbp
+ mov 40(%rsp),%rbx
+ .cfi_restore %rbx
+ lea 48(%rsp),%rsp
+@@ -293,11 +375,11 @@ $code.=<<___;
+ .Lblocks_epilogue:
+ ret
+ .cfi_endproc
+-.size poly1305_blocks,.-poly1305_blocks
++___
++&end_function("poly1305_blocks_x86_64");
+
+-.type poly1305_emit,\@function,3
+-.align 32
+-poly1305_emit:
++&declare_function("poly1305_emit_x86_64", 32, 3);
++$code.=<<___;
+ .Lemit:
+ mov 0($ctx),%r8 # load hash value
+ mov 8($ctx),%r9
+@@ -318,10 +400,14 @@ poly1305_emit:
+ mov %rcx,8($mac)
+
+ ret
+-.size poly1305_emit,.-poly1305_emit
+ ___
++&end_function("poly1305_emit_x86_64");
+ if ($avx) {
+
++if($kernel) {
++ $code .= "#ifdef CONFIG_AS_AVX\n";
++}
++
+ ########################################################################
+ # Layout of opaque area is following.
+ #
+@@ -342,15 +428,19 @@ $code.=<<___;
+ .type __poly1305_block,\@abi-omnipotent
+ .align 32
+ __poly1305_block:
++ push $ctx
+ ___
+ &poly1305_iteration();
+ $code.=<<___;
++ pop $ctx
+ ret
+ .size __poly1305_block,.-__poly1305_block
+
+ .type __poly1305_init_avx,\@abi-omnipotent
+ .align 32
+ __poly1305_init_avx:
++ push %rbp
++ mov %rsp,%rbp
+ mov $r0,$h0
+ mov $r1,$h1
+ xor $h2,$h2
+@@ -507,12 +597,13 @@ __poly1305_init_avx:
+ mov $d1#d,`16*8+8-64`($ctx)
+
+ lea -48-64($ctx),$ctx # size [de-]optimization
++ pop %rbp
+ ret
+ .size __poly1305_init_avx,.-__poly1305_init_avx
++___
+
+-.type poly1305_blocks_avx,\@function,4
+-.align 32
+-poly1305_blocks_avx:
++&declare_function("poly1305_blocks_avx", 32, 4);
++$code.=<<___;
+ .cfi_startproc
+ mov 20($ctx),%r8d # is_base2_26
+ cmp \$128,$len
+@@ -532,10 +623,11 @@ poly1305_blocks_avx:
+ test \$31,$len
+ jz .Leven_avx
+
+- push %rbx
+-.cfi_push %rbx
+ push %rbp
+ .cfi_push %rbp
++ mov %rsp,%rbp
++ push %rbx
++.cfi_push %rbx
+ push %r12
+ .cfi_push %r12
+ push %r13
+@@ -645,20 +737,18 @@ poly1305_blocks_avx:
+ mov $h2#d,16($ctx)
+ .align 16
+ .Ldone_avx:
+- mov 0(%rsp),%r15
++ pop %r15
+ .cfi_restore %r15
+- mov 8(%rsp),%r14
++ pop %r14
+ .cfi_restore %r14
+- mov 16(%rsp),%r13
++ pop %r13
+ .cfi_restore %r13
+- mov 24(%rsp),%r12
++ pop %r12
+ .cfi_restore %r12
+- mov 32(%rsp),%rbp
+-.cfi_restore %rbp
+- mov 40(%rsp),%rbx
++ pop %rbx
+ .cfi_restore %rbx
+- lea 48(%rsp),%rsp
+-.cfi_adjust_cfa_offset -48
++ pop %rbp
++.cfi_restore %rbp
+ .Lno_data_avx:
+ .Lblocks_avx_epilogue:
+ ret
+@@ -667,10 +757,11 @@ poly1305_blocks_avx:
+ .align 32
+ .Lbase2_64_avx:
+ .cfi_startproc
+- push %rbx
+-.cfi_push %rbx
+ push %rbp
+ .cfi_push %rbp
++ mov %rsp,%rbp
++ push %rbx
++.cfi_push %rbx
+ push %r12
+ .cfi_push %r12
+ push %r13
+@@ -736,22 +827,18 @@ poly1305_blocks_avx:
+
+ .Lproceed_avx:
+ mov %r15,$len
+-
+- mov 0(%rsp),%r15
++ pop %r15
+ .cfi_restore %r15
+- mov 8(%rsp),%r14
++ pop %r14
+ .cfi_restore %r14
+- mov 16(%rsp),%r13
++ pop %r13
+ .cfi_restore %r13
+- mov 24(%rsp),%r12
++ pop %r12
+ .cfi_restore %r12
+- mov 32(%rsp),%rbp
+-.cfi_restore %rbp
+- mov 40(%rsp),%rbx
++ pop %rbx
+ .cfi_restore %rbx
+- lea 48(%rsp),%rax
+- lea 48(%rsp),%rsp
+-.cfi_adjust_cfa_offset -48
++ pop %rbp
++.cfi_restore %rbp
+ .Lbase2_64_avx_epilogue:
+ jmp .Ldo_avx
+ .cfi_endproc
+@@ -768,8 +855,11 @@ poly1305_blocks_avx:
+ .Ldo_avx:
+ ___
+ $code.=<<___ if (!$win64);
++ lea 8(%rsp),%r10
++.cfi_def_cfa_register %r10
++ and \$-32,%rsp
++ sub \$-8,%rsp
+ lea -0x58(%rsp),%r11
+-.cfi_def_cfa %r11,0x60
+ sub \$0x178,%rsp
+ ___
+ $code.=<<___ if ($win64);
+@@ -1361,18 +1451,18 @@ $code.=<<___ if ($win64);
+ .Ldo_avx_epilogue:
+ ___
+ $code.=<<___ if (!$win64);
+- lea 0x58(%r11),%rsp
+-.cfi_def_cfa %rsp,8
++ lea -8(%r10),%rsp
++.cfi_def_cfa_register %rsp
+ ___
+ $code.=<<___;
+ vzeroupper
+ ret
+ .cfi_endproc
+-.size poly1305_blocks_avx,.-poly1305_blocks_avx
++___
++&end_function("poly1305_blocks_avx");
+
+-.type poly1305_emit_avx,\@function,3
+-.align 32
+-poly1305_emit_avx:
++&declare_function("poly1305_emit_avx", 32, 3);
++$code.=<<___;
+ cmpl \$0,20($ctx) # is_base2_26?
+ je .Lemit
+
+@@ -1423,41 +1513,51 @@ poly1305_emit_avx:
+ mov %rcx,8($mac)
+
+ ret
+-.size poly1305_emit_avx,.-poly1305_emit_avx
+ ___
++&end_function("poly1305_emit_avx");
++
++if ($kernel) {
++ $code .= "#endif\n";
++}
+
+ if ($avx>1) {
++
++if ($kernel) {
++ $code .= "#ifdef CONFIG_AS_AVX2\n";
++}
++
+ my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
+ map("%ymm$_",(0..15));
+ my $S4=$MASK;
+
++sub poly1305_blocks_avxN {
++ my ($avx512) = @_;
++ my $suffix = $avx512 ? "_avx512" : "";
+ $code.=<<___;
+-.type poly1305_blocks_avx2,\@function,4
+-.align 32
+-poly1305_blocks_avx2:
+ .cfi_startproc
+ mov 20($ctx),%r8d # is_base2_26
+ cmp \$128,$len
+- jae .Lblocks_avx2
++ jae .Lblocks_avx2$suffix
+ test %r8d,%r8d
+ jz .Lblocks
+
+-.Lblocks_avx2:
++.Lblocks_avx2$suffix:
+ and \$-16,$len
+- jz .Lno_data_avx2
++ jz .Lno_data_avx2$suffix
+
+ vzeroupper
+
+ test %r8d,%r8d
+- jz .Lbase2_64_avx2
++ jz .Lbase2_64_avx2$suffix
+
+ test \$63,$len
+- jz .Leven_avx2
++ jz .Leven_avx2$suffix
+
+- push %rbx
+-.cfi_push %rbx
+ push %rbp
+ .cfi_push %rbp
++ mov %rsp,%rbp
++ push %rbx
++.cfi_push %rbx
+ push %r12
+ .cfi_push %r12
+ push %r13
+@@ -1466,7 +1566,7 @@ poly1305_blocks_avx2:
+ .cfi_push %r14
+ push %r15
+ .cfi_push %r15
+-.Lblocks_avx2_body:
++.Lblocks_avx2_body$suffix:
+
+ mov $len,%r15 # reassign $len
+
+@@ -1513,7 +1613,7 @@ poly1305_blocks_avx2:
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+-.Lbase2_26_pre_avx2:
++.Lbase2_26_pre_avx2$suffix:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+@@ -1524,10 +1624,10 @@ poly1305_blocks_avx2:
+ mov $r1,%rax
+
+ test \$63,%r15
+- jnz .Lbase2_26_pre_avx2
++ jnz .Lbase2_26_pre_avx2$suffix
+
+ test $padbit,$padbit # if $padbit is zero,
+- jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
++ jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
+
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+@@ -1548,57 +1648,56 @@ poly1305_blocks_avx2:
+ or $r1,$h2 # h[4]
+
+ test %r15,%r15
+- jz .Lstore_base2_26_avx2
++ jz .Lstore_base2_26_avx2$suffix
+
+ vmovd %rax#d,%x#$H0
+ vmovd %rdx#d,%x#$H1
+ vmovd $h0#d,%x#$H2
+ vmovd $h1#d,%x#$H3
+ vmovd $h2#d,%x#$H4
+- jmp .Lproceed_avx2
++ jmp .Lproceed_avx2$suffix
+
+ .align 32
+-.Lstore_base2_64_avx2:
++.Lstore_base2_64_avx2$suffix:
+ mov $h0,0($ctx)
+ mov $h1,8($ctx)
+ mov $h2,16($ctx) # note that is_base2_26 is zeroed
+- jmp .Ldone_avx2
++ jmp .Ldone_avx2$suffix
+
+ .align 16
+-.Lstore_base2_26_avx2:
++.Lstore_base2_26_avx2$suffix:
+ mov %rax#d,0($ctx) # store hash value base 2^26
+ mov %rdx#d,4($ctx)
+ mov $h0#d,8($ctx)
+ mov $h1#d,12($ctx)
+ mov $h2#d,16($ctx)
+ .align 16
+-.Ldone_avx2:
+- mov 0(%rsp),%r15
++.Ldone_avx2$suffix:
++ pop %r15
+ .cfi_restore %r15
+- mov 8(%rsp),%r14
++ pop %r14
+ .cfi_restore %r14
+- mov 16(%rsp),%r13
++ pop %r13
+ .cfi_restore %r13
+- mov 24(%rsp),%r12
++ pop %r12
+ .cfi_restore %r12
+- mov 32(%rsp),%rbp
+-.cfi_restore %rbp
+- mov 40(%rsp),%rbx
++ pop %rbx
+ .cfi_restore %rbx
+- lea 48(%rsp),%rsp
+-.cfi_adjust_cfa_offset -48
+-.Lno_data_avx2:
+-.Lblocks_avx2_epilogue:
++ pop %rbp
++.cfi_restore %rbp
++.Lno_data_avx2$suffix:
++.Lblocks_avx2_epilogue$suffix:
+ ret
+ .cfi_endproc
+
+ .align 32
+-.Lbase2_64_avx2:
++.Lbase2_64_avx2$suffix:
+ .cfi_startproc
+- push %rbx
+-.cfi_push %rbx
+ push %rbp
+ .cfi_push %rbp
++ mov %rsp,%rbp
++ push %rbx
++.cfi_push %rbx
+ push %r12
+ .cfi_push %r12
+ push %r13
+@@ -1607,7 +1706,7 @@ poly1305_blocks_avx2:
+ .cfi_push %r14
+ push %r15
+ .cfi_push %r15
+-.Lbase2_64_avx2_body:
++.Lbase2_64_avx2_body$suffix:
+
+ mov $len,%r15 # reassign $len
+
+@@ -1624,9 +1723,9 @@ poly1305_blocks_avx2:
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ test \$63,$len
+- jz .Linit_avx2
++ jz .Linit_avx2$suffix
+
+-.Lbase2_64_pre_avx2:
++.Lbase2_64_pre_avx2$suffix:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+@@ -1637,9 +1736,9 @@ poly1305_blocks_avx2:
+ mov $r1,%rax
+
+ test \$63,%r15
+- jnz .Lbase2_64_pre_avx2
++ jnz .Lbase2_64_pre_avx2$suffix
+
+-.Linit_avx2:
++.Linit_avx2$suffix:
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+@@ -1667,69 +1766,77 @@ poly1305_blocks_avx2:
+
+ call __poly1305_init_avx
+
+-.Lproceed_avx2:
++.Lproceed_avx2$suffix:
+ mov %r15,$len # restore $len
+- mov OPENSSL_ia32cap_P+8(%rip),%r10d
++___
++$code.=<<___ if (!$kernel);
++ mov OPENSSL_ia32cap_P+8(%rip),%r9d
+ mov \$`(1<<31|1<<30|1<<16)`,%r11d
+-
+- mov 0(%rsp),%r15
++___
++$code.=<<___;
++ pop %r15
+ .cfi_restore %r15
+- mov 8(%rsp),%r14
++ pop %r14
+ .cfi_restore %r14
+- mov 16(%rsp),%r13
++ pop %r13
+ .cfi_restore %r13
+- mov 24(%rsp),%r12
++ pop %r12
+ .cfi_restore %r12
+- mov 32(%rsp),%rbp
+-.cfi_restore %rbp
+- mov 40(%rsp),%rbx
++ pop %rbx
+ .cfi_restore %rbx
+- lea 48(%rsp),%rax
+- lea 48(%rsp),%rsp
+-.cfi_adjust_cfa_offset -48
+-.Lbase2_64_avx2_epilogue:
+- jmp .Ldo_avx2
++ pop %rbp
++.cfi_restore %rbp
++.Lbase2_64_avx2_epilogue$suffix:
++ jmp .Ldo_avx2$suffix
+ .cfi_endproc
+
+ .align 32
+-.Leven_avx2:
++.Leven_avx2$suffix:
+ .cfi_startproc
+- mov OPENSSL_ia32cap_P+8(%rip),%r10d
++___
++$code.=<<___ if (!$kernel);
++ mov OPENSSL_ia32cap_P+8(%rip),%r9d
++___
++$code.=<<___;
+ vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
+ vmovd 4*1($ctx),%x#$H1
+ vmovd 4*2($ctx),%x#$H2
+ vmovd 4*3($ctx),%x#$H3
+ vmovd 4*4($ctx),%x#$H4
+
+-.Ldo_avx2:
++.Ldo_avx2$suffix:
+ ___
+-$code.=<<___ if ($avx>2);
++$code.=<<___ if (!$kernel && $avx>2);
+ cmp \$512,$len
+ jb .Lskip_avx512
+- and %r11d,%r10d
+- test \$`1<<16`,%r10d # check for AVX512F
++ and %r11d,%r9d
++ test \$`1<<16`,%r9d # check for AVX512F
+ jnz .Lblocks_avx512
+-.Lskip_avx512:
++.Lskip_avx512$suffix:
++___
++$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
++ cmp \$512,$len
++ jae .Lblocks_avx512
+ ___
+ $code.=<<___ if (!$win64);
+- lea -8(%rsp),%r11
+-.cfi_def_cfa %r11,16
++ lea 8(%rsp),%r10
++.cfi_def_cfa_register %r10
+ sub \$0x128,%rsp
+ ___
+ $code.=<<___ if ($win64);
+- lea -0xf8(%rsp),%r11
++ lea 8(%rsp),%r10
+ sub \$0x1c8,%rsp
+- vmovdqa %xmm6,0x50(%r11)
+- vmovdqa %xmm7,0x60(%r11)
+- vmovdqa %xmm8,0x70(%r11)
+- vmovdqa %xmm9,0x80(%r11)
+- vmovdqa %xmm10,0x90(%r11)
+- vmovdqa %xmm11,0xa0(%r11)
+- vmovdqa %xmm12,0xb0(%r11)
+- vmovdqa %xmm13,0xc0(%r11)
+- vmovdqa %xmm14,0xd0(%r11)
+- vmovdqa %xmm15,0xe0(%r11)
+-.Ldo_avx2_body:
++ vmovdqa %xmm6,-0xb0(%r10)
++ vmovdqa %xmm7,-0xa0(%r10)
++ vmovdqa %xmm8,-0x90(%r10)
++ vmovdqa %xmm9,-0x80(%r10)
++ vmovdqa %xmm10,-0x70(%r10)
++ vmovdqa %xmm11,-0x60(%r10)
++ vmovdqa %xmm12,-0x50(%r10)
++ vmovdqa %xmm13,-0x40(%r10)
++ vmovdqa %xmm14,-0x30(%r10)
++ vmovdqa %xmm15,-0x20(%r10)
++.Ldo_avx2_body$suffix:
+ ___
+ $code.=<<___;
+ lea .Lconst(%rip),%rcx
+@@ -1794,11 +1901,11 @@ $code.=<<___;
+
+ vpaddq $H2,$T2,$H2 # accumulate input
+ sub \$64,$len
+- jz .Ltail_avx2
+- jmp .Loop_avx2
++ jz .Ltail_avx2$suffix
++ jmp .Loop_avx2$suffix
+
+ .align 32
+-.Loop_avx2:
++.Loop_avx2$suffix:
+ ################################################################
+ # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
+ # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
+@@ -1946,10 +2053,10 @@ $code.=<<___;
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ sub \$64,$len
+- jnz .Loop_avx2
++ jnz .Loop_avx2$suffix
+
+ .byte 0x66,0x90
+-.Ltail_avx2:
++.Ltail_avx2$suffix:
+ ################################################################
+ # while above multiplications were by r^4 in all lanes, in last
+ # iteration we multiply least significant lane by r^4 and most
+@@ -2087,37 +2194,29 @@ $code.=<<___;
+ vmovd %x#$H4,`4*4-48-64`($ctx)
+ ___
+ $code.=<<___ if ($win64);
+- vmovdqa 0x50(%r11),%xmm6
+- vmovdqa 0x60(%r11),%xmm7
+- vmovdqa 0x70(%r11),%xmm8
+- vmovdqa 0x80(%r11),%xmm9
+- vmovdqa 0x90(%r11),%xmm10
+- vmovdqa 0xa0(%r11),%xmm11
+- vmovdqa 0xb0(%r11),%xmm12
+- vmovdqa 0xc0(%r11),%xmm13
+- vmovdqa 0xd0(%r11),%xmm14
+- vmovdqa 0xe0(%r11),%xmm15
+- lea 0xf8(%r11),%rsp
+-.Ldo_avx2_epilogue:
++ vmovdqa -0xb0(%r10),%xmm6
++ vmovdqa -0xa0(%r10),%xmm7
++ vmovdqa -0x90(%r10),%xmm8
++ vmovdqa -0x80(%r10),%xmm9
++ vmovdqa -0x70(%r10),%xmm10
++ vmovdqa -0x60(%r10),%xmm11
++ vmovdqa -0x50(%r10),%xmm12
++ vmovdqa -0x40(%r10),%xmm13
++ vmovdqa -0x30(%r10),%xmm14
++ vmovdqa -0x20(%r10),%xmm15
++ lea -8(%r10),%rsp
++.Ldo_avx2_epilogue$suffix:
+ ___
+ $code.=<<___ if (!$win64);
+- lea 8(%r11),%rsp
+-.cfi_def_cfa %rsp,8
++ lea -8(%r10),%rsp
++.cfi_def_cfa_register %rsp
+ ___
+ $code.=<<___;
+ vzeroupper
+ ret
+ .cfi_endproc
+-.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
+ ___
+-#######################################################################
+-if ($avx>2) {
+-# On entry we have input length divisible by 64. But since inner loop
+-# processes 128 bytes per iteration, cases when length is not divisible
+-# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
+-# reason stack layout is kept identical to poly1305_blocks_avx2. If not
+-# for this tail, we wouldn't have to even allocate stack frame...
+-
++if($avx > 2 && $avx512) {
+ my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
+ my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
+ my $PADBIT="%zmm30";
+@@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
+ map(s/%y/%z/,($MASK));
+
+ $code.=<<___;
+-.type poly1305_blocks_avx512,\@function,4
+-.align 32
+-poly1305_blocks_avx512:
+ .cfi_startproc
+ .Lblocks_avx512:
+ mov \$15,%eax
+ kmovw %eax,%k2
+ ___
+ $code.=<<___ if (!$win64);
+- lea -8(%rsp),%r11
+-.cfi_def_cfa %r11,16
++ lea 8(%rsp),%r10
++.cfi_def_cfa_register %r10
+ sub \$0x128,%rsp
+ ___
+ $code.=<<___ if ($win64);
+- lea -0xf8(%rsp),%r11
++ lea 8(%rsp),%r10
+ sub \$0x1c8,%rsp
+- vmovdqa %xmm6,0x50(%r11)
+- vmovdqa %xmm7,0x60(%r11)
+- vmovdqa %xmm8,0x70(%r11)
+- vmovdqa %xmm9,0x80(%r11)
+- vmovdqa %xmm10,0x90(%r11)
+- vmovdqa %xmm11,0xa0(%r11)
+- vmovdqa %xmm12,0xb0(%r11)
+- vmovdqa %xmm13,0xc0(%r11)
+- vmovdqa %xmm14,0xd0(%r11)
+- vmovdqa %xmm15,0xe0(%r11)
++ vmovdqa %xmm6,-0xb0(%r10)
++ vmovdqa %xmm7,-0xa0(%r10)
++ vmovdqa %xmm8,-0x90(%r10)
++ vmovdqa %xmm9,-0x80(%r10)
++ vmovdqa %xmm10,-0x70(%r10)
++ vmovdqa %xmm11,-0x60(%r10)
++ vmovdqa %xmm12,-0x50(%r10)
++ vmovdqa %xmm13,-0x40(%r10)
++ vmovdqa %xmm14,-0x30(%r10)
++ vmovdqa %xmm15,-0x20(%r10)
+ .Ldo_avx512_body:
+ ___
+ $code.=<<___;
+@@ -2679,7 +2775,7 @@ $code.=<<___;
+
+ lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
+ add \$64,$len
+- jnz .Ltail_avx2
++ jnz .Ltail_avx2$suffix
+
+ vpsubq $T2,$H2,$H2 # undo input accumulation
+ vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
+@@ -2690,29 +2786,61 @@ $code.=<<___;
+ vzeroall
+ ___
+ $code.=<<___ if ($win64);
+- movdqa 0x50(%r11),%xmm6
+- movdqa 0x60(%r11),%xmm7
+- movdqa 0x70(%r11),%xmm8
+- movdqa 0x80(%r11),%xmm9
+- movdqa 0x90(%r11),%xmm10
+- movdqa 0xa0(%r11),%xmm11
+- movdqa 0xb0(%r11),%xmm12
+- movdqa 0xc0(%r11),%xmm13
+- movdqa 0xd0(%r11),%xmm14
+- movdqa 0xe0(%r11),%xmm15
+- lea 0xf8(%r11),%rsp
++ movdqa -0xb0(%r10),%xmm6
++ movdqa -0xa0(%r10),%xmm7
++ movdqa -0x90(%r10),%xmm8
++ movdqa -0x80(%r10),%xmm9
++ movdqa -0x70(%r10),%xmm10
++ movdqa -0x60(%r10),%xmm11
++ movdqa -0x50(%r10),%xmm12
++ movdqa -0x40(%r10),%xmm13
++ movdqa -0x30(%r10),%xmm14
++ movdqa -0x20(%r10),%xmm15
++ lea -8(%r10),%rsp
+ .Ldo_avx512_epilogue:
+ ___
+ $code.=<<___ if (!$win64);
+- lea 8(%r11),%rsp
+-.cfi_def_cfa %rsp,8
++ lea -8(%r10),%rsp
++.cfi_def_cfa_register %rsp
+ ___
+ $code.=<<___;
+ ret
+ .cfi_endproc
+-.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
+ ___
+-if ($avx>3) {
++
++}
++
++}
++
++&declare_function("poly1305_blocks_avx2", 32, 4);
++poly1305_blocks_avxN(0);
++&end_function("poly1305_blocks_avx2");
++
++if($kernel) {
++ $code .= "#endif\n";
++}
++
++#######################################################################
++if ($avx>2) {
++# On entry we have input length divisible by 64. But since inner loop
++# processes 128 bytes per iteration, cases when length is not divisible
++# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
++# reason stack layout is kept identical to poly1305_blocks_avx2. If not
++# for this tail, we wouldn't have to even allocate stack frame...
++
++if($kernel) {
++ $code .= "#ifdef CONFIG_AS_AVX512\n";
++}
++
++&declare_function("poly1305_blocks_avx512", 32, 4);
++poly1305_blocks_avxN(1);
++&end_function("poly1305_blocks_avx512");
++
++if ($kernel) {
++ $code .= "#endif\n";
++}
++
++if (!$kernel && $avx>3) {
+ ########################################################################
+ # VPMADD52 version using 2^44 radix.
+ #
+@@ -3753,45 +3881,9 @@ poly1305_emit_base2_44:
+ .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
+ ___
+ } } }
+-$code.=<<___;
+-.align 64
+-.Lconst:
+-.Lmask24:
+-.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+-.L129:
+-.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
+-.Lmask26:
+-.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+-.Lpermd_avx2:
+-.long 2,2,2,3,2,0,2,1
+-.Lpermd_avx512:
+-.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+-
+-.L2_44_inp_permd:
+-.long 0,1,1,2,2,3,7,7
+-.L2_44_inp_shift:
+-.quad 0,12,24,64
+-.L2_44_mask:
+-.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+-.L2_44_shift_rgt:
+-.quad 44,44,42,64
+-.L2_44_shift_lft:
+-.quad 8,8,10,64
+-
+-.align 64
+-.Lx_mask44:
+-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+-.Lx_mask42:
+-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+-___
+ }
+-$code.=<<___;
+-.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+-.align 16
+-___
+
++if (!$kernel)
+ { # chacha20-poly1305 helpers
+ my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+@@ -4038,17 +4130,17 @@ avx_handler:
+
+ .section .pdata
+ .align 4
+- .rva .LSEH_begin_poly1305_init
+- .rva .LSEH_end_poly1305_init
+- .rva .LSEH_info_poly1305_init
+-
+- .rva .LSEH_begin_poly1305_blocks
+- .rva .LSEH_end_poly1305_blocks
+- .rva .LSEH_info_poly1305_blocks
+-
+- .rva .LSEH_begin_poly1305_emit
+- .rva .LSEH_end_poly1305_emit
+- .rva .LSEH_info_poly1305_emit
++ .rva .LSEH_begin_poly1305_init_x86_64
++ .rva .LSEH_end_poly1305_init_x86_64
++ .rva .LSEH_info_poly1305_init_x86_64
++
++ .rva .LSEH_begin_poly1305_blocks_x86_64
++ .rva .LSEH_end_poly1305_blocks_x86_64
++ .rva .LSEH_info_poly1305_blocks_x86_64
++
++ .rva .LSEH_begin_poly1305_emit_x86_64
++ .rva .LSEH_end_poly1305_emit_x86_64
++ .rva .LSEH_info_poly1305_emit_x86_64
+ ___
+ $code.=<<___ if ($avx);
+ .rva .LSEH_begin_poly1305_blocks_avx
+@@ -4088,20 +4180,20 @@ ___
+ $code.=<<___;
+ .section .xdata
+ .align 8
+-.LSEH_info_poly1305_init:
++.LSEH_info_poly1305_init_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+- .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
++ .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
+
+-.LSEH_info_poly1305_blocks:
++.LSEH_info_poly1305_blocks_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_body,.Lblocks_epilogue
+
+-.LSEH_info_poly1305_emit:
++.LSEH_info_poly1305_emit_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+- .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
++ .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
+ ___
+ $code.=<<___ if ($avx);
+ .LSEH_info_poly1305_blocks_avx_1:
+@@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2);
+ ___
+ }
+
++open SELF,$0;
++while(<SELF>) {
++ next if (/^#!/);
++ last if (!s/^#/\/\// and !/^$/);
++ print;
++}
++close SELF;
++
+ foreach (split('\n',$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+ s/%r([a-z]+)#d/%e$1/g;
+ s/%r([0-9]+)#d/%r$1d/g;
+ s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
+
++ if ($kernel) {
++ s/(^\.type.*),[0-9]+$/\1/;
++ s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
++ next if /^\.cfi.*/;
++ }
++
+ print $_,"\n";
+ }
+ close STDOUT;
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -1,8 +1,6 @@
+-// SPDX-License-Identifier: GPL-2.0-or-later
++// SPDX-License-Identifier: GPL-2.0 OR MIT
+ /*
+- * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
+- *
+- * Copyright (C) 2015 Martin Willi
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+ #include <crypto/algapi.h>
+@@ -13,279 +11,170 @@
+ #include <linux/jump_label.h>
+ #include <linux/kernel.h>
+ #include <linux/module.h>
++#include <asm/intel-family.h>
+ #include <asm/simd.h>
+
+-asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
+- const u32 *r, unsigned int blocks);
+-asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
+- unsigned int blocks, const u32 *u);
+-asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
+- unsigned int blocks, const u32 *u);
++asmlinkage void poly1305_init_x86_64(void *ctx,
++ const u8 key[POLY1305_KEY_SIZE]);
++asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
++ const size_t len, const u32 padbit);
++asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
++ const u32 nonce[4]);
++asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
++ const u32 nonce[4]);
++asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
++ const u32 padbit);
++asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
++ const u32 padbit);
++asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
++ const size_t len, const u32 padbit);
+
+-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
+ static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
+
+-static inline u64 mlt(u64 a, u64 b)
+-{
+- return a * b;
+-}
+-
+-static inline u32 sr(u64 v, u_char n)
+-{
+- return v >> n;
+-}
+-
+-static inline u32 and(u32 v, u32 mask)
+-{
+- return v & mask;
+-}
+-
+-static void poly1305_simd_mult(u32 *a, const u32 *b)
+-{
+- u8 m[POLY1305_BLOCK_SIZE];
+-
+- memset(m, 0, sizeof(m));
+- /* The poly1305 block function adds a hi-bit to the accumulator which
+- * we don't need for key multiplication; compensate for it. */
+- a[4] -= 1 << 24;
+- poly1305_block_sse2(a, m, b, 1);
+-}
+-
+-static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
+-{
+- /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+- key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
+- key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
+- key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
+- key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
+- key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
+-}
++struct poly1305_arch_internal {
++ union {
++ struct {
++ u32 h[5];
++ u32 is_base2_26;
++ };
++ u64 hs[3];
++ };
++ u64 r[2];
++ u64 pad;
++ struct { u32 r2, r1, r4, r3; } rn[9];
++};
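
For reference, a standalone sketch (an assumption for illustration, not code from the patch) of how the two views of the accumulator union above overlap: the five 26-bit limbs plus the is_base2_26 flag occupy exactly the same 24 bytes as the three 64-bit scalar limbs, with is_base2_26 aliasing the high half of hs[2].

#include <stdint.h>
#include <assert.h>

struct demo_internal {			/* mirrors the accumulator union of poly1305_arch_internal */
	union {
		struct {
			uint32_t h[5];
			uint32_t is_base2_26;
		};
		uint64_t hs[3];
	};
};

int main(void)
{
	/* Both views cover the same 24 bytes, so switching representations is
	 * purely a matter of rewriting the limbs in place. */
	static_assert(sizeof(struct demo_internal) == 3 * sizeof(uint64_t),
		      "26-bit and 64-bit views occupy the same storage");
	return 0;
}
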
+
+-static void poly1305_integer_blocks(struct poly1305_state *state,
+- const struct poly1305_key *key,
+- const void *src,
+- unsigned int nblocks, u32 hibit)
++/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
++ * the unfortunate situation of using AVX and then having to go back to scalar
++ * -- because the user is silly and has called the update function from two
++ * separate contexts -- then we need to convert back to the original base before
++ * proceeding. It is possible to reason that the initial reduction below is
++ * sufficient given the implementation invariants. However, for the avoidance of
++ * doubt and because this is not performance critical, we do the full reduction
++ * anyway. Z3 proof of the function below: https://xn--4db.cc/ltPtHCKN/py
++ */
++static void convert_to_base2_64(void *ctx)
+ {
+- u32 r0, r1, r2, r3, r4;
+- u32 s1, s2, s3, s4;
+- u32 h0, h1, h2, h3, h4;
+- u64 d0, d1, d2, d3, d4;
++ struct poly1305_arch_internal *state = ctx;
++ u32 cy;
+
+- if (!nblocks)
++ if (!state->is_base2_26)
+ return;
+
+- r0 = key->r[0];
+- r1 = key->r[1];
+- r2 = key->r[2];
+- r3 = key->r[3];
+- r4 = key->r[4];
+-
+- s1 = r1 * 5;
+- s2 = r2 * 5;
+- s3 = r3 * 5;
+- s4 = r4 * 5;
+-
+- h0 = state->h[0];
+- h1 = state->h[1];
+- h2 = state->h[2];
+- h3 = state->h[3];
+- h4 = state->h[4];
+-
+- do {
+- /* h += m[i] */
+- h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
+- h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
+- h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
+- h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
+- h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
+-
+- /* h *= r */
+- d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
+- mlt(h3, s2) + mlt(h4, s1);
+- d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
+- mlt(h3, s3) + mlt(h4, s2);
+- d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
+- mlt(h3, s4) + mlt(h4, s3);
+- d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
+- mlt(h3, r0) + mlt(h4, s4);
+- d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
+- mlt(h3, r1) + mlt(h4, r0);
+-
+- /* (partial) h %= p */
+- d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
+- d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
+- d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
+- d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
+- h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
+- h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
+-
+- src += POLY1305_BLOCK_SIZE;
+- } while (--nblocks);
+-
+- state->h[0] = h0;
+- state->h[1] = h1;
+- state->h[2] = h2;
+- state->h[3] = h3;
+- state->h[4] = h4;
+-}
+-
+-static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
+-{
+- u32 h0, h1, h2, h3, h4;
+- u32 g0, g1, g2, g3, g4;
+- u32 mask;
+-
+- /* fully carry h */
+- h0 = state->h[0];
+- h1 = state->h[1];
+- h2 = state->h[2];
+- h3 = state->h[3];
+- h4 = state->h[4];
+-
+- h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
+- h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
+- h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
+- h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
+- h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
+-
+- /* compute h + -p */
+- g0 = h0 + 5;
+- g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
+- g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
+- g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
+- g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
+-
+- /* select h if h < p, or h + -p if h >= p */
+- mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
+- g0 &= mask;
+- g1 &= mask;
+- g2 &= mask;
+- g3 &= mask;
+- g4 &= mask;
+- mask = ~mask;
+- h0 = (h0 & mask) | g0;
+- h1 = (h1 & mask) | g1;
+- h2 = (h2 & mask) | g2;
+- h3 = (h3 & mask) | g3;
+- h4 = (h4 & mask) | g4;
+-
+- /* h = h % (2^128) */
+- put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
+- put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
+- put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
+- put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
+-}
+-
+-void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
+-{
+- poly1305_integer_setkey(desc->opaque_r, key);
+- desc->s[0] = get_unaligned_le32(key + 16);
+- desc->s[1] = get_unaligned_le32(key + 20);
+- desc->s[2] = get_unaligned_le32(key + 24);
+- desc->s[3] = get_unaligned_le32(key + 28);
+- poly1305_core_init(&desc->h);
+- desc->buflen = 0;
+- desc->sset = true;
+- desc->rset = 1;
+-}
+-EXPORT_SYMBOL_GPL(poly1305_init_arch);
+-
+-static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+- const u8 *src, unsigned int srclen)
+-{
+- if (!dctx->sset) {
+- if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
+- poly1305_integer_setkey(dctx->r, src);
+- src += POLY1305_BLOCK_SIZE;
+- srclen -= POLY1305_BLOCK_SIZE;
+- dctx->rset = 1;
+- }
+- if (srclen >= POLY1305_BLOCK_SIZE) {
+- dctx->s[0] = get_unaligned_le32(src + 0);
+- dctx->s[1] = get_unaligned_le32(src + 4);
+- dctx->s[2] = get_unaligned_le32(src + 8);
+- dctx->s[3] = get_unaligned_le32(src + 12);
+- src += POLY1305_BLOCK_SIZE;
+- srclen -= POLY1305_BLOCK_SIZE;
+- dctx->sset = true;
+- }
++ cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
++ cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
++ cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
++ cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
++ state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
++ state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
++ state->hs[2] = state->h[4] >> 24;
++#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
++ cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
++ state->hs[2] &= 3;
++ state->hs[0] += cy;
++ state->hs[1] += (cy = ULT(state->hs[0], cy));
++ state->hs[2] += ULT(state->hs[1], cy);
++#undef ULT
++ state->is_base2_26 = 0;
++}
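
Editorial note (not part of the patch): the ULT() macro used above is a branchless unsigned "a < b" test; the expression isolates the borrow bit of a - b, which lets the carry propagation run without data-dependent branches. A small userspace check of the identity, specialized to 64-bit operands:

#include <stdint.h>
#include <assert.h>

static uint64_t ult(uint64_t a, uint64_t b)
{
	/* Same expression as ULT() above: the top bit of
	 * a ^ ((a ^ b) | ((a - b) ^ b)) is 1 exactly when a < b (unsigned). */
	return (a ^ ((a ^ b) | ((a - b) ^ b))) >> 63;
}

int main(void)
{
	assert(ult(1, 2) == 1);
	assert(ult(2, 1) == 0);
	assert(ult(5, 5) == 0);
	assert(ult(0, UINT64_MAX) == 1);
	assert(ult(UINT64_MAX, 0) == 0);
	return 0;
}
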
++
++static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
++{
++ poly1305_init_x86_64(ctx, key);
++}
++
++static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
++ const u32 padbit)
++{
++ struct poly1305_arch_internal *state = ctx;
++
++ /* SIMD disables preemption, so relax after processing each page. */
++ BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
++ PAGE_SIZE % POLY1305_BLOCK_SIZE);
++
++ if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
++ (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
++ !crypto_simd_usable()) {
++ convert_to_base2_64(ctx);
++ poly1305_blocks_x86_64(ctx, inp, len, padbit);
++ return;
+ }
+- return srclen;
+-}
+
+-static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
+- const u8 *src, unsigned int srclen)
+-{
+- unsigned int datalen;
++ for (;;) {
++ const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+
+- if (unlikely(!dctx->sset)) {
+- datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+- src += srclen - datalen;
+- srclen = datalen;
+- }
+- if (srclen >= POLY1305_BLOCK_SIZE) {
+- poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
+- srclen / POLY1305_BLOCK_SIZE, 1);
+- srclen %= POLY1305_BLOCK_SIZE;
++ kernel_fpu_begin();
++ if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
++ poly1305_blocks_avx512(ctx, inp, bytes, padbit);
++ else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
++ poly1305_blocks_avx2(ctx, inp, bytes, padbit);
++ else
++ poly1305_blocks_avx(ctx, inp, bytes, padbit);
++ kernel_fpu_end();
++ len -= bytes;
++ if (!len)
++ break;
++ inp += bytes;
+ }
+- return srclen;
+ }
+
+-static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
+- const u8 *src, unsigned int srclen)
+-{
+- unsigned int blocks, datalen;
++static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
++ const u32 nonce[4])
++{
++ struct poly1305_arch_internal *state = ctx;
++
++ if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
++ !state->is_base2_26 || !crypto_simd_usable()) {
++ convert_to_base2_64(ctx);
++ poly1305_emit_x86_64(ctx, mac, nonce);
++ } else
++ poly1305_emit_avx(ctx, mac, nonce);
++}
++
++void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
++{
++ poly1305_simd_init(&dctx->h, key);
++ dctx->s[0] = get_unaligned_le32(&key[16]);
++ dctx->s[1] = get_unaligned_le32(&key[20]);
++ dctx->s[2] = get_unaligned_le32(&key[24]);
++ dctx->s[3] = get_unaligned_le32(&key[28]);
++ dctx->buflen = 0;
++ dctx->sset = true;
++}
++EXPORT_SYMBOL(poly1305_init_arch);
+
++static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
++ const u8 *inp, unsigned int len)
++{
++ unsigned int acc = 0;
+ if (unlikely(!dctx->sset)) {
+- datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+- src += srclen - datalen;
+- srclen = datalen;
+- }
+-
+- if (IS_ENABLED(CONFIG_AS_AVX2) &&
+- static_branch_likely(&poly1305_use_avx2) &&
+- srclen >= POLY1305_BLOCK_SIZE * 4) {
+- if (unlikely(dctx->rset < 4)) {
+- if (dctx->rset < 2) {
+- dctx->r[1] = dctx->r[0];
+- poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
+- }
+- dctx->r[2] = dctx->r[1];
+- poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r);
+- dctx->r[3] = dctx->r[2];
+- poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r);
+- dctx->rset = 4;
++ if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
++ poly1305_simd_init(&dctx->h, inp);
++ inp += POLY1305_BLOCK_SIZE;
++ len -= POLY1305_BLOCK_SIZE;
++ acc += POLY1305_BLOCK_SIZE;
++ dctx->rset = 1;
+ }
+- blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
+- poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks,
+- dctx->r[1].r);
+- src += POLY1305_BLOCK_SIZE * 4 * blocks;
+- srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
+- }
+-
+- if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
+- if (unlikely(dctx->rset < 2)) {
+- dctx->r[1] = dctx->r[0];
+- poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
+- dctx->rset = 2;
++ if (len >= POLY1305_BLOCK_SIZE) {
++ dctx->s[0] = get_unaligned_le32(&inp[0]);
++ dctx->s[1] = get_unaligned_le32(&inp[4]);
++ dctx->s[2] = get_unaligned_le32(&inp[8]);
++ dctx->s[3] = get_unaligned_le32(&inp[12]);
++ inp += POLY1305_BLOCK_SIZE;
++ len -= POLY1305_BLOCK_SIZE;
++ acc += POLY1305_BLOCK_SIZE;
++ dctx->sset = true;
+ }
+- blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
+- poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r,
+- blocks, dctx->r[1].r);
+- src += POLY1305_BLOCK_SIZE * 2 * blocks;
+- srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
+- }
+- if (srclen >= POLY1305_BLOCK_SIZE) {
+- poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1);
+- srclen -= POLY1305_BLOCK_SIZE;
+ }
+- return srclen;
++ return acc;
+ }
+
+ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
+ unsigned int srclen)
+ {
+- unsigned int bytes;
++ unsigned int bytes, used;
+
+ if (unlikely(dctx->buflen)) {
+ bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
+@@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly130
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+- if (static_branch_likely(&poly1305_use_simd) &&
+- likely(crypto_simd_usable())) {
+- kernel_fpu_begin();
+- poly1305_simd_blocks(dctx, dctx->buf,
+- POLY1305_BLOCK_SIZE);
+- kernel_fpu_end();
+- } else {
+- poly1305_scalar_blocks(dctx, dctx->buf,
+- POLY1305_BLOCK_SIZE);
+- }
++ if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
++ poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+- if (static_branch_likely(&poly1305_use_simd) &&
+- likely(crypto_simd_usable())) {
+- kernel_fpu_begin();
+- bytes = poly1305_simd_blocks(dctx, src, srclen);
+- kernel_fpu_end();
+- } else {
+- bytes = poly1305_scalar_blocks(dctx, src, srclen);
+- }
+- src += srclen - bytes;
+- srclen = bytes;
++ bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
++ srclen -= bytes;
++ used = crypto_poly1305_setdctxkey(dctx, src, bytes);
++ if (likely(bytes - used))
++ poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
++ src += bytes;
+ }
+
+ if (unlikely(srclen)) {
+@@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly130
+ }
+ EXPORT_SYMBOL(poly1305_update_arch);
+
+-void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
++void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+ {
+- __le32 digest[4];
+- u64 f = 0;
+-
+- if (unlikely(desc->buflen)) {
+- desc->buf[desc->buflen++] = 1;
+- memset(desc->buf + desc->buflen, 0,
+- POLY1305_BLOCK_SIZE - desc->buflen);
+- poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
++ if (unlikely(dctx->buflen)) {
++ dctx->buf[dctx->buflen++] = 1;
++ memset(dctx->buf + dctx->buflen, 0,
++ POLY1305_BLOCK_SIZE - dctx->buflen);
++ poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+ }
+
+- poly1305_integer_emit(&desc->h, digest);
+-
+- /* mac = (h + s) % (2^128) */
+- f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
+- put_unaligned_le32(f, dst + 0);
+- f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
+- put_unaligned_le32(f, dst + 4);
+- f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
+- put_unaligned_le32(f, dst + 8);
+- f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
+- put_unaligned_le32(f, dst + 12);
+-
+- *desc = (struct poly1305_desc_ctx){};
++ poly1305_simd_emit(&dctx->h, dst, dctx->s);
++ *dctx = (struct poly1305_desc_ctx){};
+ }
+ EXPORT_SYMBOL(poly1305_final_arch);
+
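
To show how the exported arch entry points above fit together, here is a hedged, kernel-context usage sketch; the key, msg and msg_len names are placeholders, and this is not code from the patch:

	/* Sketch: one-shot Poly1305 MAC over msg[0..msg_len) with a 32-byte key. */
	struct poly1305_desc_ctx ctx;
	u8 mac[POLY1305_DIGEST_SIZE];

	poly1305_init_arch(&ctx, key);            /* loads r and s from the key */
	poly1305_update_arch(&ctx, msg, msg_len); /* may be called repeatedly */
	poly1305_final_arch(&ctx, mac);           /* emits the 16-byte tag and wipes ctx */
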
+@@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct s
+ {
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+- poly1305_core_init(&dctx->h);
+- dctx->buflen = 0;
+- dctx->rset = 0;
+- dctx->sset = false;
+-
++ *dctx = (struct poly1305_desc_ctx){};
+ return 0;
+ }
+
+-static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
++static int crypto_poly1305_update(struct shash_desc *desc,
++ const u8 *src, unsigned int srclen)
+ {
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+- if (unlikely(!dctx->sset))
+- return -ENOKEY;
+-
+- poly1305_final_arch(dctx, dst);
++ poly1305_update_arch(dctx, src, srclen);
+ return 0;
+ }
+
+-static int poly1305_simd_update(struct shash_desc *desc,
+- const u8 *src, unsigned int srclen)
++static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+ {
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+- poly1305_update_arch(dctx, src, srclen);
++ if (unlikely(!dctx->sset))
++ return -ENOKEY;
++
++ poly1305_final_arch(dctx, dst);
+ return 0;
+ }
+
+ static struct shash_alg alg = {
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_poly1305_init,
+- .update = poly1305_simd_update,
++ .update = crypto_poly1305_update,
+ .final = crypto_poly1305_final,
+ .descsize = sizeof(struct poly1305_desc_ctx),
+ .base = {
+@@ -406,17 +265,19 @@ static struct shash_alg alg = {
+
+ static int __init poly1305_simd_mod_init(void)
+ {
+- if (!boot_cpu_has(X86_FEATURE_XMM2))
+- return 0;
+-
+- static_branch_enable(&poly1305_use_simd);
+-
+- if (IS_ENABLED(CONFIG_AS_AVX2) &&
+- boot_cpu_has(X86_FEATURE_AVX) &&
++ if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
++ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
++ static_branch_enable(&poly1305_use_avx);
++ if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ static_branch_enable(&poly1305_use_avx2);
+-
++ if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
++ boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
++ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
++ /* Skylake downclocks unacceptably when using zmm registers, but later generations are fast. */
++ boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
++ static_branch_enable(&poly1305_use_avx512);
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
+ }
+
+@@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init);
+ module_exit(poly1305_simd_mod_exit);
+
+ MODULE_LICENSE("GPL");
+-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
++MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
+ MODULE_DESCRIPTION("Poly1305 authenticator");
+ MODULE_ALIAS_CRYPTO("poly1305");
+ MODULE_ALIAS_CRYPTO("poly1305-simd");
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -90,7 +90,7 @@ config CRYPTO_LIB_DES
+ config CRYPTO_LIB_POLY1305_RSIZE
+ int
+ default 2 if MIPS
+- default 4 if X86_64
++ default 11 if X86_64
+ default 9 if ARM || ARM64
+ default 1
+