From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 5 Jan 2020 22:40:48 -0500 Subject: [PATCH] crypto: x86/poly1305 - wire up faster implementations for kernel commit d7d7b853566254648df59f7ea27ea05952a6cfa8 upstream. These x86_64 vectorized implementations support AVX, AVX-2, and AVX512F. The AVX-512F implementation is disabled on Skylake, due to throttling, but it is quite fast on >= Cannonlake. On the left is cycle counts on a Core i7 6700HQ using the AVX-2 codepath, comparing this implementation ("new") to the implementation in the current crypto api ("old"). On the right are benchmarks on a Xeon Gold 5120 using the AVX-512 codepath. The new implementation is faster on all benchmarks. AVX-2 AVX-512 --------- ----------- size old new size old new ---- ---- ---- ---- ---- ---- 0 70 68 0 74 70 16 92 90 16 96 92 32 134 104 32 136 106 48 172 120 48 184 124 64 218 136 64 218 138 80 254 158 80 260 160 96 298 174 96 300 176 112 342 192 112 342 194 128 388 212 128 384 212 144 428 228 144 420 226 160 466 246 160 464 248 176 510 264 176 504 264 192 550 282 192 544 282 208 594 302 208 582 300 224 628 316 224 624 318 240 676 334 240 662 338 256 716 354 256 708 358 272 764 374 272 748 372 288 802 352 288 788 358 304 420 366 304 422 370 320 428 360 320 432 364 336 484 378 336 486 380 352 426 384 352 434 390 368 478 400 368 480 408 384 488 394 384 490 398 400 542 408 400 542 412 416 486 416 416 492 426 432 534 430 432 538 436 448 544 422 448 546 432 464 600 438 464 600 448 480 540 448 480 548 456 496 594 464 496 594 476 512 602 456 512 606 470 528 656 476 528 656 480 544 600 480 544 606 498 560 650 494 560 652 512 576 664 490 576 662 508 592 714 508 592 716 522 608 656 514 608 664 538 624 708 532 624 710 552 640 716 524 640 720 516 656 770 536 656 772 526 672 716 548 672 722 544 688 770 562 688 768 556 704 774 552 704 778 556 720 826 568 720 832 568 736 768 574 736 780 584 752 822 592 752 826 600 768 830 584 768 836 560 784 884 602 784 888 572 800 828 610 800 838 588 816 884 628 816 884 604 832 888 618 832 894 598 848 942 632 848 946 612 864 884 644 864 896 628 880 936 660 880 942 644 896 948 652 896 952 608 912 1000 664 912 1004 616 928 942 676 928 954 634 944 994 690 944 1000 646 960 1002 680 960 1008 646 976 1054 694 976 1062 658 992 1002 706 992 1012 674 1008 1052 720 1008 1058 690 This commit wires in the prior implementation from Andy, and makes the following changes to be suitable for kernel land. - Some cosmetic and structural changes, like renaming labels to .Lname, constants, and other Linux conventions, as well as making the code easy for us to maintain moving forward. - CPU feature checking is done in C by the glue code. - We avoid jumping into the middle of functions, to appease objtool, and instead parameterize shared code. - We maintain frame pointers so that stack traces make sense. - We remove the dependency on the perl xlate code, which transforms the output into things that assemblers we don't care about use. Importantly, none of our changes affect the arithmetic or core code, but just involve the differing environment of kernel space. Signed-off-by: Jason A. Donenfeld Signed-off-by: Samuel Neves Co-developed-by: Samuel Neves Signed-off-by: Herbert Xu Signed-off-by: Jason A. Donenfeld --- arch/x86/crypto/.gitignore | 1 + arch/x86/crypto/Makefile | 11 +- arch/x86/crypto/poly1305-avx2-x86_64.S | 390 ---------- arch/x86/crypto/poly1305-sse2-x86_64.S | 590 --------------- arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 ++++++++++-------- arch/x86/crypto/poly1305_glue.c | 473 +++++------- lib/crypto/Kconfig | 2 +- 7 files changed, 572 insertions(+), 1577 deletions(-) create mode 100644 arch/x86/crypto/.gitignore delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S --- /dev/null +++ b/arch/x86/crypto/.gitignore @@ -0,0 +1 @@ +poly1305-x86_64.S --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o blake2s-x86_64-y := blake2s-core.o blake2s-glue.o +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o +ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),) +targets += poly1305-x86_64-cryptogams.S +endif ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ @@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o -poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o ifeq ($(avx2_supported),yes) sha1-ssse3-y += sha1_avx2_x86_64_asm.o -poly1305-x86_64-y += poly1305-avx2-x86_64.o endif ifeq ($(sha1_ni_supported),yes) sha1-ssse3-y += sha1_ni_asm.o @@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o endif sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $< > $@ +$(obj)/%.S: $(src)/%.pl FORCE + $(call if_changed,perlasm) --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ /dev/null @@ -1,390 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include - -.section .rodata.cst32.ANMASK, "aM", @progbits, 32 -.align 32 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff - .octa 0x0000000003ffffff0000000003ffffff - -.section .rodata.cst32.ORMASK, "aM", @progbits, 32 -.align 32 -ORMASK: .octa 0x00000000010000000000000001000000 - .octa 0x00000000010000000000000001000000 - -.text - -#define h0 0x00(%rdi) -#define h1 0x04(%rdi) -#define h2 0x08(%rdi) -#define h3 0x0c(%rdi) -#define h4 0x10(%rdi) -#define r0 0x00(%rdx) -#define r1 0x04(%rdx) -#define r2 0x08(%rdx) -#define r3 0x0c(%rdx) -#define r4 0x10(%rdx) -#define u0 0x00(%r8) -#define u1 0x04(%r8) -#define u2 0x08(%r8) -#define u3 0x0c(%r8) -#define u4 0x10(%r8) -#define w0 0x18(%r8) -#define w1 0x1c(%r8) -#define w2 0x20(%r8) -#define w3 0x24(%r8) -#define w4 0x28(%r8) -#define y0 0x30(%r8) -#define y1 0x34(%r8) -#define y2 0x38(%r8) -#define y3 0x3c(%r8) -#define y4 0x40(%r8) -#define m %rsi -#define hc0 %ymm0 -#define hc1 %ymm1 -#define hc2 %ymm2 -#define hc3 %ymm3 -#define hc4 %ymm4 -#define hc0x %xmm0 -#define hc1x %xmm1 -#define hc2x %xmm2 -#define hc3x %xmm3 -#define hc4x %xmm4 -#define t1 %ymm5 -#define t2 %ymm6 -#define t1x %xmm5 -#define t2x %xmm6 -#define ruwy0 %ymm7 -#define ruwy1 %ymm8 -#define ruwy2 %ymm9 -#define ruwy3 %ymm10 -#define ruwy4 %ymm11 -#define ruwy0x %xmm7 -#define ruwy1x %xmm8 -#define ruwy2x %xmm9 -#define ruwy3x %xmm10 -#define ruwy4x %xmm11 -#define svxz1 %ymm12 -#define svxz2 %ymm13 -#define svxz3 %ymm14 -#define svxz4 %ymm15 -#define d0 %r9 -#define d1 %r10 -#define d2 %r11 -#define d3 %r12 -#define d4 %r13 - -ENTRY(poly1305_4block_avx2) - # %rdi: Accumulator h[5] - # %rsi: 64 byte input block m - # %rdx: Poly1305 key r[5] - # %rcx: Quadblock count - # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5], - - # This four-block variant uses loop unrolled block processing. It - # requires 4 Poly1305 keys: r, r^2, r^3 and r^4: - # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r - - vzeroupper - push %rbx - push %r12 - push %r13 - - # combine r0,u0,w0,y0 - vmovd y0,ruwy0x - vmovd w0,t1x - vpunpcklqdq t1,ruwy0,ruwy0 - vmovd u0,t1x - vmovd r0,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy0,ruwy0 - - # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5 - vmovd y1,ruwy1x - vmovd w1,t1x - vpunpcklqdq t1,ruwy1,ruwy1 - vmovd u1,t1x - vmovd r1,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy1,ruwy1 - vpslld $2,ruwy1,svxz1 - vpaddd ruwy1,svxz1,svxz1 - - # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5 - vmovd y2,ruwy2x - vmovd w2,t1x - vpunpcklqdq t1,ruwy2,ruwy2 - vmovd u2,t1x - vmovd r2,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy2,ruwy2 - vpslld $2,ruwy2,svxz2 - vpaddd ruwy2,svxz2,svxz2 - - # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5 - vmovd y3,ruwy3x - vmovd w3,t1x - vpunpcklqdq t1,ruwy3,ruwy3 - vmovd u3,t1x - vmovd r3,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy3,ruwy3 - vpslld $2,ruwy3,svxz3 - vpaddd ruwy3,svxz3,svxz3 - - # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5 - vmovd y4,ruwy4x - vmovd w4,t1x - vpunpcklqdq t1,ruwy4,ruwy4 - vmovd u4,t1x - vmovd r4,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy4,ruwy4 - vpslld $2,ruwy4,svxz4 - vpaddd ruwy4,svxz4,svxz4 - -.Ldoblock4: - # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff, - # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0] - vmovd 0x00(m),hc0x - vmovd 0x10(m),t1x - vpunpcklqdq t1,hc0,hc0 - vmovd 0x20(m),t1x - vmovd 0x30(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc0,hc0 - vpand ANMASK(%rip),hc0,hc0 - vmovd h0,t1x - vpaddd t1,hc0,hc0 - # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff, - # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1] - vmovd 0x03(m),hc1x - vmovd 0x13(m),t1x - vpunpcklqdq t1,hc1,hc1 - vmovd 0x23(m),t1x - vmovd 0x33(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc1,hc1 - vpsrld $2,hc1,hc1 - vpand ANMASK(%rip),hc1,hc1 - vmovd h1,t1x - vpaddd t1,hc1,hc1 - # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff, - # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2] - vmovd 0x06(m),hc2x - vmovd 0x16(m),t1x - vpunpcklqdq t1,hc2,hc2 - vmovd 0x26(m),t1x - vmovd 0x36(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc2,hc2 - vpsrld $4,hc2,hc2 - vpand ANMASK(%rip),hc2,hc2 - vmovd h2,t1x - vpaddd t1,hc2,hc2 - # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff, - # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3] - vmovd 0x09(m),hc3x - vmovd 0x19(m),t1x - vpunpcklqdq t1,hc3,hc3 - vmovd 0x29(m),t1x - vmovd 0x39(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc3,hc3 - vpsrld $6,hc3,hc3 - vpand ANMASK(%rip),hc3,hc3 - vmovd h3,t1x - vpaddd t1,hc3,hc3 - # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24), - # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4] - vmovd 0x0c(m),hc4x - vmovd 0x1c(m),t1x - vpunpcklqdq t1,hc4,hc4 - vmovd 0x2c(m),t1x - vmovd 0x3c(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc4,hc4 - vpsrld $8,hc4,hc4 - vpor ORMASK(%rip),hc4,hc4 - vmovd h4,t1x - vpaddd t1,hc4,hc4 - - # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ] - vpmuludq hc0,ruwy0,t1 - # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ] - vpmuludq hc1,svxz4,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ] - vpmuludq hc2,svxz3,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ] - vpmuludq hc3,svxz2,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ] - vpmuludq hc4,svxz1,t2 - vpaddq t2,t1,t1 - # d0 = t1[0] + t1[1] + t[2] + t[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d0 - - # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ] - vpmuludq hc0,ruwy1,t1 - # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ] - vpmuludq hc1,ruwy0,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ] - vpmuludq hc2,svxz4,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ] - vpmuludq hc3,svxz3,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ] - vpmuludq hc4,svxz2,t2 - vpaddq t2,t1,t1 - # d1 = t1[0] + t1[1] + t1[3] + t1[4] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d1 - - # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ] - vpmuludq hc0,ruwy2,t1 - # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ] - vpmuludq hc1,ruwy1,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ] - vpmuludq hc2,ruwy0,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ] - vpmuludq hc3,svxz4,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ] - vpmuludq hc4,svxz3,t2 - vpaddq t2,t1,t1 - # d2 = t1[0] + t1[1] + t1[2] + t1[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d2 - - # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ] - vpmuludq hc0,ruwy3,t1 - # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ] - vpmuludq hc1,ruwy2,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ] - vpmuludq hc2,ruwy1,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ] - vpmuludq hc3,ruwy0,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ] - vpmuludq hc4,svxz4,t2 - vpaddq t2,t1,t1 - # d3 = t1[0] + t1[1] + t1[2] + t1[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d3 - - # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ] - vpmuludq hc0,ruwy4,t1 - # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ] - vpmuludq hc1,ruwy3,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ] - vpmuludq hc2,ruwy2,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ] - vpmuludq hc3,ruwy1,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ] - vpmuludq hc4,ruwy0,t2 - vpaddq t2,t1,t1 - # d4 = t1[0] + t1[1] + t1[2] + t1[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d4 - - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small - # amount. Careful: we must not assume the carry bits 'd0 >> 26', - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit - # integers. It's true in a single-block implementation, but not here. - - # d1 += d0 >> 26 - mov d0,%rax - shr $26,%rax - add %rax,d1 - # h0 = d0 & 0x3ffffff - mov d0,%rbx - and $0x3ffffff,%ebx - - # d2 += d1 >> 26 - mov d1,%rax - shr $26,%rax - add %rax,d2 - # h1 = d1 & 0x3ffffff - mov d1,%rax - and $0x3ffffff,%eax - mov %eax,h1 - - # d3 += d2 >> 26 - mov d2,%rax - shr $26,%rax - add %rax,d3 - # h2 = d2 & 0x3ffffff - mov d2,%rax - and $0x3ffffff,%eax - mov %eax,h2 - - # d4 += d3 >> 26 - mov d3,%rax - shr $26,%rax - add %rax,d4 - # h3 = d3 & 0x3ffffff - mov d3,%rax - and $0x3ffffff,%eax - mov %eax,h3 - - # h0 += (d4 >> 26) * 5 - mov d4,%rax - shr $26,%rax - lea (%rax,%rax,4),%rax - add %rax,%rbx - # h4 = d4 & 0x3ffffff - mov d4,%rax - and $0x3ffffff,%eax - mov %eax,h4 - - # h1 += h0 >> 26 - mov %rbx,%rax - shr $26,%rax - add %eax,h1 - # h0 = h0 & 0x3ffffff - andl $0x3ffffff,%ebx - mov %ebx,h0 - - add $0x40,m - dec %rcx - jnz .Ldoblock4 - - vzeroupper - pop %r13 - pop %r12 - pop %rbx - ret -ENDPROC(poly1305_4block_avx2) --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ /dev/null @@ -1,590 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include - -.section .rodata.cst16.ANMASK, "aM", @progbits, 16 -.align 16 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff - -.section .rodata.cst16.ORMASK, "aM", @progbits, 16 -.align 16 -ORMASK: .octa 0x00000000010000000000000001000000 - -.text - -#define h0 0x00(%rdi) -#define h1 0x04(%rdi) -#define h2 0x08(%rdi) -#define h3 0x0c(%rdi) -#define h4 0x10(%rdi) -#define r0 0x00(%rdx) -#define r1 0x04(%rdx) -#define r2 0x08(%rdx) -#define r3 0x0c(%rdx) -#define r4 0x10(%rdx) -#define s1 0x00(%rsp) -#define s2 0x04(%rsp) -#define s3 0x08(%rsp) -#define s4 0x0c(%rsp) -#define m %rsi -#define h01 %xmm0 -#define h23 %xmm1 -#define h44 %xmm2 -#define t1 %xmm3 -#define t2 %xmm4 -#define t3 %xmm5 -#define t4 %xmm6 -#define mask %xmm7 -#define d0 %r8 -#define d1 %r9 -#define d2 %r10 -#define d3 %r11 -#define d4 %r12 - -ENTRY(poly1305_block_sse2) - # %rdi: Accumulator h[5] - # %rsi: 16 byte input block m - # %rdx: Poly1305 key r[5] - # %rcx: Block count - - # This single block variant tries to improve performance by doing two - # multiplications in parallel using SSE instructions. There is quite - # some quardword packing involved, hence the speedup is marginal. - - push %rbx - push %r12 - sub $0x10,%rsp - - # s1..s4 = r1..r4 * 5 - mov r1,%eax - lea (%eax,%eax,4),%eax - mov %eax,s1 - mov r2,%eax - lea (%eax,%eax,4),%eax - mov %eax,s2 - mov r3,%eax - lea (%eax,%eax,4),%eax - mov %eax,s3 - mov r4,%eax - lea (%eax,%eax,4),%eax - mov %eax,s4 - - movdqa ANMASK(%rip),mask - -.Ldoblock: - # h01 = [0, h1, 0, h0] - # h23 = [0, h3, 0, h2] - # h44 = [0, h4, 0, h4] - movd h0,h01 - movd h1,t1 - movd h2,h23 - movd h3,t2 - movd h4,h44 - punpcklqdq t1,h01 - punpcklqdq t2,h23 - punpcklqdq h44,h44 - - # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ] - movd 0x00(m),t1 - movd 0x03(m),t2 - psrld $2,t2 - punpcklqdq t2,t1 - pand mask,t1 - paddd t1,h01 - # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ] - movd 0x06(m),t1 - movd 0x09(m),t2 - psrld $4,t1 - psrld $6,t2 - punpcklqdq t2,t1 - pand mask,t1 - paddd t1,h23 - # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ] - mov 0x0c(m),%eax - shr $8,%eax - or $0x01000000,%eax - movd %eax,t1 - pshufd $0xc4,t1,t1 - paddd t1,h44 - - # t1[0] = h0 * r0 + h2 * s3 - # t1[1] = h1 * s4 + h3 * s2 - movd r0,t1 - movd s4,t2 - punpcklqdq t2,t1 - pmuludq h01,t1 - movd s3,t2 - movd s2,t3 - punpcklqdq t3,t2 - pmuludq h23,t2 - paddq t2,t1 - # t2[0] = h0 * r1 + h2 * s4 - # t2[1] = h1 * r0 + h3 * s3 - movd r1,t2 - movd r0,t3 - punpcklqdq t3,t2 - pmuludq h01,t2 - movd s4,t3 - movd s3,t4 - punpcklqdq t4,t3 - pmuludq h23,t3 - paddq t3,t2 - # t3[0] = h4 * s1 - # t3[1] = h4 * s2 - movd s1,t3 - movd s2,t4 - punpcklqdq t4,t3 - pmuludq h44,t3 - # d0 = t1[0] + t1[1] + t3[0] - # d1 = t2[0] + t2[1] + t3[1] - movdqa t1,t4 - punpcklqdq t2,t4 - punpckhqdq t2,t1 - paddq t4,t1 - paddq t3,t1 - movq t1,d0 - psrldq $8,t1 - movq t1,d1 - - # t1[0] = h0 * r2 + h2 * r0 - # t1[1] = h1 * r1 + h3 * s4 - movd r2,t1 - movd r1,t2 - punpcklqdq t2,t1 - pmuludq h01,t1 - movd r0,t2 - movd s4,t3 - punpcklqdq t3,t2 - pmuludq h23,t2 - paddq t2,t1 - # t2[0] = h0 * r3 + h2 * r1 - # t2[1] = h1 * r2 + h3 * r0 - movd r3,t2 - movd r2,t3 - punpcklqdq t3,t2 - pmuludq h01,t2 - movd r1,t3 - movd r0,t4 - punpcklqdq t4,t3 - pmuludq h23,t3 - paddq t3,t2 - # t3[0] = h4 * s3 - # t3[1] = h4 * s4 - movd s3,t3 - movd s4,t4 - punpcklqdq t4,t3 - pmuludq h44,t3 - # d2 = t1[0] + t1[1] + t3[0] - # d3 = t2[0] + t2[1] + t3[1] - movdqa t1,t4 - punpcklqdq t2,t4 - punpckhqdq t2,t1 - paddq t4,t1 - paddq t3,t1 - movq t1,d2 - psrldq $8,t1 - movq t1,d3 - - # t1[0] = h0 * r4 + h2 * r2 - # t1[1] = h1 * r3 + h3 * r1 - movd r4,t1 - movd r3,t2 - punpcklqdq t2,t1 - pmuludq h01,t1 - movd r2,t2 - movd r1,t3 - punpcklqdq t3,t2 - pmuludq h23,t2 - paddq t2,t1 - # t3[0] = h4 * r0 - movd r0,t3 - pmuludq h44,t3 - # d4 = t1[0] + t1[1] + t3[0] - movdqa t1,t4 - psrldq $8,t4 - paddq t4,t1 - paddq t3,t1 - movq t1,d4 - - # d1 += d0 >> 26 - mov d0,%rax - shr $26,%rax - add %rax,d1 - # h0 = d0 & 0x3ffffff - mov d0,%rbx - and $0x3ffffff,%ebx - - # d2 += d1 >> 26 - mov d1,%rax - shr $26,%rax - add %rax,d2 - # h1 = d1 & 0x3ffffff - mov d1,%rax - and $0x3ffffff,%eax - mov %eax,h1 - - # d3 += d2 >> 26 - mov d2,%rax - shr $26,%rax - add %rax,d3 - # h2 = d2 & 0x3ffffff - mov d2,%rax - and $0x3ffffff,%eax - mov %eax,h2 - - # d4 += d3 >> 26 - mov d3,%rax - shr $26,%rax - add %rax,d4 - # h3 = d3 & 0x3ffffff - mov d3,%rax - and $0x3ffffff,%eax - mov %eax,h3 - - # h0 += (d4 >> 26) * 5 - mov d4,%rax - shr $26,%rax - lea (%rax,%rax,4),%rax - add %rax,%rbx - # h4 = d4 & 0x3ffffff - mov d4,%rax - and $0x3ffffff,%eax - mov %eax,h4 - - # h1 += h0 >> 26 - mov %rbx,%rax - shr $26,%rax - add %eax,h1 - # h0 = h0 & 0x3ffffff - andl $0x3ffffff,%ebx - mov %ebx,h0 - - add $0x10,m - dec %rcx - jnz .Ldoblock - - # Zeroing of key material - mov %rcx,0x00(%rsp) - mov %rcx,0x08(%rsp) - - add $0x10,%rsp - pop %r12 - pop %rbx - ret -ENDPROC(poly1305_block_sse2) - - -#define u0 0x00(%r8) -#define u1 0x04(%r8) -#define u2 0x08(%r8) -#define u3 0x0c(%r8) -#define u4 0x10(%r8) -#define hc0 %xmm0 -#define hc1 %xmm1 -#define hc2 %xmm2 -#define hc3 %xmm5 -#define hc4 %xmm6 -#define ru0 %xmm7 -#define ru1 %xmm8 -#define ru2 %xmm9 -#define ru3 %xmm10 -#define ru4 %xmm11 -#define sv1 %xmm12 -#define sv2 %xmm13 -#define sv3 %xmm14 -#define sv4 %xmm15 -#undef d0 -#define d0 %r13 - -ENTRY(poly1305_2block_sse2) - # %rdi: Accumulator h[5] - # %rsi: 16 byte input block m - # %rdx: Poly1305 key r[5] - # %rcx: Doubleblock count - # %r8: Poly1305 derived key r^2 u[5] - - # This two-block variant further improves performance by using loop - # unrolled block processing. This is more straight forward and does - # less byte shuffling, but requires a second Poly1305 key r^2: - # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r - - push %rbx - push %r12 - push %r13 - - # combine r0,u0 - movd u0,ru0 - movd r0,t1 - punpcklqdq t1,ru0 - - # combine r1,u1 and s1=r1*5,v1=u1*5 - movd u1,ru1 - movd r1,t1 - punpcklqdq t1,ru1 - movdqa ru1,sv1 - pslld $2,sv1 - paddd ru1,sv1 - - # combine r2,u2 and s2=r2*5,v2=u2*5 - movd u2,ru2 - movd r2,t1 - punpcklqdq t1,ru2 - movdqa ru2,sv2 - pslld $2,sv2 - paddd ru2,sv2 - - # combine r3,u3 and s3=r3*5,v3=u3*5 - movd u3,ru3 - movd r3,t1 - punpcklqdq t1,ru3 - movdqa ru3,sv3 - pslld $2,sv3 - paddd ru3,sv3 - - # combine r4,u4 and s4=r4*5,v4=u4*5 - movd u4,ru4 - movd r4,t1 - punpcklqdq t1,ru4 - movdqa ru4,sv4 - pslld $2,sv4 - paddd ru4,sv4 - -.Ldoblock2: - # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ] - movd 0x00(m),hc0 - movd 0x10(m),t1 - punpcklqdq t1,hc0 - pand ANMASK(%rip),hc0 - movd h0,t1 - paddd t1,hc0 - # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ] - movd 0x03(m),hc1 - movd 0x13(m),t1 - punpcklqdq t1,hc1 - psrld $2,hc1 - pand ANMASK(%rip),hc1 - movd h1,t1 - paddd t1,hc1 - # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ] - movd 0x06(m),hc2 - movd 0x16(m),t1 - punpcklqdq t1,hc2 - psrld $4,hc2 - pand ANMASK(%rip),hc2 - movd h2,t1 - paddd t1,hc2 - # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ] - movd 0x09(m),hc3 - movd 0x19(m),t1 - punpcklqdq t1,hc3 - psrld $6,hc3 - pand ANMASK(%rip),hc3 - movd h3,t1 - paddd t1,hc3 - # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ] - movd 0x0c(m),hc4 - movd 0x1c(m),t1 - punpcklqdq t1,hc4 - psrld $8,hc4 - por ORMASK(%rip),hc4 - movd h4,t1 - paddd t1,hc4 - - # t1 = [ hc0[1] * r0, hc0[0] * u0 ] - movdqa ru0,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * s4, hc1[0] * v4 ] - movdqa sv4,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * s3, hc2[0] * v3 ] - movdqa sv3,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * s2, hc3[0] * v2 ] - movdqa sv2,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * s1, hc4[0] * v1 ] - movdqa sv1,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d0 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d0 - - # t1 = [ hc0[1] * r1, hc0[0] * u1 ] - movdqa ru1,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r0, hc1[0] * u0 ] - movdqa ru0,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * s4, hc2[0] * v4 ] - movdqa sv4,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * s3, hc3[0] * v3 ] - movdqa sv3,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * s2, hc4[0] * v2 ] - movdqa sv2,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d1 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d1 - - # t1 = [ hc0[1] * r2, hc0[0] * u2 ] - movdqa ru2,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r1, hc1[0] * u1 ] - movdqa ru1,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * r0, hc2[0] * u0 ] - movdqa ru0,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * s4, hc3[0] * v4 ] - movdqa sv4,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * s3, hc4[0] * v3 ] - movdqa sv3,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d2 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d2 - - # t1 = [ hc0[1] * r3, hc0[0] * u3 ] - movdqa ru3,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r2, hc1[0] * u2 ] - movdqa ru2,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * r1, hc2[0] * u1 ] - movdqa ru1,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * r0, hc3[0] * u0 ] - movdqa ru0,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * s4, hc4[0] * v4 ] - movdqa sv4,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d3 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d3 - - # t1 = [ hc0[1] * r4, hc0[0] * u4 ] - movdqa ru4,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r3, hc1[0] * u3 ] - movdqa ru3,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * r2, hc2[0] * u2 ] - movdqa ru2,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * r1, hc3[0] * u1 ] - movdqa ru1,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * r0, hc4[0] * u0 ] - movdqa ru0,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d4 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d4 - - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small - # amount. Careful: we must not assume the carry bits 'd0 >> 26', - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit - # integers. It's true in a single-block implementation, but not here. - - # d1 += d0 >> 26 - mov d0,%rax - shr $26,%rax - add %rax,d1 - # h0 = d0 & 0x3ffffff - mov d0,%rbx - and $0x3ffffff,%ebx - - # d2 += d1 >> 26 - mov d1,%rax - shr $26,%rax - add %rax,d2 - # h1 = d1 & 0x3ffffff - mov d1,%rax - and $0x3ffffff,%eax - mov %eax,h1 - - # d3 += d2 >> 26 - mov d2,%rax - shr $26,%rax - add %rax,d3 - # h2 = d2 & 0x3ffffff - mov d2,%rax - and $0x3ffffff,%eax - mov %eax,h2 - - # d4 += d3 >> 26 - mov d3,%rax - shr $26,%rax - add %rax,d4 - # h3 = d3 & 0x3ffffff - mov d3,%rax - and $0x3ffffff,%eax - mov %eax,h3 - - # h0 += (d4 >> 26) * 5 - mov d4,%rax - shr $26,%rax - lea (%rax,%rax,4),%rax - add %rax,%rbx - # h4 = d4 & 0x3ffffff - mov d4,%rax - and $0x3ffffff,%eax - mov %eax,h4 - - # h1 += h0 >> 26 - mov %rbx,%rax - shr $26,%rax - add %eax,h1 - # h0 = h0 & 0x3ffffff - andl $0x3ffffff,%ebx - mov %ebx,h0 - - add $0x20,m - dec %rcx - jnz .Ldoblock2 - - pop %r13 - pop %r12 - pop %rbx - ret -ENDPROC(poly1305_2block_sse2) --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -1,11 +1,14 @@ -#! /usr/bin/env perl -# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause # -# Licensed under the OpenSSL license (the "License"). You may not use -# this file except in compliance with the License. You can obtain a copy -# in the file LICENSE in the source distribution or at -# https://www.openssl.org/source/license.html - +# Copyright (C) 2017-2018 Samuel Neves . All Rights Reserved. +# Copyright (C) 2017-2019 Jason A. Donenfeld . All Rights Reserved. +# Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL @@ -32,7 +35,7 @@ # Skylake-X system performance. Since we are likely to suppress # AVX512F capability flag [at least on Skylake-X], conversion serves # as kind of "investment protection". Note that next *lake processor, -# Cannolake, has AVX512IFMA code path to execute... +# Cannonlake, has AVX512IFMA code path to execute... # # Numbers are cycles per processed byte with poly1305_blocks alone, # measured with rdtsc at fixed clock frequency. @@ -68,39 +71,114 @@ $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); +$kernel=0; $kernel=1 if (!$flavour && !$output); -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); +if (!$kernel) { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or + die "can't locate x86_64-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; + + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); + } + + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); + $avx += 1 if ($1==2.11 && $2>=8); + } + + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); + } + + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); + } +} else { + $avx = 4; # The kernel uses ifdefs for this. } -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { - $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); - $avx += 2 if ($1==2.11 && $2>=8); +sub declare_function() { + my ($name, $align, $nargs) = @_; + if($kernel) { + $code .= ".align $align\n"; + $code .= "ENTRY($name)\n"; + $code .= ".L$name:\n"; + } else { + $code .= ".globl $name\n"; + $code .= ".type $name,\@function,$nargs\n"; + $code .= ".align $align\n"; + $code .= "$name:\n"; + } } -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=12); +sub end_function() { + my ($name) = @_; + if($kernel) { + $code .= "ENDPROC($name)\n"; + } else { + $code .= ".size $name,.-$name\n"; + } } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { - $avx = ($2>=3.0) + ($2>3.0); -} +$code.=<<___ if $kernel; +#include +___ -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; -*STDOUT=*OUT; +if ($avx) { +$code.=<<___ if $kernel; +.section .rodata +___ +$code.=<<___; +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +___ +} +$code.=<<___ if (!$kernel); +.asciz "Poly1305 for x86_64, CRYPTOGAMS by " +.align 16 +___ my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); my ($mac,$nonce)=($inp,$len); # *_emit arguments -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); +my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); sub poly1305_iteration { # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 @@ -155,19 +233,19 @@ ___ $code.=<<___; .text - +___ +$code.=<<___ if (!$kernel); .extern OPENSSL_ia32cap_P -.globl poly1305_init -.hidden poly1305_init -.globl poly1305_blocks -.hidden poly1305_blocks -.globl poly1305_emit -.hidden poly1305_emit - -.type poly1305_init,\@function,3 -.align 32 -poly1305_init: +.globl poly1305_init_x86_64 +.hidden poly1305_init_x86_64 +.globl poly1305_blocks_x86_64 +.hidden poly1305_blocks_x86_64 +.globl poly1305_emit_x86_64 +.hidden poly1305_emit_x86_64 +___ +&declare_function("poly1305_init_x86_64", 32, 3); +$code.=<<___; xor %rax,%rax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) @@ -175,11 +253,12 @@ poly1305_init: cmp \$0,$inp je .Lno_key - - lea poly1305_blocks(%rip),%r10 - lea poly1305_emit(%rip),%r11 ___ -$code.=<<___ if ($avx); +$code.=<<___ if (!$kernel); + lea poly1305_blocks_x86_64(%rip),%r10 + lea poly1305_emit_x86_64(%rip),%r11 +___ +$code.=<<___ if (!$kernel && $avx); mov OPENSSL_ia32cap_P+4(%rip),%r9 lea poly1305_blocks_avx(%rip),%rax lea poly1305_emit_avx(%rip),%rcx @@ -187,12 +266,12 @@ $code.=<<___ if ($avx); cmovc %rax,%r10 cmovc %rcx,%r11 ___ -$code.=<<___ if ($avx>1); +$code.=<<___ if (!$kernel && $avx>1); lea poly1305_blocks_avx2(%rip),%rax bt \$`5+32`,%r9 # AVX2? cmovc %rax,%r10 ___ -$code.=<<___ if ($avx>3); +$code.=<<___ if (!$kernel && $avx>3); mov \$`(1<<31|1<<21|1<<16)`,%rax shr \$32,%r9 and %rax,%r9 @@ -207,11 +286,11 @@ $code.=<<___; mov %rax,24($ctx) mov %rcx,32($ctx) ___ -$code.=<<___ if ($flavour !~ /elf32/); +$code.=<<___ if (!$kernel && $flavour !~ /elf32/); mov %r10,0(%rdx) mov %r11,8(%rdx) ___ -$code.=<<___ if ($flavour =~ /elf32/); +$code.=<<___ if (!$kernel && $flavour =~ /elf32/); mov %r10d,0(%rdx) mov %r11d,4(%rdx) ___ @@ -219,11 +298,11 @@ $code.=<<___; mov \$1,%eax .Lno_key: ret -.size poly1305_init,.-poly1305_init +___ +&end_function("poly1305_init_x86_64"); -.type poly1305_blocks,\@function,4 -.align 32 -poly1305_blocks: +&declare_function("poly1305_blocks_x86_64", 32, 4); +$code.=<<___; .cfi_startproc .Lblocks: shr \$4,$len @@ -231,8 +310,6 @@ poly1305_blocks: push %rbx .cfi_push %rbx - push %rbp -.cfi_push %rbp push %r12 .cfi_push %r12 push %r13 @@ -241,6 +318,8 @@ poly1305_blocks: .cfi_push %r14 push %r15 .cfi_push %r15 + push $ctx +.cfi_push $ctx .Lblocks_body: mov $len,%r15 # reassign $len @@ -265,26 +344,29 @@ poly1305_blocks: lea 16($inp),$inp adc $padbit,$h2 ___ + &poly1305_iteration(); + $code.=<<___; mov $r1,%rax dec %r15 # len-=16 jnz .Loop + mov 0(%rsp),$ctx +.cfi_restore $ctx + mov $h0,0($ctx) # store hash value mov $h1,8($ctx) mov $h2,16($ctx) - mov 0(%rsp),%r15 + mov 8(%rsp),%r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + mov 16(%rsp),%r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + mov 24(%rsp),%r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + mov 32(%rsp),%r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rsp @@ -293,11 +375,11 @@ $code.=<<___; .Lblocks_epilogue: ret .cfi_endproc -.size poly1305_blocks,.-poly1305_blocks +___ +&end_function("poly1305_blocks_x86_64"); -.type poly1305_emit,\@function,3 -.align 32 -poly1305_emit: +&declare_function("poly1305_emit_x86_64", 32, 3); +$code.=<<___; .Lemit: mov 0($ctx),%r8 # load hash value mov 8($ctx),%r9 @@ -318,10 +400,14 @@ poly1305_emit: mov %rcx,8($mac) ret -.size poly1305_emit,.-poly1305_emit ___ +&end_function("poly1305_emit_x86_64"); if ($avx) { +if($kernel) { + $code .= "#ifdef CONFIG_AS_AVX\n"; +} + ######################################################################## # Layout of opaque area is following. # @@ -342,15 +428,19 @@ $code.=<<___; .type __poly1305_block,\@abi-omnipotent .align 32 __poly1305_block: + push $ctx ___ &poly1305_iteration(); $code.=<<___; + pop $ctx ret .size __poly1305_block,.-__poly1305_block .type __poly1305_init_avx,\@abi-omnipotent .align 32 __poly1305_init_avx: + push %rbp + mov %rsp,%rbp mov $r0,$h0 mov $r1,$h1 xor $h2,$h2 @@ -507,12 +597,13 @@ __poly1305_init_avx: mov $d1#d,`16*8+8-64`($ctx) lea -48-64($ctx),$ctx # size [de-]optimization + pop %rbp ret .size __poly1305_init_avx,.-__poly1305_init_avx +___ -.type poly1305_blocks_avx,\@function,4 -.align 32 -poly1305_blocks_avx: +&declare_function("poly1305_blocks_avx", 32, 4); +$code.=<<___; .cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len @@ -532,10 +623,11 @@ poly1305_blocks_avx: test \$31,$len jz .Leven_avx - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -645,20 +737,18 @@ poly1305_blocks_avx: mov $h2#d,16($ctx) .align 16 .Ldone_avx: - mov 0(%rsp),%r15 + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 + pop %rbp +.cfi_restore %rbp .Lno_data_avx: .Lblocks_avx_epilogue: ret @@ -667,10 +757,11 @@ poly1305_blocks_avx: .align 32 .Lbase2_64_avx: .cfi_startproc - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -736,22 +827,18 @@ poly1305_blocks_avx: .Lproceed_avx: mov %r15,$len - - mov 0(%rsp),%r15 + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rax - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 + pop %rbp +.cfi_restore %rbp .Lbase2_64_avx_epilogue: jmp .Ldo_avx .cfi_endproc @@ -768,8 +855,11 @@ poly1305_blocks_avx: .Ldo_avx: ___ $code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + and \$-32,%rsp + sub \$-8,%rsp lea -0x58(%rsp),%r11 -.cfi_def_cfa %r11,0x60 sub \$0x178,%rsp ___ $code.=<<___ if ($win64); @@ -1361,18 +1451,18 @@ $code.=<<___ if ($win64); .Ldo_avx_epilogue: ___ $code.=<<___ if (!$win64); - lea 0x58(%r11),%rsp -.cfi_def_cfa %rsp,8 + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp ___ $code.=<<___; vzeroupper ret .cfi_endproc -.size poly1305_blocks_avx,.-poly1305_blocks_avx +___ +&end_function("poly1305_blocks_avx"); -.type poly1305_emit_avx,\@function,3 -.align 32 -poly1305_emit_avx: +&declare_function("poly1305_emit_avx", 32, 3); +$code.=<<___; cmpl \$0,20($ctx) # is_base2_26? je .Lemit @@ -1423,41 +1513,51 @@ poly1305_emit_avx: mov %rcx,8($mac) ret -.size poly1305_emit_avx,.-poly1305_emit_avx ___ +&end_function("poly1305_emit_avx"); + +if ($kernel) { + $code .= "#endif\n"; +} if ($avx>1) { + +if ($kernel) { + $code .= "#ifdef CONFIG_AS_AVX2\n"; +} + my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = map("%ymm$_",(0..15)); my $S4=$MASK; +sub poly1305_blocks_avxN { + my ($avx512) = @_; + my $suffix = $avx512 ? "_avx512" : ""; $code.=<<___; -.type poly1305_blocks_avx2,\@function,4 -.align 32 -poly1305_blocks_avx2: .cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len - jae .Lblocks_avx2 + jae .Lblocks_avx2$suffix test %r8d,%r8d jz .Lblocks -.Lblocks_avx2: +.Lblocks_avx2$suffix: and \$-16,$len - jz .Lno_data_avx2 + jz .Lno_data_avx2$suffix vzeroupper test %r8d,%r8d - jz .Lbase2_64_avx2 + jz .Lbase2_64_avx2$suffix test \$63,$len - jz .Leven_avx2 + jz .Leven_avx2$suffix - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -1466,7 +1566,7 @@ poly1305_blocks_avx2: .cfi_push %r14 push %r15 .cfi_push %r15 -.Lblocks_avx2_body: +.Lblocks_avx2_body$suffix: mov $len,%r15 # reassign $len @@ -1513,7 +1613,7 @@ poly1305_blocks_avx2: shr \$2,$s1 add $r1,$s1 # s1 = r1 + (r1 >> 2) -.Lbase2_26_pre_avx2: +.Lbase2_26_pre_avx2$suffix: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp @@ -1524,10 +1624,10 @@ poly1305_blocks_avx2: mov $r1,%rax test \$63,%r15 - jnz .Lbase2_26_pre_avx2 + jnz .Lbase2_26_pre_avx2$suffix test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx2 # store hash in base 2^64 format + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format ################################# base 2^64 -> base 2^26 mov $h0,%rax @@ -1548,57 +1648,56 @@ poly1305_blocks_avx2: or $r1,$h2 # h[4] test %r15,%r15 - jz .Lstore_base2_26_avx2 + jz .Lstore_base2_26_avx2$suffix vmovd %rax#d,%x#$H0 vmovd %rdx#d,%x#$H1 vmovd $h0#d,%x#$H2 vmovd $h1#d,%x#$H3 vmovd $h2#d,%x#$H4 - jmp .Lproceed_avx2 + jmp .Lproceed_avx2$suffix .align 32 -.Lstore_base2_64_avx2: +.Lstore_base2_64_avx2$suffix: mov $h0,0($ctx) mov $h1,8($ctx) mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx2 + jmp .Ldone_avx2$suffix .align 16 -.Lstore_base2_26_avx2: +.Lstore_base2_26_avx2$suffix: mov %rax#d,0($ctx) # store hash value base 2^26 mov %rdx#d,4($ctx) mov $h0#d,8($ctx) mov $h1#d,12($ctx) mov $h2#d,16($ctx) .align 16 -.Ldone_avx2: - mov 0(%rsp),%r15 +.Ldone_avx2$suffix: + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lno_data_avx2: -.Lblocks_avx2_epilogue: + pop %rbp +.cfi_restore %rbp +.Lno_data_avx2$suffix: +.Lblocks_avx2_epilogue$suffix: ret .cfi_endproc .align 32 -.Lbase2_64_avx2: +.Lbase2_64_avx2$suffix: .cfi_startproc - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -1607,7 +1706,7 @@ poly1305_blocks_avx2: .cfi_push %r14 push %r15 .cfi_push %r15 -.Lbase2_64_avx2_body: +.Lbase2_64_avx2_body$suffix: mov $len,%r15 # reassign $len @@ -1624,9 +1723,9 @@ poly1305_blocks_avx2: add $r1,$s1 # s1 = r1 + (r1 >> 2) test \$63,$len - jz .Linit_avx2 + jz .Linit_avx2$suffix -.Lbase2_64_pre_avx2: +.Lbase2_64_pre_avx2$suffix: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp @@ -1637,9 +1736,9 @@ poly1305_blocks_avx2: mov $r1,%rax test \$63,%r15 - jnz .Lbase2_64_pre_avx2 + jnz .Lbase2_64_pre_avx2$suffix -.Linit_avx2: +.Linit_avx2$suffix: ################################# base 2^64 -> base 2^26 mov $h0,%rax mov $h0,%rdx @@ -1667,69 +1766,77 @@ poly1305_blocks_avx2: call __poly1305_init_avx -.Lproceed_avx2: +.Lproceed_avx2$suffix: mov %r15,$len # restore $len - mov OPENSSL_ia32cap_P+8(%rip),%r10d +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d mov \$`(1<<31|1<<30|1<<16)`,%r11d - - mov 0(%rsp),%r15 +___ +$code.=<<___; + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rax - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lbase2_64_avx2_epilogue: - jmp .Ldo_avx2 + pop %rbp +.cfi_restore %rbp +.Lbase2_64_avx2_epilogue$suffix: + jmp .Ldo_avx2$suffix .cfi_endproc .align 32 -.Leven_avx2: +.Leven_avx2$suffix: .cfi_startproc - mov OPENSSL_ia32cap_P+8(%rip),%r10d +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d +___ +$code.=<<___; vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 vmovd 4*1($ctx),%x#$H1 vmovd 4*2($ctx),%x#$H2 vmovd 4*3($ctx),%x#$H3 vmovd 4*4($ctx),%x#$H4 -.Ldo_avx2: +.Ldo_avx2$suffix: ___ -$code.=<<___ if ($avx>2); +$code.=<<___ if (!$kernel && $avx>2); cmp \$512,$len jb .Lskip_avx512 - and %r11d,%r10d - test \$`1<<16`,%r10d # check for AVX512F + and %r11d,%r9d + test \$`1<<16`,%r9d # check for AVX512F jnz .Lblocks_avx512 -.Lskip_avx512: +.Lskip_avx512$suffix: +___ +$code.=<<___ if ($avx > 2 && $avx512 && $kernel); + cmp \$512,$len + jae .Lblocks_avx512 ___ $code.=<<___ if (!$win64); - lea -8(%rsp),%r11 -.cfi_def_cfa %r11,16 + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 + lea 8(%rsp),%r10 sub \$0x1c8,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx2_body: + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) +.Ldo_avx2_body$suffix: ___ $code.=<<___; lea .Lconst(%rip),%rcx @@ -1794,11 +1901,11 @@ $code.=<<___; vpaddq $H2,$T2,$H2 # accumulate input sub \$64,$len - jz .Ltail_avx2 - jmp .Loop_avx2 + jz .Ltail_avx2$suffix + jmp .Loop_avx2$suffix .align 32 -.Loop_avx2: +.Loop_avx2$suffix: ################################################################ # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 @@ -1946,10 +2053,10 @@ $code.=<<___; vpor 32(%rcx),$T4,$T4 # padbit, yes, always sub \$64,$len - jnz .Loop_avx2 + jnz .Loop_avx2$suffix .byte 0x66,0x90 -.Ltail_avx2: +.Ltail_avx2$suffix: ################################################################ # while above multiplications were by r^4 in all lanes, in last # iteration we multiply least significant lane by r^4 and most @@ -2087,37 +2194,29 @@ $code.=<<___; vmovd %x#$H4,`4*4-48-64`($ctx) ___ $code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 - vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx2_epilogue: + vmovdqa -0xb0(%r10),%xmm6 + vmovdqa -0xa0(%r10),%xmm7 + vmovdqa -0x90(%r10),%xmm8 + vmovdqa -0x80(%r10),%xmm9 + vmovdqa -0x70(%r10),%xmm10 + vmovdqa -0x60(%r10),%xmm11 + vmovdqa -0x50(%r10),%xmm12 + vmovdqa -0x40(%r10),%xmm13 + vmovdqa -0x30(%r10),%xmm14 + vmovdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp +.Ldo_avx2_epilogue$suffix: ___ $code.=<<___ if (!$win64); - lea 8(%r11),%rsp -.cfi_def_cfa %rsp,8 + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp ___ $code.=<<___; vzeroupper ret .cfi_endproc -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 ___ -####################################################################### -if ($avx>2) { -# On entry we have input length divisible by 64. But since inner loop -# processes 128 bytes per iteration, cases when length is not divisible -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this -# reason stack layout is kept identical to poly1305_blocks_avx2. If not -# for this tail, we wouldn't have to even allocate stack frame... - +if($avx > 2 && $avx512) { my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); my $PADBIT="%zmm30"; @@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); map(s/%y/%z/,($MASK)); $code.=<<___; -.type poly1305_blocks_avx512,\@function,4 -.align 32 -poly1305_blocks_avx512: .cfi_startproc .Lblocks_avx512: mov \$15,%eax kmovw %eax,%k2 ___ $code.=<<___ if (!$win64); - lea -8(%rsp),%r11 -.cfi_def_cfa %r11,16 + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 + lea 8(%rsp),%r10 sub \$0x1c8,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) .Ldo_avx512_body: ___ $code.=<<___; @@ -2679,7 +2775,7 @@ $code.=<<___; lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 add \$64,$len - jnz .Ltail_avx2 + jnz .Ltail_avx2$suffix vpsubq $T2,$H2,$H2 # undo input accumulation vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced @@ -2690,29 +2786,61 @@ $code.=<<___; vzeroall ___ $code.=<<___ if ($win64); - movdqa 0x50(%r11),%xmm6 - movdqa 0x60(%r11),%xmm7 - movdqa 0x70(%r11),%xmm8 - movdqa 0x80(%r11),%xmm9 - movdqa 0x90(%r11),%xmm10 - movdqa 0xa0(%r11),%xmm11 - movdqa 0xb0(%r11),%xmm12 - movdqa 0xc0(%r11),%xmm13 - movdqa 0xd0(%r11),%xmm14 - movdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp + movdqa -0xb0(%r10),%xmm6 + movdqa -0xa0(%r10),%xmm7 + movdqa -0x90(%r10),%xmm8 + movdqa -0x80(%r10),%xmm9 + movdqa -0x70(%r10),%xmm10 + movdqa -0x60(%r10),%xmm11 + movdqa -0x50(%r10),%xmm12 + movdqa -0x40(%r10),%xmm13 + movdqa -0x30(%r10),%xmm14 + movdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp .Ldo_avx512_epilogue: ___ $code.=<<___ if (!$win64); - lea 8(%r11),%rsp -.cfi_def_cfa %rsp,8 + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp ___ $code.=<<___; ret .cfi_endproc -.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 ___ -if ($avx>3) { + +} + +} + +&declare_function("poly1305_blocks_avx2", 32, 4); +poly1305_blocks_avxN(0); +&end_function("poly1305_blocks_avx2"); + +if($kernel) { + $code .= "#endif\n"; +} + +####################################################################### +if ($avx>2) { +# On entry we have input length divisible by 64. But since inner loop +# processes 128 bytes per iteration, cases when length is not divisible +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this +# reason stack layout is kept identical to poly1305_blocks_avx2. If not +# for this tail, we wouldn't have to even allocate stack frame... + +if($kernel) { + $code .= "#ifdef CONFIG_AS_AVX512\n"; +} + +&declare_function("poly1305_blocks_avx512", 32, 4); +poly1305_blocks_avxN(1); +&end_function("poly1305_blocks_avx512"); + +if ($kernel) { + $code .= "#endif\n"; +} + +if (!$kernel && $avx>3) { ######################################################################## # VPMADD52 version using 2^44 radix. # @@ -3753,45 +3881,9 @@ poly1305_emit_base2_44: .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 ___ } } } -$code.=<<___; -.align 64 -.Lconst: -.Lmask24: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.L129: -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 -.Lmask26: -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lpermd_avx2: -.long 2,2,2,3,2,0,2,1 -.Lpermd_avx512: -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 - -.L2_44_inp_permd: -.long 0,1,1,2,2,3,7,7 -.L2_44_inp_shift: -.quad 0,12,24,64 -.L2_44_mask: -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff -.L2_44_shift_rgt: -.quad 44,44,42,64 -.L2_44_shift_lft: -.quad 8,8,10,64 - -.align 64 -.Lx_mask44: -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.Lx_mask42: -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -___ } -$code.=<<___; -.asciz "Poly1305 for x86_64, CRYPTOGAMS by " -.align 16 -___ +if (!$kernel) { # chacha20-poly1305 helpers my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order @@ -4038,17 +4130,17 @@ avx_handler: .section .pdata .align 4 - .rva .LSEH_begin_poly1305_init - .rva .LSEH_end_poly1305_init - .rva .LSEH_info_poly1305_init - - .rva .LSEH_begin_poly1305_blocks - .rva .LSEH_end_poly1305_blocks - .rva .LSEH_info_poly1305_blocks - - .rva .LSEH_begin_poly1305_emit - .rva .LSEH_end_poly1305_emit - .rva .LSEH_info_poly1305_emit + .rva .LSEH_begin_poly1305_init_x86_64 + .rva .LSEH_end_poly1305_init_x86_64 + .rva .LSEH_info_poly1305_init_x86_64 + + .rva .LSEH_begin_poly1305_blocks_x86_64 + .rva .LSEH_end_poly1305_blocks_x86_64 + .rva .LSEH_info_poly1305_blocks_x86_64 + + .rva .LSEH_begin_poly1305_emit_x86_64 + .rva .LSEH_end_poly1305_emit_x86_64 + .rva .LSEH_info_poly1305_emit_x86_64 ___ $code.=<<___ if ($avx); .rva .LSEH_begin_poly1305_blocks_avx @@ -4088,20 +4180,20 @@ ___ $code.=<<___; .section .xdata .align 8 -.LSEH_info_poly1305_init: +.LSEH_info_poly1305_init_x86_64: .byte 9,0,0,0 .rva se_handler - .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 -.LSEH_info_poly1305_blocks: +.LSEH_info_poly1305_blocks_x86_64: .byte 9,0,0,0 .rva se_handler .rva .Lblocks_body,.Lblocks_epilogue -.LSEH_info_poly1305_emit: +.LSEH_info_poly1305_emit_x86_64: .byte 9,0,0,0 .rva se_handler - .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 ___ $code.=<<___ if ($avx); .LSEH_info_poly1305_blocks_avx_1: @@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2); ___ } +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + foreach (split('\n',$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/%r([a-z]+)#d/%e$1/g; s/%r([0-9]+)#d/%r$1d/g; s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; + if ($kernel) { + s/(^\.type.*),[0-9]+$/\1/; + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; + next if /^\.cfi.*/; + } + print $_,"\n"; } close STDOUT; --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -1,8 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0 OR MIT /* - * Poly1305 authenticator algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. */ #include @@ -13,279 +11,170 @@ #include #include #include +#include #include -asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, - const u32 *r, unsigned int blocks); -asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, - unsigned int blocks, const u32 *u); -asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, - unsigned int blocks, const u32 *u); +asmlinkage void poly1305_init_x86_64(void *ctx, + const u8 key[POLY1305_KEY_SIZE]); +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); -static inline u64 mlt(u64 a, u64 b) -{ - return a * b; -} - -static inline u32 sr(u64 v, u_char n) -{ - return v >> n; -} - -static inline u32 and(u32 v, u32 mask) -{ - return v & mask; -} - -static void poly1305_simd_mult(u32 *a, const u32 *b) -{ - u8 m[POLY1305_BLOCK_SIZE]; - - memset(m, 0, sizeof(m)); - /* The poly1305 block function adds a hi-bit to the accumulator which - * we don't need for key multiplication; compensate for it. */ - a[4] -= 1 << 24; - poly1305_block_sse2(a, m, b, 1); -} - -static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key) -{ - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff; - key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03; - key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff; - key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff; - key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff; -} +struct poly1305_arch_internal { + union { + struct { + u32 h[5]; + u32 is_base2_26; + }; + u64 hs[3]; + }; + u64 r[2]; + u64 pad; + struct { u32 r2, r1, r4, r3; } rn[9]; +}; -static void poly1305_integer_blocks(struct poly1305_state *state, - const struct poly1305_key *key, - const void *src, - unsigned int nblocks, u32 hibit) +/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit + * the unfortunate situation of using AVX and then having to go back to scalar + * -- because the user is silly and has called the update function from two + * separate contexts -- then we need to convert back to the original base before + * proceeding. It is possible to reason that the initial reduction below is + * sufficient given the implementation invariants. However, for an avoidance of + * doubt and because this is not performance critical, we do the full reduction + * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py + */ +static void convert_to_base2_64(void *ctx) { - u32 r0, r1, r2, r3, r4; - u32 s1, s2, s3, s4; - u32 h0, h1, h2, h3, h4; - u64 d0, d1, d2, d3, d4; + struct poly1305_arch_internal *state = ctx; + u32 cy; - if (!nblocks) + if (!state->is_base2_26) return; - r0 = key->r[0]; - r1 = key->r[1]; - r2 = key->r[2]; - r3 = key->r[3]; - r4 = key->r[4]; - - s1 = r1 * 5; - s2 = r2 * 5; - s3 = r3 * 5; - s4 = r4 * 5; - - h0 = state->h[0]; - h1 = state->h[1]; - h2 = state->h[2]; - h3 = state->h[3]; - h4 = state->h[4]; - - do { - /* h += m[i] */ - h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff; - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff; - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff; - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff; - h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24); - - /* h *= r */ - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) + - mlt(h3, s2) + mlt(h4, s1); - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) + - mlt(h3, s3) + mlt(h4, s2); - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) + - mlt(h3, s4) + mlt(h4, s3); - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) + - mlt(h3, r0) + mlt(h4, s4); - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) + - mlt(h3, r1) + mlt(h4, r0); - - /* (partial) h %= p */ - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff); - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff); - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff); - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff); - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff); - h1 += h0 >> 26; h0 = h0 & 0x3ffffff; - - src += POLY1305_BLOCK_SIZE; - } while (--nblocks); - - state->h[0] = h0; - state->h[1] = h1; - state->h[2] = h2; - state->h[3] = h3; - state->h[4] = h4; -} - -static void poly1305_integer_emit(const struct poly1305_state *state, void *dst) -{ - u32 h0, h1, h2, h3, h4; - u32 g0, g1, g2, g3, g4; - u32 mask; - - /* fully carry h */ - h0 = state->h[0]; - h1 = state->h[1]; - h2 = state->h[2]; - h3 = state->h[3]; - h4 = state->h[4]; - - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff; - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff; - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff; - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff; - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff; - - /* compute h + -p */ - g0 = h0 + 5; - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff; - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff; - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff; - g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff; - - /* select h if h < p, or h + -p if h >= p */ - mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1; - g0 &= mask; - g1 &= mask; - g2 &= mask; - g3 &= mask; - g4 &= mask; - mask = ~mask; - h0 = (h0 & mask) | g0; - h1 = (h1 & mask) | g1; - h2 = (h2 & mask) | g2; - h3 = (h3 & mask) | g3; - h4 = (h4 & mask) | g4; - - /* h = h % (2^128) */ - put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0); - put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4); - put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8); - put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12); -} - -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key) -{ - poly1305_integer_setkey(desc->opaque_r, key); - desc->s[0] = get_unaligned_le32(key + 16); - desc->s[1] = get_unaligned_le32(key + 20); - desc->s[2] = get_unaligned_le32(key + 24); - desc->s[3] = get_unaligned_le32(key + 28); - poly1305_core_init(&desc->h); - desc->buflen = 0; - desc->sset = true; - desc->rset = 1; -} -EXPORT_SYMBOL_GPL(poly1305_init_arch); - -static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen) -{ - if (!dctx->sset) { - if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { - poly1305_integer_setkey(dctx->r, src); - src += POLY1305_BLOCK_SIZE; - srclen -= POLY1305_BLOCK_SIZE; - dctx->rset = 1; - } - if (srclen >= POLY1305_BLOCK_SIZE) { - dctx->s[0] = get_unaligned_le32(src + 0); - dctx->s[1] = get_unaligned_le32(src + 4); - dctx->s[2] = get_unaligned_le32(src + 8); - dctx->s[3] = get_unaligned_le32(src + 12); - src += POLY1305_BLOCK_SIZE; - srclen -= POLY1305_BLOCK_SIZE; - dctx->sset = true; - } + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); + state->hs[2] = state->h[4] >> 24; +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); + state->hs[2] &= 3; + state->hs[0] += cy; + state->hs[1] += (cy = ULT(state->hs[0], cy)); + state->hs[2] += ULT(state->hs[1], cy); +#undef ULT + state->is_base2_26 = 0; +} + +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE]) +{ + poly1305_init_x86_64(ctx, key); +} + +static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, + const u32 padbit) +{ + struct poly1305_arch_internal *state = ctx; + + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || + PAGE_SIZE % POLY1305_BLOCK_SIZE); + + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || + (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || + !crypto_simd_usable()) { + convert_to_base2_64(ctx); + poly1305_blocks_x86_64(ctx, inp, len, padbit); + return; } - return srclen; -} -static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen) -{ - unsigned int datalen; + for (;;) { + const size_t bytes = min_t(size_t, len, PAGE_SIZE); - if (unlikely(!dctx->sset)) { - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); - src += srclen - datalen; - srclen = datalen; - } - if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src, - srclen / POLY1305_BLOCK_SIZE, 1); - srclen %= POLY1305_BLOCK_SIZE; + kernel_fpu_begin(); + if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) + poly1305_blocks_avx512(ctx, inp, bytes, padbit); + else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2)) + poly1305_blocks_avx2(ctx, inp, bytes, padbit); + else + poly1305_blocks_avx(ctx, inp, bytes, padbit); + kernel_fpu_end(); + len -= bytes; + if (!len) + break; + inp += bytes; } - return srclen; } -static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen) -{ - unsigned int blocks, datalen; +static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]) +{ + struct poly1305_arch_internal *state = ctx; + + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || + !state->is_base2_26 || !crypto_simd_usable()) { + convert_to_base2_64(ctx); + poly1305_emit_x86_64(ctx, mac, nonce); + } else + poly1305_emit_avx(ctx, mac, nonce); +} + +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +{ + poly1305_simd_init(&dctx->h, key); + dctx->s[0] = get_unaligned_le32(&key[16]); + dctx->s[1] = get_unaligned_le32(&key[20]); + dctx->s[2] = get_unaligned_le32(&key[24]); + dctx->s[3] = get_unaligned_le32(&key[28]); + dctx->buflen = 0; + dctx->sset = true; +} +EXPORT_SYMBOL(poly1305_init_arch); +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, + const u8 *inp, unsigned int len) +{ + unsigned int acc = 0; if (unlikely(!dctx->sset)) { - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); - src += srclen - datalen; - srclen = datalen; - } - - if (IS_ENABLED(CONFIG_AS_AVX2) && - static_branch_likely(&poly1305_use_avx2) && - srclen >= POLY1305_BLOCK_SIZE * 4) { - if (unlikely(dctx->rset < 4)) { - if (dctx->rset < 2) { - dctx->r[1] = dctx->r[0]; - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); - } - dctx->r[2] = dctx->r[1]; - poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r); - dctx->r[3] = dctx->r[2]; - poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r); - dctx->rset = 4; + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { + poly1305_simd_init(&dctx->h, inp); + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + acc += POLY1305_BLOCK_SIZE; + dctx->rset = 1; } - blocks = srclen / (POLY1305_BLOCK_SIZE * 4); - poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks, - dctx->r[1].r); - src += POLY1305_BLOCK_SIZE * 4 * blocks; - srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; - } - - if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { - if (unlikely(dctx->rset < 2)) { - dctx->r[1] = dctx->r[0]; - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); - dctx->rset = 2; + if (len >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32(&inp[0]); + dctx->s[1] = get_unaligned_le32(&inp[4]); + dctx->s[2] = get_unaligned_le32(&inp[8]); + dctx->s[3] = get_unaligned_le32(&inp[12]); + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + acc += POLY1305_BLOCK_SIZE; + dctx->sset = true; } - blocks = srclen / (POLY1305_BLOCK_SIZE * 2); - poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r, - blocks, dctx->r[1].r); - src += POLY1305_BLOCK_SIZE * 2 * blocks; - srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; - } - if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1); - srclen -= POLY1305_BLOCK_SIZE; } - return srclen; + return acc; } void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { - unsigned int bytes; + unsigned int bytes, used; if (unlikely(dctx->buflen)) { bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); @@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly130 dctx->buflen += bytes; if (dctx->buflen == POLY1305_BLOCK_SIZE) { - if (static_branch_likely(&poly1305_use_simd) && - likely(crypto_simd_usable())) { - kernel_fpu_begin(); - poly1305_simd_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE); - kernel_fpu_end(); - } else { - poly1305_scalar_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE); - } + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); dctx->buflen = 0; } } if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - if (static_branch_likely(&poly1305_use_simd) && - likely(crypto_simd_usable())) { - kernel_fpu_begin(); - bytes = poly1305_simd_blocks(dctx, src, srclen); - kernel_fpu_end(); - } else { - bytes = poly1305_scalar_blocks(dctx, src, srclen); - } - src += srclen - bytes; - srclen = bytes; + bytes = round_down(srclen, POLY1305_BLOCK_SIZE); + srclen -= bytes; + used = crypto_poly1305_setdctxkey(dctx, src, bytes); + if (likely(bytes - used)) + poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); + src += bytes; } if (unlikely(srclen)) { @@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly130 } EXPORT_SYMBOL(poly1305_update_arch); -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst) +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) { - __le32 digest[4]; - u64 f = 0; - - if (unlikely(desc->buflen)) { - desc->buf[desc->buflen++] = 1; - memset(desc->buf + desc->buflen, 0, - POLY1305_BLOCK_SIZE - desc->buflen); - poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0); + if (unlikely(dctx->buflen)) { + dctx->buf[dctx->buflen++] = 1; + memset(dctx->buf + dctx->buflen, 0, + POLY1305_BLOCK_SIZE - dctx->buflen); + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); } - poly1305_integer_emit(&desc->h, digest); - - /* mac = (h + s) % (2^128) */ - f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0]; - put_unaligned_le32(f, dst + 0); - f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1]; - put_unaligned_le32(f, dst + 4); - f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2]; - put_unaligned_le32(f, dst + 8); - f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3]; - put_unaligned_le32(f, dst + 12); - - *desc = (struct poly1305_desc_ctx){}; + poly1305_simd_emit(&dctx->h, dst, dctx->s); + *dctx = (struct poly1305_desc_ctx){}; } EXPORT_SYMBOL(poly1305_final_arch); @@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct s { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - poly1305_core_init(&dctx->h); - dctx->buflen = 0; - dctx->rset = 0; - dctx->sset = false; - + *dctx = (struct poly1305_desc_ctx){}; return 0; } -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) +static int crypto_poly1305_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - if (unlikely(!dctx->sset)) - return -ENOKEY; - - poly1305_final_arch(dctx, dst); + poly1305_update_arch(dctx, src, srclen); return 0; } -static int poly1305_simd_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - poly1305_update_arch(dctx, src, srclen); + if (unlikely(!dctx->sset)) + return -ENOKEY; + + poly1305_final_arch(dctx, dst); return 0; } static struct shash_alg alg = { .digestsize = POLY1305_DIGEST_SIZE, .init = crypto_poly1305_init, - .update = poly1305_simd_update, + .update = crypto_poly1305_update, .final = crypto_poly1305_final, .descsize = sizeof(struct poly1305_desc_ctx), .base = { @@ -406,17 +265,19 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { - if (!boot_cpu_has(X86_FEATURE_XMM2)) - return 0; - - static_branch_enable(&poly1305_use_simd); - - if (IS_ENABLED(CONFIG_AS_AVX2) && - boot_cpu_has(X86_FEATURE_AVX) && + if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx); + if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) static_branch_enable(&poly1305_use_avx2); - + if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) + static_branch_enable(&poly1305_use_avx512); return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; } @@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init); module_exit(poly1305_simd_mod_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); +MODULE_AUTHOR("Jason A. Donenfeld "); MODULE_DESCRIPTION("Poly1305 authenticator"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-simd"); --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -90,7 +90,7 @@ config CRYPTO_LIB_DES config CRYPTO_LIB_POLY1305_RSIZE int default 2 if MIPS - default 4 if X86_64 + default 11 if X86_64 default 9 if ARM || ARM64 default 1