Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch')
-rw-r--r-- | target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch | 1058
1 file changed, 1058 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch b/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch
new file mode 100644
index 0000000000..14a75e10eb
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch
@@ -0,0 +1,1058 @@
+From ec96c25c1ce09c78e44bd4627bc0a3e610b7f5d8 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:38 +0100
+Subject: [PATCH 031/124] crypto: arm/curve25519 - wire up NEON implementation
+
+commit d8f1308a025fc7e00414194ed742d5f05a21e13c upstream.
+
+This ports the SUPERCOP implementation for usage in kernel space. In
+addition to the usual header, macro, and style changes required for
+kernel space, it makes a few small changes to the code:
+
+  - The stack alignment is relaxed to 16 bytes.
+  - Superfluous mov statements have been removed.
+  - ldr for constants has been replaced with movw.
+  - ldreq has been replaced with moveq.
+  - The str epilogue has been made more idiomatic.
+  - SIMD registers are not pushed and popped at the beginning and end.
+  - The prologue and epilogue have been made idiomatic.
+  - A hole has been removed from the stack, saving 32 bytes.
+  - We write-back the base register whenever possible for vld1.8.
+  - Some multiplications have been reordered for better A7 performance.
+
+There are more opportunities for cleanup, since this code is from qhasm,
+which doesn't always do the most opportune thing. But even prior to
+extensive hand optimizations, this code delivers significant performance
+improvements (given in get_cycles() per call):
+
+               ----------- -------------
+              | generic C | this commit |
+ ------------ ----------- -------------
+ | Cortex-A7  |     49136 |       22395 |
+ ------------ ----------- -------------
+ | Cortex-A17 |     17326 |        4983 |
+ ------------ ----------- -------------
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+[ardb: - move to arch/arm/crypto
+       - wire into lib/crypto framework
+       - implement crypto API KPP hooks ]
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/Kconfig           |   6 +
+ arch/arm/crypto/Makefile          |   2 +
+ arch/arm/crypto/curve25519-core.S | 347 +++++++++++++-----------------
+ arch/arm/crypto/curve25519-glue.c | 127 +++++++++++
+ 4 files changed, 287 insertions(+), 195 deletions(-)
+ create mode 100644 arch/arm/crypto/curve25519-glue.c
+
+--- a/arch/arm/crypto/Kconfig
++++ b/arch/arm/crypto/Kconfig
+@@ -141,4 +141,10 @@ config CRYPTO_NHPOLY1305_NEON
+ 	depends on KERNEL_MODE_NEON
+ 	select CRYPTO_NHPOLY1305
+ 
++config CRYPTO_CURVE25519_NEON
++	tristate "NEON accelerated Curve25519 scalar multiplication library"
++	depends on KERNEL_MODE_NEON
++	select CRYPTO_LIB_CURVE25519_GENERIC
++	select CRYPTO_ARCH_HAVE_LIB_CURVE25519
++
+ endif
+--- a/arch/arm/crypto/Makefile
++++ b/arch/arm/crypto/Makefile
+@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha51
+ obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+ obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
+ obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
++obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
+ 
+ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
+ ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
+@@ -58,6 +59,7 @@ chacha-neon-y := chacha-scalar-core.o ch
+ chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
+ poly1305-arm-y := poly1305-core.o poly1305-glue.o
+ nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
++curve25519-neon-y := curve25519-core.o curve25519-glue.o
+ 
+ ifdef REGENERATE_ARM_CRYPTO
+ quiet_cmd_perl = PERL $@
+--- a/arch/arm/crypto/curve25519-core.S
++++ b/arch/arm/crypto/curve25519-core.S
+@@ -1,43 +1,35 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+ /*
+- * Public domain code from Daniel J. Bernstein and Peter Schwabe, from
+- * SUPERCOP's curve25519/neon2/scalarmult.s.
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
++ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
++ * manually reworked for use in kernel space.
+  */
+ 
+-.fpu neon
++#include <linux/linkage.h>
++
+ .text
++.fpu neon
++.arch armv7-a
+ .align 4
+-.global _crypto_scalarmult_curve25519_neon2
+-.global crypto_scalarmult_curve25519_neon2
+-.type _crypto_scalarmult_curve25519_neon2 STT_FUNC
+-.type crypto_scalarmult_curve25519_neon2 STT_FUNC
+- _crypto_scalarmult_curve25519_neon2:
+- crypto_scalarmult_curve25519_neon2:
+-	vpush {q4, q5, q6, q7}
+-	mov r12, sp
+-	sub sp, sp, #736
+-	and sp, sp, #0xffffffe0
+-	strd r4, [sp, #0]
+-	strd r6, [sp, #8]
+-	strd r8, [sp, #16]
+-	strd r10, [sp, #24]
+-	str r12, [sp, #480]
+-	str r14, [sp, #484]
+-	mov r0, r0
+-	mov r1, r1
+-	mov r2, r2
+-	add r3, sp, #32
+-	ldr r4, =0
+-	ldr r5, =254
++
++ENTRY(curve25519_neon)
++	push {r4-r11, lr}
++	mov ip, sp
++	sub r3, sp, #704
++	and r3, r3, #0xfffffff0
++	mov sp, r3
++	movw r4, #0
++	movw r5, #254
+ 	vmov.i32 q0, #1
+ 	vshr.u64 q1, q0, #7
+ 	vshr.u64 q0, q0, #8
+ 	vmov.i32 d4, #19
+ 	vmov.i32 d5, #38
+-	add r6, sp, #512
+-	vst1.8 {d2-d3}, [r6, : 128]
+-	add r6, sp, #528
+-	vst1.8 {d0-d1}, [r6, : 128]
+-	add r6, sp, #544
++	add r6, sp, #480
++	vst1.8 {d2-d3}, [r6, : 128]!
++	vst1.8 {d0-d1}, [r6, : 128]!
+ 	vst1.8 {d4-d5}, [r6, : 128]
+ 	add r6, r3, #0
+ 	vmov.i32 q2, #0
+@@ -45,12 +37,12 @@
+ 	vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 d4, [r6, : 64] + add r6, r3, #0 +- ldr r7, =960 ++ movw r7, #960 + sub r7, r7, #2 + neg r7, r7 + sub r7, r7, r7, LSL #7 + str r7, [r6] +- add r6, sp, #704 ++ add r6, sp, #672 + vld1.8 {d4-d5}, [r1]! + vld1.8 {d6-d7}, [r1] + vst1.8 {d4-d5}, [r6, : 128]! +@@ -212,15 +204,15 @@ + vst1.8 {d0-d1}, [r6, : 128]! + vst1.8 {d2-d3}, [r6, : 128]! + vst1.8 d4, [r6, : 64] +-._mainloop: ++.Lmainloop: + mov r2, r5, LSR #3 + and r6, r5, #7 + ldrb r2, [r1, r2] + mov r2, r2, LSR r6 + and r2, r2, #1 +- str r5, [sp, #488] ++ str r5, [sp, #456] + eor r4, r4, r2 +- str r2, [sp, #492] ++ str r2, [sp, #460] + neg r2, r4 + add r4, r3, #96 + add r5, r3, #192 +@@ -291,7 +283,7 @@ + vsub.i32 q0, q1, q3 + vst1.8 d4, [r4, : 64] + vst1.8 d0, [r6, : 64] +- add r2, sp, #544 ++ add r2, sp, #512 + add r4, r3, #96 + add r5, r3, #144 + vld1.8 {d0-d1}, [r2, : 128] +@@ -361,14 +353,13 @@ + vmlal.s32 q0, d12, d8 + vmlal.s32 q0, d13, d17 + vmlal.s32 q0, d6, d6 +- add r2, sp, #512 +- vld1.8 {d18-d19}, [r2, : 128] ++ add r2, sp, #480 ++ vld1.8 {d18-d19}, [r2, : 128]! + vmull.s32 q3, d16, d7 + vmlal.s32 q3, d10, d15 + vmlal.s32 q3, d11, d14 + vmlal.s32 q3, d12, d9 + vmlal.s32 q3, d13, d8 +- add r2, sp, #528 + vld1.8 {d8-d9}, [r2, : 128] + vadd.i64 q5, q12, q9 + vadd.i64 q6, q15, q9 +@@ -502,22 +493,19 @@ + vadd.i32 q5, q5, q0 + vtrn.32 q11, q14 + vadd.i32 q6, q6, q3 +- add r2, sp, #560 ++ add r2, sp, #528 + vadd.i32 q10, q10, q2 + vtrn.32 d24, d25 +- vst1.8 {d12-d13}, [r2, : 128] ++ vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q6, q13, #1 +- add r2, sp, #576 +- vst1.8 {d20-d21}, [r2, : 128] ++ vst1.8 {d20-d21}, [r2, : 128]! + vshl.i32 q10, q14, #1 +- add r2, sp, #592 +- vst1.8 {d12-d13}, [r2, : 128] ++ vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q15, q12, #1 + vadd.i32 q8, q8, q4 + vext.32 d10, d31, d30, #0 + vadd.i32 q7, q7, q1 +- add r2, sp, #608 +- vst1.8 {d16-d17}, [r2, : 128] ++ vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q8, d18, d5 + vmlal.s32 q8, d26, d4 + vmlal.s32 q8, d19, d9 +@@ -528,8 +516,7 @@ + vmlal.s32 q8, d29, d1 + vmlal.s32 q8, d24, d6 + vmlal.s32 q8, d25, d0 +- add r2, sp, #624 +- vst1.8 {d14-d15}, [r2, : 128] ++ vst1.8 {d14-d15}, [r2, : 128]! + vmull.s32 q2, d18, d4 + vmlal.s32 q2, d12, d9 + vmlal.s32 q2, d13, d8 +@@ -537,8 +524,7 @@ + vmlal.s32 q2, d22, d2 + vmlal.s32 q2, d23, d1 + vmlal.s32 q2, d24, d0 +- add r2, sp, #640 +- vst1.8 {d20-d21}, [r2, : 128] ++ vst1.8 {d20-d21}, [r2, : 128]! + vmull.s32 q7, d18, d9 + vmlal.s32 q7, d26, d3 + vmlal.s32 q7, d19, d8 +@@ -547,14 +533,12 @@ + vmlal.s32 q7, d28, d1 + vmlal.s32 q7, d23, d6 + vmlal.s32 q7, d29, d0 +- add r2, sp, #656 +- vst1.8 {d10-d11}, [r2, : 128] ++ vst1.8 {d10-d11}, [r2, : 128]! 
+ vmull.s32 q5, d18, d3 + vmlal.s32 q5, d19, d2 + vmlal.s32 q5, d22, d1 + vmlal.s32 q5, d23, d0 + vmlal.s32 q5, d12, d8 +- add r2, sp, #672 + vst1.8 {d16-d17}, [r2, : 128] + vmull.s32 q4, d18, d8 + vmlal.s32 q4, d26, d2 +@@ -566,7 +550,7 @@ + vmlal.s32 q8, d26, d1 + vmlal.s32 q8, d19, d6 + vmlal.s32 q8, d27, d0 +- add r2, sp, #576 ++ add r2, sp, #544 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q7, d24, d21 + vmlal.s32 q7, d25, d20 +@@ -575,32 +559,30 @@ + vmlal.s32 q8, d22, d21 + vmlal.s32 q8, d28, d20 + vmlal.s32 q5, d24, d20 +- add r2, sp, #576 + vst1.8 {d14-d15}, [r2, : 128] + vmull.s32 q7, d18, d6 + vmlal.s32 q7, d26, d0 +- add r2, sp, #656 ++ add r2, sp, #624 + vld1.8 {d30-d31}, [r2, : 128] + vmlal.s32 q2, d30, d21 + vmlal.s32 q7, d19, d21 + vmlal.s32 q7, d27, d20 +- add r2, sp, #624 ++ add r2, sp, #592 + vld1.8 {d26-d27}, [r2, : 128] + vmlal.s32 q4, d25, d27 + vmlal.s32 q8, d29, d27 + vmlal.s32 q8, d25, d26 + vmlal.s32 q7, d28, d27 + vmlal.s32 q7, d29, d26 +- add r2, sp, #608 ++ add r2, sp, #576 + vld1.8 {d28-d29}, [r2, : 128] + vmlal.s32 q4, d24, d29 + vmlal.s32 q8, d23, d29 + vmlal.s32 q8, d24, d28 + vmlal.s32 q7, d22, d29 + vmlal.s32 q7, d23, d28 +- add r2, sp, #608 + vst1.8 {d8-d9}, [r2, : 128] +- add r2, sp, #560 ++ add r2, sp, #528 + vld1.8 {d8-d9}, [r2, : 128] + vmlal.s32 q7, d24, d9 + vmlal.s32 q7, d25, d31 +@@ -621,36 +603,36 @@ + vmlal.s32 q0, d23, d26 + vmlal.s32 q0, d24, d31 + vmlal.s32 q0, d19, d20 +- add r2, sp, #640 ++ add r2, sp, #608 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q2, d18, d7 +- vmlal.s32 q2, d19, d6 + vmlal.s32 q5, d18, d6 +- vmlal.s32 q5, d19, d21 + vmlal.s32 q1, d18, d21 +- vmlal.s32 q1, d19, d29 + vmlal.s32 q0, d18, d28 +- vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d18, d29 ++ vmlal.s32 q2, d19, d6 ++ vmlal.s32 q5, d19, d21 ++ vmlal.s32 q1, d19, d29 ++ vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d19, d28 +- add r2, sp, #592 ++ add r2, sp, #560 + vld1.8 {d18-d19}, [r2, : 128] +- add r2, sp, #512 ++ add r2, sp, #480 + vld1.8 {d22-d23}, [r2, : 128] + vmlal.s32 q5, d19, d7 + vmlal.s32 q0, d18, d21 + vmlal.s32 q0, d19, d29 + vmlal.s32 q6, d18, d6 +- add r2, sp, #528 ++ add r2, sp, #496 + vld1.8 {d6-d7}, [r2, : 128] + vmlal.s32 q6, d19, d21 +- add r2, sp, #576 ++ add r2, sp, #544 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q0, d30, d8 +- add r2, sp, #672 ++ add r2, sp, #640 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q5, d30, d29 +- add r2, sp, #608 ++ add r2, sp, #576 + vld1.8 {d24-d25}, [r2, : 128] + vmlal.s32 q1, d30, d28 + vadd.i64 q13, q0, q11 +@@ -823,22 +805,19 @@ + vadd.i32 q5, q5, q0 + vtrn.32 q11, q14 + vadd.i32 q6, q6, q3 +- add r2, sp, #560 ++ add r2, sp, #528 + vadd.i32 q10, q10, q2 + vtrn.32 d24, d25 +- vst1.8 {d12-d13}, [r2, : 128] ++ vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q6, q13, #1 +- add r2, sp, #576 +- vst1.8 {d20-d21}, [r2, : 128] ++ vst1.8 {d20-d21}, [r2, : 128]! + vshl.i32 q10, q14, #1 +- add r2, sp, #592 +- vst1.8 {d12-d13}, [r2, : 128] ++ vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q15, q12, #1 + vadd.i32 q8, q8, q4 + vext.32 d10, d31, d30, #0 + vadd.i32 q7, q7, q1 +- add r2, sp, #608 +- vst1.8 {d16-d17}, [r2, : 128] ++ vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q8, d18, d5 + vmlal.s32 q8, d26, d4 + vmlal.s32 q8, d19, d9 +@@ -849,8 +828,7 @@ + vmlal.s32 q8, d29, d1 + vmlal.s32 q8, d24, d6 + vmlal.s32 q8, d25, d0 +- add r2, sp, #624 +- vst1.8 {d14-d15}, [r2, : 128] ++ vst1.8 {d14-d15}, [r2, : 128]! 
+ vmull.s32 q2, d18, d4 + vmlal.s32 q2, d12, d9 + vmlal.s32 q2, d13, d8 +@@ -858,8 +836,7 @@ + vmlal.s32 q2, d22, d2 + vmlal.s32 q2, d23, d1 + vmlal.s32 q2, d24, d0 +- add r2, sp, #640 +- vst1.8 {d20-d21}, [r2, : 128] ++ vst1.8 {d20-d21}, [r2, : 128]! + vmull.s32 q7, d18, d9 + vmlal.s32 q7, d26, d3 + vmlal.s32 q7, d19, d8 +@@ -868,15 +845,13 @@ + vmlal.s32 q7, d28, d1 + vmlal.s32 q7, d23, d6 + vmlal.s32 q7, d29, d0 +- add r2, sp, #656 +- vst1.8 {d10-d11}, [r2, : 128] ++ vst1.8 {d10-d11}, [r2, : 128]! + vmull.s32 q5, d18, d3 + vmlal.s32 q5, d19, d2 + vmlal.s32 q5, d22, d1 + vmlal.s32 q5, d23, d0 + vmlal.s32 q5, d12, d8 +- add r2, sp, #672 +- vst1.8 {d16-d17}, [r2, : 128] ++ vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q4, d18, d8 + vmlal.s32 q4, d26, d2 + vmlal.s32 q4, d19, d7 +@@ -887,7 +862,7 @@ + vmlal.s32 q8, d26, d1 + vmlal.s32 q8, d19, d6 + vmlal.s32 q8, d27, d0 +- add r2, sp, #576 ++ add r2, sp, #544 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q7, d24, d21 + vmlal.s32 q7, d25, d20 +@@ -896,32 +871,30 @@ + vmlal.s32 q8, d22, d21 + vmlal.s32 q8, d28, d20 + vmlal.s32 q5, d24, d20 +- add r2, sp, #576 + vst1.8 {d14-d15}, [r2, : 128] + vmull.s32 q7, d18, d6 + vmlal.s32 q7, d26, d0 +- add r2, sp, #656 ++ add r2, sp, #624 + vld1.8 {d30-d31}, [r2, : 128] + vmlal.s32 q2, d30, d21 + vmlal.s32 q7, d19, d21 + vmlal.s32 q7, d27, d20 +- add r2, sp, #624 ++ add r2, sp, #592 + vld1.8 {d26-d27}, [r2, : 128] + vmlal.s32 q4, d25, d27 + vmlal.s32 q8, d29, d27 + vmlal.s32 q8, d25, d26 + vmlal.s32 q7, d28, d27 + vmlal.s32 q7, d29, d26 +- add r2, sp, #608 ++ add r2, sp, #576 + vld1.8 {d28-d29}, [r2, : 128] + vmlal.s32 q4, d24, d29 + vmlal.s32 q8, d23, d29 + vmlal.s32 q8, d24, d28 + vmlal.s32 q7, d22, d29 + vmlal.s32 q7, d23, d28 +- add r2, sp, #608 + vst1.8 {d8-d9}, [r2, : 128] +- add r2, sp, #560 ++ add r2, sp, #528 + vld1.8 {d8-d9}, [r2, : 128] + vmlal.s32 q7, d24, d9 + vmlal.s32 q7, d25, d31 +@@ -942,36 +915,36 @@ + vmlal.s32 q0, d23, d26 + vmlal.s32 q0, d24, d31 + vmlal.s32 q0, d19, d20 +- add r2, sp, #640 ++ add r2, sp, #608 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q2, d18, d7 +- vmlal.s32 q2, d19, d6 + vmlal.s32 q5, d18, d6 +- vmlal.s32 q5, d19, d21 + vmlal.s32 q1, d18, d21 +- vmlal.s32 q1, d19, d29 + vmlal.s32 q0, d18, d28 +- vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d18, d29 ++ vmlal.s32 q2, d19, d6 ++ vmlal.s32 q5, d19, d21 ++ vmlal.s32 q1, d19, d29 ++ vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d19, d28 +- add r2, sp, #592 ++ add r2, sp, #560 + vld1.8 {d18-d19}, [r2, : 128] +- add r2, sp, #512 ++ add r2, sp, #480 + vld1.8 {d22-d23}, [r2, : 128] + vmlal.s32 q5, d19, d7 + vmlal.s32 q0, d18, d21 + vmlal.s32 q0, d19, d29 + vmlal.s32 q6, d18, d6 +- add r2, sp, #528 ++ add r2, sp, #496 + vld1.8 {d6-d7}, [r2, : 128] + vmlal.s32 q6, d19, d21 +- add r2, sp, #576 ++ add r2, sp, #544 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q0, d30, d8 +- add r2, sp, #672 ++ add r2, sp, #640 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q5, d30, d29 +- add r2, sp, #608 ++ add r2, sp, #576 + vld1.8 {d24-d25}, [r2, : 128] + vmlal.s32 q1, d30, d28 + vadd.i64 q13, q0, q11 +@@ -1069,7 +1042,7 @@ + sub r4, r4, #24 + vst1.8 d0, [r2, : 64] + vst1.8 d1, [r4, : 64] +- add r2, sp, #544 ++ add r2, sp, #512 + add r4, r3, #144 + add r5, r3, #192 + vld1.8 {d0-d1}, [r2, : 128] +@@ -1139,14 +1112,13 @@ + vmlal.s32 q0, d12, d8 + vmlal.s32 q0, d13, d17 + vmlal.s32 q0, d6, d6 +- add r2, sp, #512 +- vld1.8 {d18-d19}, [r2, : 128] ++ add r2, sp, #480 ++ vld1.8 {d18-d19}, [r2, : 128]! 
+ vmull.s32 q3, d16, d7 + vmlal.s32 q3, d10, d15 + vmlal.s32 q3, d11, d14 + vmlal.s32 q3, d12, d9 + vmlal.s32 q3, d13, d8 +- add r2, sp, #528 + vld1.8 {d8-d9}, [r2, : 128] + vadd.i64 q5, q12, q9 + vadd.i64 q6, q15, q9 +@@ -1295,22 +1267,19 @@ + vadd.i32 q5, q5, q0 + vtrn.32 q11, q14 + vadd.i32 q6, q6, q3 +- add r2, sp, #560 ++ add r2, sp, #528 + vadd.i32 q10, q10, q2 + vtrn.32 d24, d25 +- vst1.8 {d12-d13}, [r2, : 128] ++ vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q6, q13, #1 +- add r2, sp, #576 +- vst1.8 {d20-d21}, [r2, : 128] ++ vst1.8 {d20-d21}, [r2, : 128]! + vshl.i32 q10, q14, #1 +- add r2, sp, #592 +- vst1.8 {d12-d13}, [r2, : 128] ++ vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q15, q12, #1 + vadd.i32 q8, q8, q4 + vext.32 d10, d31, d30, #0 + vadd.i32 q7, q7, q1 +- add r2, sp, #608 +- vst1.8 {d16-d17}, [r2, : 128] ++ vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q8, d18, d5 + vmlal.s32 q8, d26, d4 + vmlal.s32 q8, d19, d9 +@@ -1321,8 +1290,7 @@ + vmlal.s32 q8, d29, d1 + vmlal.s32 q8, d24, d6 + vmlal.s32 q8, d25, d0 +- add r2, sp, #624 +- vst1.8 {d14-d15}, [r2, : 128] ++ vst1.8 {d14-d15}, [r2, : 128]! + vmull.s32 q2, d18, d4 + vmlal.s32 q2, d12, d9 + vmlal.s32 q2, d13, d8 +@@ -1330,8 +1298,7 @@ + vmlal.s32 q2, d22, d2 + vmlal.s32 q2, d23, d1 + vmlal.s32 q2, d24, d0 +- add r2, sp, #640 +- vst1.8 {d20-d21}, [r2, : 128] ++ vst1.8 {d20-d21}, [r2, : 128]! + vmull.s32 q7, d18, d9 + vmlal.s32 q7, d26, d3 + vmlal.s32 q7, d19, d8 +@@ -1340,15 +1307,13 @@ + vmlal.s32 q7, d28, d1 + vmlal.s32 q7, d23, d6 + vmlal.s32 q7, d29, d0 +- add r2, sp, #656 +- vst1.8 {d10-d11}, [r2, : 128] ++ vst1.8 {d10-d11}, [r2, : 128]! + vmull.s32 q5, d18, d3 + vmlal.s32 q5, d19, d2 + vmlal.s32 q5, d22, d1 + vmlal.s32 q5, d23, d0 + vmlal.s32 q5, d12, d8 +- add r2, sp, #672 +- vst1.8 {d16-d17}, [r2, : 128] ++ vst1.8 {d16-d17}, [r2, : 128]! 
+ vmull.s32 q4, d18, d8 + vmlal.s32 q4, d26, d2 + vmlal.s32 q4, d19, d7 +@@ -1359,7 +1324,7 @@ + vmlal.s32 q8, d26, d1 + vmlal.s32 q8, d19, d6 + vmlal.s32 q8, d27, d0 +- add r2, sp, #576 ++ add r2, sp, #544 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q7, d24, d21 + vmlal.s32 q7, d25, d20 +@@ -1368,32 +1333,30 @@ + vmlal.s32 q8, d22, d21 + vmlal.s32 q8, d28, d20 + vmlal.s32 q5, d24, d20 +- add r2, sp, #576 + vst1.8 {d14-d15}, [r2, : 128] + vmull.s32 q7, d18, d6 + vmlal.s32 q7, d26, d0 +- add r2, sp, #656 ++ add r2, sp, #624 + vld1.8 {d30-d31}, [r2, : 128] + vmlal.s32 q2, d30, d21 + vmlal.s32 q7, d19, d21 + vmlal.s32 q7, d27, d20 +- add r2, sp, #624 ++ add r2, sp, #592 + vld1.8 {d26-d27}, [r2, : 128] + vmlal.s32 q4, d25, d27 + vmlal.s32 q8, d29, d27 + vmlal.s32 q8, d25, d26 + vmlal.s32 q7, d28, d27 + vmlal.s32 q7, d29, d26 +- add r2, sp, #608 ++ add r2, sp, #576 + vld1.8 {d28-d29}, [r2, : 128] + vmlal.s32 q4, d24, d29 + vmlal.s32 q8, d23, d29 + vmlal.s32 q8, d24, d28 + vmlal.s32 q7, d22, d29 + vmlal.s32 q7, d23, d28 +- add r2, sp, #608 + vst1.8 {d8-d9}, [r2, : 128] +- add r2, sp, #560 ++ add r2, sp, #528 + vld1.8 {d8-d9}, [r2, : 128] + vmlal.s32 q7, d24, d9 + vmlal.s32 q7, d25, d31 +@@ -1414,36 +1377,36 @@ + vmlal.s32 q0, d23, d26 + vmlal.s32 q0, d24, d31 + vmlal.s32 q0, d19, d20 +- add r2, sp, #640 ++ add r2, sp, #608 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q2, d18, d7 +- vmlal.s32 q2, d19, d6 + vmlal.s32 q5, d18, d6 +- vmlal.s32 q5, d19, d21 + vmlal.s32 q1, d18, d21 +- vmlal.s32 q1, d19, d29 + vmlal.s32 q0, d18, d28 +- vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d18, d29 ++ vmlal.s32 q2, d19, d6 ++ vmlal.s32 q5, d19, d21 ++ vmlal.s32 q1, d19, d29 ++ vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d19, d28 +- add r2, sp, #592 ++ add r2, sp, #560 + vld1.8 {d18-d19}, [r2, : 128] +- add r2, sp, #512 ++ add r2, sp, #480 + vld1.8 {d22-d23}, [r2, : 128] + vmlal.s32 q5, d19, d7 + vmlal.s32 q0, d18, d21 + vmlal.s32 q0, d19, d29 + vmlal.s32 q6, d18, d6 +- add r2, sp, #528 ++ add r2, sp, #496 + vld1.8 {d6-d7}, [r2, : 128] + vmlal.s32 q6, d19, d21 +- add r2, sp, #576 ++ add r2, sp, #544 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q0, d30, d8 +- add r2, sp, #672 ++ add r2, sp, #640 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q5, d30, d29 +- add r2, sp, #608 ++ add r2, sp, #576 + vld1.8 {d24-d25}, [r2, : 128] + vmlal.s32 q1, d30, d28 + vadd.i64 q13, q0, q11 +@@ -1541,10 +1504,10 @@ + sub r4, r4, #24 + vst1.8 d0, [r2, : 64] + vst1.8 d1, [r4, : 64] +- ldr r2, [sp, #488] +- ldr r4, [sp, #492] ++ ldr r2, [sp, #456] ++ ldr r4, [sp, #460] + subs r5, r2, #1 +- bge ._mainloop ++ bge .Lmainloop + add r1, r3, #144 + add r2, r3, #336 + vld1.8 {d0-d1}, [r1, : 128]! +@@ -1553,41 +1516,41 @@ + vst1.8 {d0-d1}, [r2, : 128]! + vst1.8 {d2-d3}, [r2, : 128]! 
+ vst1.8 d4, [r2, : 64] +- ldr r1, =0 +-._invertloop: ++ movw r1, #0 ++.Linvertloop: + add r2, r3, #144 +- ldr r4, =0 +- ldr r5, =2 ++ movw r4, #0 ++ movw r5, #2 + cmp r1, #1 +- ldreq r5, =1 ++ moveq r5, #1 + addeq r2, r3, #336 + addeq r4, r3, #48 + cmp r1, #2 +- ldreq r5, =1 ++ moveq r5, #1 + addeq r2, r3, #48 + cmp r1, #3 +- ldreq r5, =5 ++ moveq r5, #5 + addeq r4, r3, #336 + cmp r1, #4 +- ldreq r5, =10 ++ moveq r5, #10 + cmp r1, #5 +- ldreq r5, =20 ++ moveq r5, #20 + cmp r1, #6 +- ldreq r5, =10 ++ moveq r5, #10 + addeq r2, r3, #336 + addeq r4, r3, #336 + cmp r1, #7 +- ldreq r5, =50 ++ moveq r5, #50 + cmp r1, #8 +- ldreq r5, =100 ++ moveq r5, #100 + cmp r1, #9 +- ldreq r5, =50 ++ moveq r5, #50 + addeq r2, r3, #336 + cmp r1, #10 +- ldreq r5, =5 ++ moveq r5, #5 + addeq r2, r3, #48 + cmp r1, #11 +- ldreq r5, =0 ++ moveq r5, #0 + addeq r2, r3, #96 + add r6, r3, #144 + add r7, r3, #288 +@@ -1598,8 +1561,8 @@ + vst1.8 {d2-d3}, [r7, : 128]! + vst1.8 d4, [r7, : 64] + cmp r5, #0 +- beq ._skipsquaringloop +-._squaringloop: ++ beq .Lskipsquaringloop ++.Lsquaringloop: + add r6, r3, #288 + add r7, r3, #288 + add r8, r3, #288 +@@ -1611,7 +1574,7 @@ + vld1.8 {d6-d7}, [r7, : 128]! + vld1.8 {d9}, [r7, : 64] + vld1.8 {d10-d11}, [r6, : 128]! +- add r7, sp, #416 ++ add r7, sp, #384 + vld1.8 {d12-d13}, [r6, : 128]! + vmul.i32 q7, q2, q0 + vld1.8 {d8}, [r6, : 64] +@@ -1726,7 +1689,7 @@ + vext.32 d10, d6, d6, #0 + vmov.i32 q1, #0xffffffff + vshl.i64 q4, q1, #25 +- add r7, sp, #512 ++ add r7, sp, #480 + vld1.8 {d14-d15}, [r7, : 128] + vadd.i64 q9, q2, q7 + vshl.i64 q1, q1, #26 +@@ -1735,7 +1698,7 @@ + vadd.i64 q5, q5, q10 + vand q9, q9, q1 + vld1.8 {d16}, [r6, : 64]! +- add r6, sp, #528 ++ add r6, sp, #496 + vld1.8 {d20-d21}, [r6, : 128] + vadd.i64 q11, q5, q10 + vsub.i64 q2, q2, q9 +@@ -1789,8 +1752,8 @@ + sub r6, r6, #32 + vst1.8 d4, [r6, : 64] + subs r5, r5, #1 +- bhi ._squaringloop +-._skipsquaringloop: ++ bhi .Lsquaringloop ++.Lskipsquaringloop: + mov r2, r2 + add r5, r3, #288 + add r6, r3, #144 +@@ -1802,7 +1765,7 @@ + vld1.8 {d6-d7}, [r5, : 128]! + vld1.8 {d9}, [r5, : 64] + vld1.8 {d10-d11}, [r2, : 128]! +- add r5, sp, #416 ++ add r5, sp, #384 + vld1.8 {d12-d13}, [r2, : 128]! + vmul.i32 q7, q2, q0 + vld1.8 {d8}, [r2, : 64] +@@ -1917,7 +1880,7 @@ + vext.32 d10, d6, d6, #0 + vmov.i32 q1, #0xffffffff + vshl.i64 q4, q1, #25 +- add r5, sp, #512 ++ add r5, sp, #480 + vld1.8 {d14-d15}, [r5, : 128] + vadd.i64 q9, q2, q7 + vshl.i64 q1, q1, #26 +@@ -1926,7 +1889,7 @@ + vadd.i64 q5, q5, q10 + vand q9, q9, q1 + vld1.8 {d16}, [r2, : 64]! +- add r2, sp, #528 ++ add r2, sp, #496 + vld1.8 {d20-d21}, [r2, : 128] + vadd.i64 q11, q5, q10 + vsub.i64 q2, q2, q9 +@@ -1980,7 +1943,7 @@ + sub r2, r2, #32 + vst1.8 d4, [r2, : 64] + cmp r4, #0 +- beq ._skippostcopy ++ beq .Lskippostcopy + add r2, r3, #144 + mov r4, r4 + vld1.8 {d0-d1}, [r2, : 128]! +@@ -1989,9 +1952,9 @@ + vst1.8 {d0-d1}, [r4, : 128]! + vst1.8 {d2-d3}, [r4, : 128]! + vst1.8 d4, [r4, : 64] +-._skippostcopy: ++.Lskippostcopy: + cmp r1, #1 +- bne ._skipfinalcopy ++ bne .Lskipfinalcopy + add r2, r3, #288 + add r4, r3, #144 + vld1.8 {d0-d1}, [r2, : 128]! +@@ -2000,10 +1963,10 @@ + vst1.8 {d0-d1}, [r4, : 128]! + vst1.8 {d2-d3}, [r4, : 128]! 
+ 	vst1.8 d4, [r4, : 64]
+-._skipfinalcopy:
++.Lskipfinalcopy:
+ 	add r1, r1, #1
+ 	cmp r1, #12
+-	blo ._invertloop
++	blo .Linvertloop
+ 	add r1, r3, #144
+ 	ldr r2, [r1], #4
+ 	ldr r3, [r1], #4
+@@ -2085,21 +2048,15 @@
+ 	add r8, r8, r10, LSL #12
+ 	mov r9, r10, LSR #20
+ 	add r1, r9, r1, LSL #6
+-	str r2, [r0], #4
+-	str r3, [r0], #4
+-	str r4, [r0], #4
+-	str r5, [r0], #4
+-	str r6, [r0], #4
+-	str r7, [r0], #4
+-	str r8, [r0], #4
+-	str r1, [r0]
+-	ldrd r4, [sp, #0]
+-	ldrd r6, [sp, #8]
+-	ldrd r8, [sp, #16]
+-	ldrd r10, [sp, #24]
+-	ldr r12, [sp, #480]
+-	ldr r14, [sp, #484]
+-	ldr r0, =0
+-	mov sp, r12
+-	vpop {q4, q5, q6, q7}
+-	bx lr
++	str r2, [r0]
++	str r3, [r0, #4]
++	str r4, [r0, #8]
++	str r5, [r0, #12]
++	str r6, [r0, #16]
++	str r7, [r0, #20]
++	str r8, [r0, #24]
++	str r1, [r0, #28]
++	movw r0, #0
++	mov sp, ip
++	pop {r4-r11, pc}
++ENDPROC(curve25519_neon)
+--- /dev/null
++++ b/arch/arm/crypto/curve25519-glue.c
+@@ -0,0 +1,127 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
++ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
++ * manually reworked for use in kernel space.
++ */
++
++#include <asm/hwcap.h>
++#include <asm/neon.h>
++#include <asm/simd.h>
++#include <crypto/internal/kpp.h>
++#include <crypto/internal/simd.h>
++#include <linux/types.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/jump_label.h>
++#include <crypto/curve25519.h>
++
++asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
++				const u8 secret[CURVE25519_KEY_SIZE],
++				const u8 basepoint[CURVE25519_KEY_SIZE]);
++
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
++
++void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
++		     const u8 scalar[CURVE25519_KEY_SIZE],
++		     const u8 point[CURVE25519_KEY_SIZE])
++{
++	if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
++		kernel_neon_begin();
++		curve25519_neon(out, scalar, point);
++		kernel_neon_end();
++	} else {
++		curve25519_generic(out, scalar, point);
++	}
++}
++EXPORT_SYMBOL(curve25519_arch);
++
++static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
++				 unsigned int len)
++{
++	u8 *secret = kpp_tfm_ctx(tfm);
++
++	if (!len)
++		curve25519_generate_secret(secret);
++	else if (len == CURVE25519_KEY_SIZE &&
++		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
++		memcpy(secret, buf, CURVE25519_KEY_SIZE);
++	else
++		return -EINVAL;
++	return 0;
++}
++
++static int curve25519_compute_value(struct kpp_request *req)
++{
++	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
++	const u8 *secret = kpp_tfm_ctx(tfm);
++	u8 public_key[CURVE25519_KEY_SIZE];
++	u8 buf[CURVE25519_KEY_SIZE];
++	int copied, nbytes;
++	u8 const *bp;
++
++	if (req->src) {
++		copied = sg_copy_to_buffer(req->src,
++					   sg_nents_for_len(req->src,
++							    CURVE25519_KEY_SIZE),
++					   public_key, CURVE25519_KEY_SIZE);
++		if (copied != CURVE25519_KEY_SIZE)
++			return -EINVAL;
++		bp = public_key;
++	} else {
++		bp = curve25519_base_point;
++	}
++
++	curve25519_arch(buf, secret, bp);
++
++	/* might want less than we've got */
++	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
++	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
++								nbytes),
++				     buf, nbytes);
++	if (copied != nbytes)
++		return -EINVAL;
++	return 0;
++}
++
++static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
++{
++	return CURVE25519_KEY_SIZE;
++}
++
++static struct kpp_alg curve25519_alg = {
++	.base.cra_name		= "curve25519",
++	.base.cra_driver_name	= "curve25519-neon",
++	.base.cra_priority	= 200,
++	.base.cra_module	= THIS_MODULE,
++	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,
++
++	.set_secret		= curve25519_set_secret,
++	.generate_public_key	= curve25519_compute_value,
++	.compute_shared_secret	= curve25519_compute_value,
++	.max_size		= curve25519_max_size,
++};
++
++static int __init mod_init(void)
++{
++	if (elf_hwcap & HWCAP_NEON) {
++		static_branch_enable(&have_neon);
++		return crypto_register_kpp(&curve25519_alg);
++	}
++	return 0;
++}
++
++static void __exit mod_exit(void)
++{
++	if (elf_hwcap & HWCAP_NEON)
++		crypto_unregister_kpp(&curve25519_alg);
++}
++
++module_init(mod_init);
++module_exit(mod_exit);
++
++MODULE_ALIAS_CRYPTO("curve25519");
++MODULE_ALIAS_CRYPTO("curve25519-neon");
++MODULE_LICENSE("GPL v2");
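
For context on how the glue code above is consumed: the exported curve25519_arch() slots in beneath the generic Curve25519 library interface declared in <crypto/curve25519.h>, so in-kernel users such as WireGuard call the library entry points and are routed to curve25519_neon() automatically. The stand-alone module below is an illustrative sketch, not part of the patch: it assumes only the library helpers backported by this series (curve25519_generate_secret(), curve25519_generate_public() and curve25519()), and the module name and log text are hypothetical.

// SPDX-License-Identifier: GPL-2.0
/*
 * Illustrative only -- NOT part of the patch above. Minimal demo of
 * the lib/crypto Curve25519 interface from <crypto/curve25519.h>;
 * module name and messages are hypothetical.
 */
#include <crypto/curve25519.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/string.h>

static int __init curve25519_demo_init(void)
{
	u8 a_secret[CURVE25519_KEY_SIZE], a_public[CURVE25519_KEY_SIZE];
	u8 b_secret[CURVE25519_KEY_SIZE], b_public[CURVE25519_KEY_SIZE];
	u8 a_shared[CURVE25519_KEY_SIZE], b_shared[CURVE25519_KEY_SIZE];

	/* Random, clamped private keys. */
	curve25519_generate_secret(a_secret);
	curve25519_generate_secret(b_secret);

	/* Public key = secret * base point (9); false means a bad secret. */
	if (!curve25519_generate_public(a_public, a_secret) ||
	    !curve25519_generate_public(b_public, b_secret))
		return -EINVAL;

	/*
	 * Shared secret from each side. curve25519() dispatches to
	 * curve25519_arch() -- and thus curve25519_neon() -- when
	 * CRYPTO_ARCH_HAVE_LIB_CURVE25519 is set, and returns false
	 * for an all-zero result (low-order point).
	 */
	if (!curve25519(a_shared, a_secret, b_public) ||
	    !curve25519(b_shared, b_secret, a_public))
		return -EINVAL;

	/* Demo check only; real code would use crypto_memneq(). */
	pr_info("curve25519 demo: shared secrets %s\n",
		memcmp(a_shared, b_shared, CURVE25519_KEY_SIZE) ?
		"differ (bug)" : "match");
	return 0;
}

static void __exit curve25519_demo_exit(void)
{
}

module_init(curve25519_demo_init);
module_exit(curve25519_demo_exit);
MODULE_LICENSE("GPL");

On a CPU where elf_hwcap reports NEON and CONFIG_CRYPTO_CURVE25519_NEON is enabled, both curve25519() calls above take the kernel_neon_begin()/curve25519_neon()/kernel_neon_end() path; everywhere else they fall back to curve25519_generic(). That transparent fallback is what lets this glue code be backported without touching any of its callers.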