Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch')
-rw-r--r-- target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch | 1058
1 file changed, 1058 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch b/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch
new file mode 100644
index 0000000000..d84726b616
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch
@@ -0,0 +1,1058 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:38 +0100
+Subject: [PATCH] crypto: arm/curve25519 - wire up NEON implementation
+
+commit d8f1308a025fc7e00414194ed742d5f05a21e13c upstream.
+
+This ports the SUPERCOP implementation for usage in kernel space. In
+addition to the usual header, macro, and style changes required for
+kernel space, it makes a few small changes to the code:
+
+ - The stack alignment is relaxed to 16 bytes.
+ - Superfluous mov statements have been removed.
+ - ldr for constants has been replaced with movw.
+ - ldreq has been replaced with moveq.
+ - The str epilogue has been made more idiomatic.
+ - SIMD registers are not pushed and popped at the beginning and end.
+ - The prologue and epilogue have been made idiomatic.
+ - A hole has been removed from the stack, saving 32 bytes.
+ - We write back the base register whenever possible for vld1.8.
+ - Some multiplications have been reordered for better A7 performance.
+
+There are more opportunities for cleanup, since this code is from qhasm,
+which doesn't always do the most opportune thing. But even prior to
+extensive hand optimizations, this code delivers significant performance
+improvements (given in get_cycles() per call):
+
+ ------------ ----------- -------------
+|            | generic C | this commit |
+ ------------ ----------- -------------
+| Cortex-A7  |     49136 |       22395 |
+ ------------ ----------- -------------
+| Cortex-A17 |     17326 |        4983 |
+ ------------ ----------- -------------
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+[ardb: - move to arch/arm/crypto
+ - wire into lib/crypto framework
+ - implement crypto API KPP hooks ]
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/Kconfig | 6 +
+ arch/arm/crypto/Makefile | 2 +
+ arch/arm/crypto/curve25519-core.S | 347 +++++++++++++-----------------
+ arch/arm/crypto/curve25519-glue.c | 127 +++++++++++
+ 4 files changed, 287 insertions(+), 195 deletions(-)
+ create mode 100644 arch/arm/crypto/curve25519-glue.c
+
+--- a/arch/arm/crypto/Kconfig
++++ b/arch/arm/crypto/Kconfig
+@@ -141,4 +141,10 @@ config CRYPTO_NHPOLY1305_NEON
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_NHPOLY1305
+
++config CRYPTO_CURVE25519_NEON
++ tristate "NEON accelerated Curve25519 scalar multiplication library"
++ depends on KERNEL_MODE_NEON
++ select CRYPTO_LIB_CURVE25519_GENERIC
++ select CRYPTO_ARCH_HAVE_LIB_CURVE25519
++
+ endif
+--- a/arch/arm/crypto/Makefile
++++ b/arch/arm/crypto/Makefile
+@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha51
+ obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+ obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
+ obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
++obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
+
+ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
+ ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
+@@ -58,6 +59,7 @@ chacha-neon-y := chacha-scalar-core.o ch
+ chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
+ poly1305-arm-y := poly1305-core.o poly1305-glue.o
+ nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
++curve25519-neon-y := curve25519-core.o curve25519-glue.o
+
+ ifdef REGENERATE_ARM_CRYPTO
+ quiet_cmd_perl = PERL $@
+--- a/arch/arm/crypto/curve25519-core.S
++++ b/arch/arm/crypto/curve25519-core.S
+@@ -1,43 +1,35 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+ /*
+- * Public domain code from Daniel J. Bernstein and Peter Schwabe, from
+- * SUPERCOP's curve25519/neon2/scalarmult.s.
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
++ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
++ * manually reworked for use in kernel space.
+ */
+
+-.fpu neon
++#include <linux/linkage.h>
++
+ .text
++.fpu neon
++.arch armv7-a
+ .align 4
+-.global _crypto_scalarmult_curve25519_neon2
+-.global crypto_scalarmult_curve25519_neon2
+-.type _crypto_scalarmult_curve25519_neon2 STT_FUNC
+-.type crypto_scalarmult_curve25519_neon2 STT_FUNC
+- _crypto_scalarmult_curve25519_neon2:
+- crypto_scalarmult_curve25519_neon2:
+- vpush {q4, q5, q6, q7}
+- mov r12, sp
+- sub sp, sp, #736
+- and sp, sp, #0xffffffe0
+- strd r4, [sp, #0]
+- strd r6, [sp, #8]
+- strd r8, [sp, #16]
+- strd r10, [sp, #24]
+- str r12, [sp, #480]
+- str r14, [sp, #484]
+- mov r0, r0
+- mov r1, r1
+- mov r2, r2
+- add r3, sp, #32
+- ldr r4, =0
+- ldr r5, =254
++
++ENTRY(curve25519_neon)
++ push {r4-r11, lr}
++ mov ip, sp
++ sub r3, sp, #704
++ and r3, r3, #0xfffffff0
++ mov sp, r3
++ movw r4, #0
++ movw r5, #254
+ vmov.i32 q0, #1
+ vshr.u64 q1, q0, #7
+ vshr.u64 q0, q0, #8
+ vmov.i32 d4, #19
+ vmov.i32 d5, #38
+- add r6, sp, #512
+- vst1.8 {d2-d3}, [r6, : 128]
+- add r6, sp, #528
+- vst1.8 {d0-d1}, [r6, : 128]
+- add r6, sp, #544
++ add r6, sp, #480
++ vst1.8 {d2-d3}, [r6, : 128]!
++ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d4-d5}, [r6, : 128]
+ add r6, r3, #0
+ vmov.i32 q2, #0
+@@ -45,12 +37,12 @@
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 d4, [r6, : 64]
+ add r6, r3, #0
+- ldr r7, =960
++ movw r7, #960
+ sub r7, r7, #2
+ neg r7, r7
+ sub r7, r7, r7, LSL #7
+ str r7, [r6]
+- add r6, sp, #704
++ add r6, sp, #672
+ vld1.8 {d4-d5}, [r1]!
+ vld1.8 {d6-d7}, [r1]
+ vst1.8 {d4-d5}, [r6, : 128]!
+@@ -212,15 +204,15 @@
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 d4, [r6, : 64]
+-._mainloop:
++.Lmainloop:
+ mov r2, r5, LSR #3
+ and r6, r5, #7
+ ldrb r2, [r1, r2]
+ mov r2, r2, LSR r6
+ and r2, r2, #1
+- str r5, [sp, #488]
++ str r5, [sp, #456]
+ eor r4, r4, r2
+- str r2, [sp, #492]
++ str r2, [sp, #460]
+ neg r2, r4
+ add r4, r3, #96
+ add r5, r3, #192
+@@ -291,7 +283,7 @@
+ vsub.i32 q0, q1, q3
+ vst1.8 d4, [r4, : 64]
+ vst1.8 d0, [r6, : 64]
+- add r2, sp, #544
++ add r2, sp, #512
+ add r4, r3, #96
+ add r5, r3, #144
+ vld1.8 {d0-d1}, [r2, : 128]
+@@ -361,14 +353,13 @@
+ vmlal.s32 q0, d12, d8
+ vmlal.s32 q0, d13, d17
+ vmlal.s32 q0, d6, d6
+- add r2, sp, #512
+- vld1.8 {d18-d19}, [r2, : 128]
++ add r2, sp, #480
++ vld1.8 {d18-d19}, [r2, : 128]!
+ vmull.s32 q3, d16, d7
+ vmlal.s32 q3, d10, d15
+ vmlal.s32 q3, d11, d14
+ vmlal.s32 q3, d12, d9
+ vmlal.s32 q3, d13, d8
+- add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vadd.i64 q5, q12, q9
+ vadd.i64 q6, q15, q9
+@@ -502,22 +493,19 @@
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+- add r2, sp, #560
++ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+- vst1.8 {d12-d13}, [r2, : 128]
++ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q6, q13, #1
+- add r2, sp, #576
+- vst1.8 {d20-d21}, [r2, : 128]
++ vst1.8 {d20-d21}, [r2, : 128]!
+ vshl.i32 q10, q14, #1
+- add r2, sp, #592
+- vst1.8 {d12-d13}, [r2, : 128]
++ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+- add r2, sp, #608
+- vst1.8 {d16-d17}, [r2, : 128]
++ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+@@ -528,8 +516,7 @@
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+- add r2, sp, #624
+- vst1.8 {d14-d15}, [r2, : 128]
++ vst1.8 {d14-d15}, [r2, : 128]!
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+@@ -537,8 +524,7 @@
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+- add r2, sp, #640
+- vst1.8 {d20-d21}, [r2, : 128]
++ vst1.8 {d20-d21}, [r2, : 128]!
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+@@ -547,14 +533,12 @@
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+- add r2, sp, #656
+- vst1.8 {d10-d11}, [r2, : 128]
++ vst1.8 {d10-d11}, [r2, : 128]!
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+- add r2, sp, #672
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+@@ -566,7 +550,7 @@
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+- add r2, sp, #576
++ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+@@ -575,32 +559,30 @@
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+- add r2, sp, #576
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+- add r2, sp, #656
++ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+- add r2, sp, #624
++ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+- add r2, sp, #608
++ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+- add r2, sp, #608
+ vst1.8 {d8-d9}, [r2, : 128]
+- add r2, sp, #560
++ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+@@ -621,36 +603,36 @@
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+- add r2, sp, #640
++ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+- vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d18, d6
+- vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d18, d21
+- vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d18, d28
+- vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d18, d29
++ vmlal.s32 q2, d19, d6
++ vmlal.s32 q5, d19, d21
++ vmlal.s32 q1, d19, d29
++ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d19, d28
+- add r2, sp, #592
++ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+- add r2, sp, #512
++ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+- add r2, sp, #528
++ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+- add r2, sp, #576
++ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+- add r2, sp, #672
++ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+- add r2, sp, #608
++ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+@@ -823,22 +805,19 @@
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+- add r2, sp, #560
++ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+- vst1.8 {d12-d13}, [r2, : 128]
++ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q6, q13, #1
+- add r2, sp, #576
+- vst1.8 {d20-d21}, [r2, : 128]
++ vst1.8 {d20-d21}, [r2, : 128]!
+ vshl.i32 q10, q14, #1
+- add r2, sp, #592
+- vst1.8 {d12-d13}, [r2, : 128]
++ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+- add r2, sp, #608
+- vst1.8 {d16-d17}, [r2, : 128]
++ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+@@ -849,8 +828,7 @@
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+- add r2, sp, #624
+- vst1.8 {d14-d15}, [r2, : 128]
++ vst1.8 {d14-d15}, [r2, : 128]!
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+@@ -858,8 +836,7 @@
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+- add r2, sp, #640
+- vst1.8 {d20-d21}, [r2, : 128]
++ vst1.8 {d20-d21}, [r2, : 128]!
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+@@ -868,15 +845,13 @@
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+- add r2, sp, #656
+- vst1.8 {d10-d11}, [r2, : 128]
++ vst1.8 {d10-d11}, [r2, : 128]!
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+- add r2, sp, #672
+- vst1.8 {d16-d17}, [r2, : 128]
++ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+@@ -887,7 +862,7 @@
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+- add r2, sp, #576
++ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+@@ -896,32 +871,30 @@
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+- add r2, sp, #576
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+- add r2, sp, #656
++ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+- add r2, sp, #624
++ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+- add r2, sp, #608
++ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+- add r2, sp, #608
+ vst1.8 {d8-d9}, [r2, : 128]
+- add r2, sp, #560
++ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+@@ -942,36 +915,36 @@
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+- add r2, sp, #640
++ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+- vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d18, d6
+- vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d18, d21
+- vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d18, d28
+- vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d18, d29
++ vmlal.s32 q2, d19, d6
++ vmlal.s32 q5, d19, d21
++ vmlal.s32 q1, d19, d29
++ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d19, d28
+- add r2, sp, #592
++ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+- add r2, sp, #512
++ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+- add r2, sp, #528
++ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+- add r2, sp, #576
++ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+- add r2, sp, #672
++ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+- add r2, sp, #608
++ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+@@ -1069,7 +1042,7 @@
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+- add r2, sp, #544
++ add r2, sp, #512
+ add r4, r3, #144
+ add r5, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]
+@@ -1139,14 +1112,13 @@
+ vmlal.s32 q0, d12, d8
+ vmlal.s32 q0, d13, d17
+ vmlal.s32 q0, d6, d6
+- add r2, sp, #512
+- vld1.8 {d18-d19}, [r2, : 128]
++ add r2, sp, #480
++ vld1.8 {d18-d19}, [r2, : 128]!
+ vmull.s32 q3, d16, d7
+ vmlal.s32 q3, d10, d15
+ vmlal.s32 q3, d11, d14
+ vmlal.s32 q3, d12, d9
+ vmlal.s32 q3, d13, d8
+- add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vadd.i64 q5, q12, q9
+ vadd.i64 q6, q15, q9
+@@ -1295,22 +1267,19 @@
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+- add r2, sp, #560
++ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+- vst1.8 {d12-d13}, [r2, : 128]
++ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q6, q13, #1
+- add r2, sp, #576
+- vst1.8 {d20-d21}, [r2, : 128]
++ vst1.8 {d20-d21}, [r2, : 128]!
+ vshl.i32 q10, q14, #1
+- add r2, sp, #592
+- vst1.8 {d12-d13}, [r2, : 128]
++ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+- add r2, sp, #608
+- vst1.8 {d16-d17}, [r2, : 128]
++ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+@@ -1321,8 +1290,7 @@
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+- add r2, sp, #624
+- vst1.8 {d14-d15}, [r2, : 128]
++ vst1.8 {d14-d15}, [r2, : 128]!
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+@@ -1330,8 +1298,7 @@
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+- add r2, sp, #640
+- vst1.8 {d20-d21}, [r2, : 128]
++ vst1.8 {d20-d21}, [r2, : 128]!
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+@@ -1340,15 +1307,13 @@
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+- add r2, sp, #656
+- vst1.8 {d10-d11}, [r2, : 128]
++ vst1.8 {d10-d11}, [r2, : 128]!
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+- add r2, sp, #672
+- vst1.8 {d16-d17}, [r2, : 128]
++ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+@@ -1359,7 +1324,7 @@
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+- add r2, sp, #576
++ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+@@ -1368,32 +1333,30 @@
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+- add r2, sp, #576
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+- add r2, sp, #656
++ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+- add r2, sp, #624
++ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+- add r2, sp, #608
++ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+- add r2, sp, #608
+ vst1.8 {d8-d9}, [r2, : 128]
+- add r2, sp, #560
++ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+@@ -1414,36 +1377,36 @@
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+- add r2, sp, #640
++ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+- vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d18, d6
+- vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d18, d21
+- vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d18, d28
+- vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d18, d29
++ vmlal.s32 q2, d19, d6
++ vmlal.s32 q5, d19, d21
++ vmlal.s32 q1, d19, d29
++ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d19, d28
+- add r2, sp, #592
++ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+- add r2, sp, #512
++ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+- add r2, sp, #528
++ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+- add r2, sp, #576
++ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+- add r2, sp, #672
++ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+- add r2, sp, #608
++ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+@@ -1541,10 +1504,10 @@
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+- ldr r2, [sp, #488]
+- ldr r4, [sp, #492]
++ ldr r2, [sp, #456]
++ ldr r4, [sp, #460]
+ subs r5, r2, #1
+- bge ._mainloop
++ bge .Lmainloop
+ add r1, r3, #144
+ add r2, r3, #336
+ vld1.8 {d0-d1}, [r1, : 128]!
+@@ -1553,41 +1516,41 @@
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 d4, [r2, : 64]
+- ldr r1, =0
+-._invertloop:
++ movw r1, #0
++.Linvertloop:
+ add r2, r3, #144
+- ldr r4, =0
+- ldr r5, =2
++ movw r4, #0
++ movw r5, #2
+ cmp r1, #1
+- ldreq r5, =1
++ moveq r5, #1
+ addeq r2, r3, #336
+ addeq r4, r3, #48
+ cmp r1, #2
+- ldreq r5, =1
++ moveq r5, #1
+ addeq r2, r3, #48
+ cmp r1, #3
+- ldreq r5, =5
++ moveq r5, #5
+ addeq r4, r3, #336
+ cmp r1, #4
+- ldreq r5, =10
++ moveq r5, #10
+ cmp r1, #5
+- ldreq r5, =20
++ moveq r5, #20
+ cmp r1, #6
+- ldreq r5, =10
++ moveq r5, #10
+ addeq r2, r3, #336
+ addeq r4, r3, #336
+ cmp r1, #7
+- ldreq r5, =50
++ moveq r5, #50
+ cmp r1, #8
+- ldreq r5, =100
++ moveq r5, #100
+ cmp r1, #9
+- ldreq r5, =50
++ moveq r5, #50
+ addeq r2, r3, #336
+ cmp r1, #10
+- ldreq r5, =5
++ moveq r5, #5
+ addeq r2, r3, #48
+ cmp r1, #11
+- ldreq r5, =0
++ moveq r5, #0
+ addeq r2, r3, #96
+ add r6, r3, #144
+ add r7, r3, #288
+@@ -1598,8 +1561,8 @@
+ vst1.8 {d2-d3}, [r7, : 128]!
+ vst1.8 d4, [r7, : 64]
+ cmp r5, #0
+- beq ._skipsquaringloop
+-._squaringloop:
++ beq .Lskipsquaringloop
++.Lsquaringloop:
+ add r6, r3, #288
+ add r7, r3, #288
+ add r8, r3, #288
+@@ -1611,7 +1574,7 @@
+ vld1.8 {d6-d7}, [r7, : 128]!
+ vld1.8 {d9}, [r7, : 64]
+ vld1.8 {d10-d11}, [r6, : 128]!
+- add r7, sp, #416
++ add r7, sp, #384
+ vld1.8 {d12-d13}, [r6, : 128]!
+ vmul.i32 q7, q2, q0
+ vld1.8 {d8}, [r6, : 64]
+@@ -1726,7 +1689,7 @@
+ vext.32 d10, d6, d6, #0
+ vmov.i32 q1, #0xffffffff
+ vshl.i64 q4, q1, #25
+- add r7, sp, #512
++ add r7, sp, #480
+ vld1.8 {d14-d15}, [r7, : 128]
+ vadd.i64 q9, q2, q7
+ vshl.i64 q1, q1, #26
+@@ -1735,7 +1698,7 @@
+ vadd.i64 q5, q5, q10
+ vand q9, q9, q1
+ vld1.8 {d16}, [r6, : 64]!
+- add r6, sp, #528
++ add r6, sp, #496
+ vld1.8 {d20-d21}, [r6, : 128]
+ vadd.i64 q11, q5, q10
+ vsub.i64 q2, q2, q9
+@@ -1789,8 +1752,8 @@
+ sub r6, r6, #32
+ vst1.8 d4, [r6, : 64]
+ subs r5, r5, #1
+- bhi ._squaringloop
+-._skipsquaringloop:
++ bhi .Lsquaringloop
++.Lskipsquaringloop:
+ mov r2, r2
+ add r5, r3, #288
+ add r6, r3, #144
+@@ -1802,7 +1765,7 @@
+ vld1.8 {d6-d7}, [r5, : 128]!
+ vld1.8 {d9}, [r5, : 64]
+ vld1.8 {d10-d11}, [r2, : 128]!
+- add r5, sp, #416
++ add r5, sp, #384
+ vld1.8 {d12-d13}, [r2, : 128]!
+ vmul.i32 q7, q2, q0
+ vld1.8 {d8}, [r2, : 64]
+@@ -1917,7 +1880,7 @@
+ vext.32 d10, d6, d6, #0
+ vmov.i32 q1, #0xffffffff
+ vshl.i64 q4, q1, #25
+- add r5, sp, #512
++ add r5, sp, #480
+ vld1.8 {d14-d15}, [r5, : 128]
+ vadd.i64 q9, q2, q7
+ vshl.i64 q1, q1, #26
+@@ -1926,7 +1889,7 @@
+ vadd.i64 q5, q5, q10
+ vand q9, q9, q1
+ vld1.8 {d16}, [r2, : 64]!
+- add r2, sp, #528
++ add r2, sp, #496
+ vld1.8 {d20-d21}, [r2, : 128]
+ vadd.i64 q11, q5, q10
+ vsub.i64 q2, q2, q9
+@@ -1980,7 +1943,7 @@
+ sub r2, r2, #32
+ vst1.8 d4, [r2, : 64]
+ cmp r4, #0
+- beq ._skippostcopy
++ beq .Lskippostcopy
+ add r2, r3, #144
+ mov r4, r4
+ vld1.8 {d0-d1}, [r2, : 128]!
+@@ -1989,9 +1952,9 @@
+ vst1.8 {d0-d1}, [r4, : 128]!
+ vst1.8 {d2-d3}, [r4, : 128]!
+ vst1.8 d4, [r4, : 64]
+-._skippostcopy:
++.Lskippostcopy:
+ cmp r1, #1
+- bne ._skipfinalcopy
++ bne .Lskipfinalcopy
+ add r2, r3, #288
+ add r4, r3, #144
+ vld1.8 {d0-d1}, [r2, : 128]!
+@@ -2000,10 +1963,10 @@
+ vst1.8 {d0-d1}, [r4, : 128]!
+ vst1.8 {d2-d3}, [r4, : 128]!
+ vst1.8 d4, [r4, : 64]
+-._skipfinalcopy:
++.Lskipfinalcopy:
+ add r1, r1, #1
+ cmp r1, #12
+- blo ._invertloop
++ blo .Linvertloop
+ add r1, r3, #144
+ ldr r2, [r1], #4
+ ldr r3, [r1], #4
+@@ -2085,21 +2048,15 @@
+ add r8, r8, r10, LSL #12
+ mov r9, r10, LSR #20
+ add r1, r9, r1, LSL #6
+- str r2, [r0], #4
+- str r3, [r0], #4
+- str r4, [r0], #4
+- str r5, [r0], #4
+- str r6, [r0], #4
+- str r7, [r0], #4
+- str r8, [r0], #4
+- str r1, [r0]
+- ldrd r4, [sp, #0]
+- ldrd r6, [sp, #8]
+- ldrd r8, [sp, #16]
+- ldrd r10, [sp, #24]
+- ldr r12, [sp, #480]
+- ldr r14, [sp, #484]
+- ldr r0, =0
+- mov sp, r12
+- vpop {q4, q5, q6, q7}
+- bx lr
++ str r2, [r0]
++ str r3, [r0, #4]
++ str r4, [r0, #8]
++ str r5, [r0, #12]
++ str r6, [r0, #16]
++ str r7, [r0, #20]
++ str r8, [r0, #24]
++ str r1, [r0, #28]
++ movw r0, #0
++ mov sp, ip
++ pop {r4-r11, pc}
++ENDPROC(curve25519_neon)
+--- /dev/null
++++ b/arch/arm/crypto/curve25519-glue.c
+@@ -0,0 +1,127 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
++ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
++ * manually reworked for use in kernel space.
++ */
++
++#include <asm/hwcap.h>
++#include <asm/neon.h>
++#include <asm/simd.h>
++#include <crypto/internal/kpp.h>
++#include <crypto/internal/simd.h>
++#include <linux/types.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/jump_label.h>
++#include <crypto/curve25519.h>
++
++asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
++ const u8 secret[CURVE25519_KEY_SIZE],
++ const u8 basepoint[CURVE25519_KEY_SIZE]);
++
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
++
++void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
++ const u8 scalar[CURVE25519_KEY_SIZE],
++ const u8 point[CURVE25519_KEY_SIZE])
++{
++ if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
++ kernel_neon_begin();
++ curve25519_neon(out, scalar, point);
++ kernel_neon_end();
++ } else {
++ curve25519_generic(out, scalar, point);
++ }
++}
++EXPORT_SYMBOL(curve25519_arch);
++
++static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
++ unsigned int len)
++{
++ u8 *secret = kpp_tfm_ctx(tfm);
++
++ if (!len)
++ curve25519_generate_secret(secret);
++ else if (len == CURVE25519_KEY_SIZE &&
++ crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
++ memcpy(secret, buf, CURVE25519_KEY_SIZE);
++ else
++ return -EINVAL;
++ return 0;
++}
++
++static int curve25519_compute_value(struct kpp_request *req)
++{
++ struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
++ const u8 *secret = kpp_tfm_ctx(tfm);
++ u8 public_key[CURVE25519_KEY_SIZE];
++ u8 buf[CURVE25519_KEY_SIZE];
++ int copied, nbytes;
++ u8 const *bp;
++
++ if (req->src) {
++ copied = sg_copy_to_buffer(req->src,
++ sg_nents_for_len(req->src,
++ CURVE25519_KEY_SIZE),
++ public_key, CURVE25519_KEY_SIZE);
++ if (copied != CURVE25519_KEY_SIZE)
++ return -EINVAL;
++ bp = public_key;
++ } else {
++ bp = curve25519_base_point;
++ }
++
++ curve25519_arch(buf, secret, bp);
++
++ /* might want less than we've got */
++ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
++ copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
++ nbytes),
++ buf, nbytes);
++ if (copied != nbytes)
++ return -EINVAL;
++ return 0;
++}
++
++static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
++{
++ return CURVE25519_KEY_SIZE;
++}
++
++static struct kpp_alg curve25519_alg = {
++ .base.cra_name = "curve25519",
++ .base.cra_driver_name = "curve25519-neon",
++ .base.cra_priority = 200,
++ .base.cra_module = THIS_MODULE,
++ .base.cra_ctxsize = CURVE25519_KEY_SIZE,
++
++ .set_secret = curve25519_set_secret,
++ .generate_public_key = curve25519_compute_value,
++ .compute_shared_secret = curve25519_compute_value,
++ .max_size = curve25519_max_size,
++};
++
++static int __init mod_init(void)
++{
++ if (elf_hwcap & HWCAP_NEON) {
++ static_branch_enable(&have_neon);
++ return crypto_register_kpp(&curve25519_alg);
++ }
++ return 0;
++}
++
++static void __exit mod_exit(void)
++{
++ if (elf_hwcap & HWCAP_NEON)
++ crypto_unregister_kpp(&curve25519_alg);
++}
++
++module_init(mod_init);
++module_exit(mod_exit);
++
++MODULE_ALIAS_CRYPTO("curve25519");
++MODULE_ALIAS_CRYPTO("curve25519-neon");
++MODULE_LICENSE("GPL v2");
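
For readers tracing how this NEON routine actually gets used: the lib/crypto wiring noted in Ard's sign-off means in-kernel callers (such as WireGuard) do not invoke curve25519_neon() or even curve25519_arch() directly, but go through the curve25519() helper declared in crypto/curve25519.h, which dispatches to the arch implementation when CRYPTO_ARCH_HAVE_LIB_CURVE25519 is selected and to curve25519_generic() otherwise. A minimal caller-side sketch of that flow follows; example_shared_secret() is a hypothetical illustration, not part of the patch, and the helper's exact spelling may differ slightly in this 5.4 backport:

	#include <linux/errno.h>
	#include <linux/types.h>
	#include <crypto/curve25519.h>

	/* Hypothetical example: derive a shared secret through the library
	 * interface. On ARM with NEON, curve25519() ends up in the
	 * curve25519_arch() glue above, which wraps the assembly in
	 * kernel_neon_begin()/kernel_neon_end(). */
	static int example_shared_secret(u8 ss[CURVE25519_KEY_SIZE],
					 const u8 our_secret[CURVE25519_KEY_SIZE],
					 const u8 their_public[CURVE25519_KEY_SIZE])
	{
		/* curve25519() returns false when the output is the all-zero
		 * point, which callers must treat as a failed exchange. */
		if (!curve25519(ss, our_secret, their_public))
			return -EINVAL;
		return 0;
	}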