aboutsummaryrefslogtreecommitdiffstats
path: root/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch
diff options
context:
space:
mode:
Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch')
-rw-r--r-- target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch 1058
1 files changed, 0 insertions, 1058 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch b/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch
deleted file mode 100644
index d84726b616..0000000000
--- a/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch
+++ /dev/null
@@ -1,1058 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: "Jason A. Donenfeld" <Jason@zx2c4.com>
-Date: Fri, 8 Nov 2019 13:22:38 +0100
-Subject: [PATCH] crypto: arm/curve25519 - wire up NEON implementation
-
-commit d8f1308a025fc7e00414194ed742d5f05a21e13c upstream.
-
-This ports the SUPERCOP implementation for usage in kernel space. In
-addition to the usual header, macro, and style changes required for
-kernel space, it makes a few small changes to the code:
-
- - The stack alignment is relaxed to 16 bytes.
- - Superfluous mov statements have been removed.
- - ldr for constants has been replaced with movw.
- - ldreq has been replaced with moveq.
- - The str epilogue has been made more idiomatic.
- - SIMD registers are not pushed and popped at the beginning and end.
- - The prologue and epilogue have been made idiomatic.
- - A hole has been removed from the stack, saving 32 bytes.
- - We write-back the base register whenever possible for vld1.8.
- - Some multiplications have been reordered for better A7 performance.
-
-There are more opportunities for cleanup, since this code is from qhasm,
-which doesn't always do the most opportune thing. But even prior to
-extensive hand optimizations, this code delivers significant performance
-improvements (given in get_cycles() per call):
-
-               ----------- -------------
-              | generic C | this commit |
-  ------------ ----------- -------------
- | Cortex-A7  |     49136 |       22395 |
-  ------------ ----------- -------------
- | Cortex-A17 |     17326 |        4983 |
-  ------------ ----------- -------------
-
-Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
-[ardb: - move to arch/arm/crypto
- - wire into lib/crypto framework
- - implement crypto API KPP hooks ]
-Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
-Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
----
- arch/arm/crypto/Kconfig | 6 +
- arch/arm/crypto/Makefile | 2 +
- arch/arm/crypto/curve25519-core.S | 347 +++++++++++++-----------------
- arch/arm/crypto/curve25519-glue.c | 127 +++++++++++
- 4 files changed, 287 insertions(+), 195 deletions(-)
- create mode 100644 arch/arm/crypto/curve25519-glue.c
-
---- a/arch/arm/crypto/Kconfig
-+++ b/arch/arm/crypto/Kconfig
-@@ -141,4 +141,10 @@ config CRYPTO_NHPOLY1305_NEON
- depends on KERNEL_MODE_NEON
- select CRYPTO_NHPOLY1305
-
-+config CRYPTO_CURVE25519_NEON
-+ tristate "NEON accelerated Curve25519 scalar multiplication library"
-+ depends on KERNEL_MODE_NEON
-+ select CRYPTO_LIB_CURVE25519_GENERIC
-+ select CRYPTO_ARCH_HAVE_LIB_CURVE25519
-+
- endif
---- a/arch/arm/crypto/Makefile
-+++ b/arch/arm/crypto/Makefile
-@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha51
- obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
- obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
- obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
-+obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
-
- ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
- ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
-@@ -58,6 +59,7 @@ chacha-neon-y := chacha-scalar-core.o ch
- chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
- poly1305-arm-y := poly1305-core.o poly1305-glue.o
- nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
-+curve25519-neon-y := curve25519-core.o curve25519-glue.o
-
- ifdef REGENERATE_ARM_CRYPTO
- quiet_cmd_perl = PERL $@
---- a/arch/arm/crypto/curve25519-core.S
-+++ b/arch/arm/crypto/curve25519-core.S
-@@ -1,43 +1,35 @@
-+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
- /*
-- * Public domain code from Daniel J. Bernstein and Peter Schwabe, from
-- * SUPERCOP's curve25519/neon2/scalarmult.s.
-+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-+ *
-+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
-+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
-+ * manually reworked for use in kernel space.
- */
-
--.fpu neon
-+#include <linux/linkage.h>
-+
- .text
-+.fpu neon
-+.arch armv7-a
- .align 4
--.global _crypto_scalarmult_curve25519_neon2
--.global crypto_scalarmult_curve25519_neon2
--.type _crypto_scalarmult_curve25519_neon2 STT_FUNC
--.type crypto_scalarmult_curve25519_neon2 STT_FUNC
-- _crypto_scalarmult_curve25519_neon2:
-- crypto_scalarmult_curve25519_neon2:
-- vpush {q4, q5, q6, q7}
-- mov r12, sp
-- sub sp, sp, #736
-- and sp, sp, #0xffffffe0
-- strd r4, [sp, #0]
-- strd r6, [sp, #8]
-- strd r8, [sp, #16]
-- strd r10, [sp, #24]
-- str r12, [sp, #480]
-- str r14, [sp, #484]
-- mov r0, r0
-- mov r1, r1
-- mov r2, r2
-- add r3, sp, #32
-- ldr r4, =0
-- ldr r5, =254
-+
-+ENTRY(curve25519_neon)
-+ push {r4-r11, lr}
-+ mov ip, sp
-+ sub r3, sp, #704
-+ and r3, r3, #0xfffffff0
-+ mov sp, r3
-+ movw r4, #0
-+ movw r5, #254
- vmov.i32 q0, #1
- vshr.u64 q1, q0, #7
- vshr.u64 q0, q0, #8
- vmov.i32 d4, #19
- vmov.i32 d5, #38
-- add r6, sp, #512
-- vst1.8 {d2-d3}, [r6, : 128]
-- add r6, sp, #528
-- vst1.8 {d0-d1}, [r6, : 128]
-- add r6, sp, #544
-+ add r6, sp, #480
-+ vst1.8 {d2-d3}, [r6, : 128]!
-+ vst1.8 {d0-d1}, [r6, : 128]!
- vst1.8 {d4-d5}, [r6, : 128]
- add r6, r3, #0
- vmov.i32 q2, #0
-@@ -45,12 +37,12 @@
- vst1.8 {d4-d5}, [r6, : 128]!
- vst1.8 d4, [r6, : 64]
- add r6, r3, #0
-- ldr r7, =960
-+ movw r7, #960
- sub r7, r7, #2
- neg r7, r7
- sub r7, r7, r7, LSL #7
- str r7, [r6]
-- add r6, sp, #704
-+ add r6, sp, #672
- vld1.8 {d4-d5}, [r1]!
- vld1.8 {d6-d7}, [r1]
- vst1.8 {d4-d5}, [r6, : 128]!
-@@ -212,15 +204,15 @@
- vst1.8 {d0-d1}, [r6, : 128]!
- vst1.8 {d2-d3}, [r6, : 128]!
- vst1.8 d4, [r6, : 64]
--._mainloop:
-+.Lmainloop:
- mov r2, r5, LSR #3
- and r6, r5, #7
- ldrb r2, [r1, r2]
- mov r2, r2, LSR r6
- and r2, r2, #1
-- str r5, [sp, #488]
-+ str r5, [sp, #456]
- eor r4, r4, r2
-- str r2, [sp, #492]
-+ str r2, [sp, #460]
- neg r2, r4
- add r4, r3, #96
- add r5, r3, #192
-@@ -291,7 +283,7 @@
- vsub.i32 q0, q1, q3
- vst1.8 d4, [r4, : 64]
- vst1.8 d0, [r6, : 64]
-- add r2, sp, #544
-+ add r2, sp, #512
- add r4, r3, #96
- add r5, r3, #144
- vld1.8 {d0-d1}, [r2, : 128]
-@@ -361,14 +353,13 @@
- vmlal.s32 q0, d12, d8
- vmlal.s32 q0, d13, d17
- vmlal.s32 q0, d6, d6
-- add r2, sp, #512
-- vld1.8 {d18-d19}, [r2, : 128]
-+ add r2, sp, #480
-+ vld1.8 {d18-d19}, [r2, : 128]!
- vmull.s32 q3, d16, d7
- vmlal.s32 q3, d10, d15
- vmlal.s32 q3, d11, d14
- vmlal.s32 q3, d12, d9
- vmlal.s32 q3, d13, d8
-- add r2, sp, #528
- vld1.8 {d8-d9}, [r2, : 128]
- vadd.i64 q5, q12, q9
- vadd.i64 q6, q15, q9
-@@ -502,22 +493,19 @@
- vadd.i32 q5, q5, q0
- vtrn.32 q11, q14
- vadd.i32 q6, q6, q3
-- add r2, sp, #560
-+ add r2, sp, #528
- vadd.i32 q10, q10, q2
- vtrn.32 d24, d25
-- vst1.8 {d12-d13}, [r2, : 128]
-+ vst1.8 {d12-d13}, [r2, : 128]!
- vshl.i32 q6, q13, #1
-- add r2, sp, #576
-- vst1.8 {d20-d21}, [r2, : 128]
-+ vst1.8 {d20-d21}, [r2, : 128]!
- vshl.i32 q10, q14, #1
-- add r2, sp, #592
-- vst1.8 {d12-d13}, [r2, : 128]
-+ vst1.8 {d12-d13}, [r2, : 128]!
- vshl.i32 q15, q12, #1
- vadd.i32 q8, q8, q4
- vext.32 d10, d31, d30, #0
- vadd.i32 q7, q7, q1
-- add r2, sp, #608
-- vst1.8 {d16-d17}, [r2, : 128]
-+ vst1.8 {d16-d17}, [r2, : 128]!
- vmull.s32 q8, d18, d5
- vmlal.s32 q8, d26, d4
- vmlal.s32 q8, d19, d9
-@@ -528,8 +516,7 @@
- vmlal.s32 q8, d29, d1
- vmlal.s32 q8, d24, d6
- vmlal.s32 q8, d25, d0
-- add r2, sp, #624
-- vst1.8 {d14-d15}, [r2, : 128]
-+ vst1.8 {d14-d15}, [r2, : 128]!
- vmull.s32 q2, d18, d4
- vmlal.s32 q2, d12, d9
- vmlal.s32 q2, d13, d8
-@@ -537,8 +524,7 @@
- vmlal.s32 q2, d22, d2
- vmlal.s32 q2, d23, d1
- vmlal.s32 q2, d24, d0
-- add r2, sp, #640
-- vst1.8 {d20-d21}, [r2, : 128]
-+ vst1.8 {d20-d21}, [r2, : 128]!
- vmull.s32 q7, d18, d9
- vmlal.s32 q7, d26, d3
- vmlal.s32 q7, d19, d8
-@@ -547,14 +533,12 @@
- vmlal.s32 q7, d28, d1
- vmlal.s32 q7, d23, d6
- vmlal.s32 q7, d29, d0
-- add r2, sp, #656
-- vst1.8 {d10-d11}, [r2, : 128]
-+ vst1.8 {d10-d11}, [r2, : 128]!
- vmull.s32 q5, d18, d3
- vmlal.s32 q5, d19, d2
- vmlal.s32 q5, d22, d1
- vmlal.s32 q5, d23, d0
- vmlal.s32 q5, d12, d8
-- add r2, sp, #672
- vst1.8 {d16-d17}, [r2, : 128]
- vmull.s32 q4, d18, d8
- vmlal.s32 q4, d26, d2
-@@ -566,7 +550,7 @@
- vmlal.s32 q8, d26, d1
- vmlal.s32 q8, d19, d6
- vmlal.s32 q8, d27, d0
-- add r2, sp, #576
-+ add r2, sp, #544
- vld1.8 {d20-d21}, [r2, : 128]
- vmlal.s32 q7, d24, d21
- vmlal.s32 q7, d25, d20
-@@ -575,32 +559,30 @@
- vmlal.s32 q8, d22, d21
- vmlal.s32 q8, d28, d20
- vmlal.s32 q5, d24, d20
-- add r2, sp, #576
- vst1.8 {d14-d15}, [r2, : 128]
- vmull.s32 q7, d18, d6
- vmlal.s32 q7, d26, d0
-- add r2, sp, #656
-+ add r2, sp, #624
- vld1.8 {d30-d31}, [r2, : 128]
- vmlal.s32 q2, d30, d21
- vmlal.s32 q7, d19, d21
- vmlal.s32 q7, d27, d20
-- add r2, sp, #624
-+ add r2, sp, #592
- vld1.8 {d26-d27}, [r2, : 128]
- vmlal.s32 q4, d25, d27
- vmlal.s32 q8, d29, d27
- vmlal.s32 q8, d25, d26
- vmlal.s32 q7, d28, d27
- vmlal.s32 q7, d29, d26
-- add r2, sp, #608
-+ add r2, sp, #576
- vld1.8 {d28-d29}, [r2, : 128]
- vmlal.s32 q4, d24, d29
- vmlal.s32 q8, d23, d29
- vmlal.s32 q8, d24, d28
- vmlal.s32 q7, d22, d29
- vmlal.s32 q7, d23, d28
-- add r2, sp, #608
- vst1.8 {d8-d9}, [r2, : 128]
-- add r2, sp, #560
-+ add r2, sp, #528
- vld1.8 {d8-d9}, [r2, : 128]
- vmlal.s32 q7, d24, d9
- vmlal.s32 q7, d25, d31
-@@ -621,36 +603,36 @@
- vmlal.s32 q0, d23, d26
- vmlal.s32 q0, d24, d31
- vmlal.s32 q0, d19, d20
-- add r2, sp, #640
-+ add r2, sp, #608
- vld1.8 {d18-d19}, [r2, : 128]
- vmlal.s32 q2, d18, d7
-- vmlal.s32 q2, d19, d6
- vmlal.s32 q5, d18, d6
-- vmlal.s32 q5, d19, d21
- vmlal.s32 q1, d18, d21
-- vmlal.s32 q1, d19, d29
- vmlal.s32 q0, d18, d28
-- vmlal.s32 q0, d19, d9
- vmlal.s32 q6, d18, d29
-+ vmlal.s32 q2, d19, d6
-+ vmlal.s32 q5, d19, d21
-+ vmlal.s32 q1, d19, d29
-+ vmlal.s32 q0, d19, d9
- vmlal.s32 q6, d19, d28
-- add r2, sp, #592
-+ add r2, sp, #560
- vld1.8 {d18-d19}, [r2, : 128]
-- add r2, sp, #512
-+ add r2, sp, #480
- vld1.8 {d22-d23}, [r2, : 128]
- vmlal.s32 q5, d19, d7
- vmlal.s32 q0, d18, d21
- vmlal.s32 q0, d19, d29
- vmlal.s32 q6, d18, d6
-- add r2, sp, #528
-+ add r2, sp, #496
- vld1.8 {d6-d7}, [r2, : 128]
- vmlal.s32 q6, d19, d21
-- add r2, sp, #576
-+ add r2, sp, #544
- vld1.8 {d18-d19}, [r2, : 128]
- vmlal.s32 q0, d30, d8
-- add r2, sp, #672
-+ add r2, sp, #640
- vld1.8 {d20-d21}, [r2, : 128]
- vmlal.s32 q5, d30, d29
-- add r2, sp, #608
-+ add r2, sp, #576
- vld1.8 {d24-d25}, [r2, : 128]
- vmlal.s32 q1, d30, d28
- vadd.i64 q13, q0, q11
-@@ -823,22 +805,19 @@
- vadd.i32 q5, q5, q0
- vtrn.32 q11, q14
- vadd.i32 q6, q6, q3
-- add r2, sp, #560
-+ add r2, sp, #528
- vadd.i32 q10, q10, q2
- vtrn.32 d24, d25
-- vst1.8 {d12-d13}, [r2, : 128]
-+ vst1.8 {d12-d13}, [r2, : 128]!
- vshl.i32 q6, q13, #1
-- add r2, sp, #576
-- vst1.8 {d20-d21}, [r2, : 128]
-+ vst1.8 {d20-d21}, [r2, : 128]!
- vshl.i32 q10, q14, #1
-- add r2, sp, #592
-- vst1.8 {d12-d13}, [r2, : 128]
-+ vst1.8 {d12-d13}, [r2, : 128]!
- vshl.i32 q15, q12, #1
- vadd.i32 q8, q8, q4
- vext.32 d10, d31, d30, #0
- vadd.i32 q7, q7, q1
-- add r2, sp, #608
-- vst1.8 {d16-d17}, [r2, : 128]
-+ vst1.8 {d16-d17}, [r2, : 128]!
- vmull.s32 q8, d18, d5
- vmlal.s32 q8, d26, d4
- vmlal.s32 q8, d19, d9
-@@ -849,8 +828,7 @@
- vmlal.s32 q8, d29, d1
- vmlal.s32 q8, d24, d6
- vmlal.s32 q8, d25, d0
-- add r2, sp, #624
-- vst1.8 {d14-d15}, [r2, : 128]
-+ vst1.8 {d14-d15}, [r2, : 128]!
- vmull.s32 q2, d18, d4
- vmlal.s32 q2, d12, d9
- vmlal.s32 q2, d13, d8
-@@ -858,8 +836,7 @@
- vmlal.s32 q2, d22, d2
- vmlal.s32 q2, d23, d1
- vmlal.s32 q2, d24, d0
-- add r2, sp, #640
-- vst1.8 {d20-d21}, [r2, : 128]
-+ vst1.8 {d20-d21}, [r2, : 128]!
- vmull.s32 q7, d18, d9
- vmlal.s32 q7, d26, d3
- vmlal.s32 q7, d19, d8
-@@ -868,15 +845,13 @@
- vmlal.s32 q7, d28, d1
- vmlal.s32 q7, d23, d6
- vmlal.s32 q7, d29, d0
-- add r2, sp, #656
-- vst1.8 {d10-d11}, [r2, : 128]
-+ vst1.8 {d10-d11}, [r2, : 128]!
- vmull.s32 q5, d18, d3
- vmlal.s32 q5, d19, d2
- vmlal.s32 q5, d22, d1
- vmlal.s32 q5, d23, d0
- vmlal.s32 q5, d12, d8
-- add r2, sp, #672
-- vst1.8 {d16-d17}, [r2, : 128]
-+ vst1.8 {d16-d17}, [r2, : 128]!
- vmull.s32 q4, d18, d8
- vmlal.s32 q4, d26, d2
- vmlal.s32 q4, d19, d7
-@@ -887,7 +862,7 @@
- vmlal.s32 q8, d26, d1
- vmlal.s32 q8, d19, d6
- vmlal.s32 q8, d27, d0
-- add r2, sp, #576
-+ add r2, sp, #544
- vld1.8 {d20-d21}, [r2, : 128]
- vmlal.s32 q7, d24, d21
- vmlal.s32 q7, d25, d20
-@@ -896,32 +871,30 @@
- vmlal.s32 q8, d22, d21
- vmlal.s32 q8, d28, d20
- vmlal.s32 q5, d24, d20
-- add r2, sp, #576
- vst1.8 {d14-d15}, [r2, : 128]
- vmull.s32 q7, d18, d6
- vmlal.s32 q7, d26, d0
-- add r2, sp, #656
-+ add r2, sp, #624
- vld1.8 {d30-d31}, [r2, : 128]
- vmlal.s32 q2, d30, d21
- vmlal.s32 q7, d19, d21
- vmlal.s32 q7, d27, d20
-- add r2, sp, #624
-+ add r2, sp, #592
- vld1.8 {d26-d27}, [r2, : 128]
- vmlal.s32 q4, d25, d27
- vmlal.s32 q8, d29, d27
- vmlal.s32 q8, d25, d26
- vmlal.s32 q7, d28, d27
- vmlal.s32 q7, d29, d26
-- add r2, sp, #608
-+ add r2, sp, #576
- vld1.8 {d28-d29}, [r2, : 128]
- vmlal.s32 q4, d24, d29
- vmlal.s32 q8, d23, d29
- vmlal.s32 q8, d24, d28
- vmlal.s32 q7, d22, d29
- vmlal.s32 q7, d23, d28
-- add r2, sp, #608
- vst1.8 {d8-d9}, [r2, : 128]
-- add r2, sp, #560
-+ add r2, sp, #528
- vld1.8 {d8-d9}, [r2, : 128]
- vmlal.s32 q7, d24, d9
- vmlal.s32 q7, d25, d31
-@@ -942,36 +915,36 @@
- vmlal.s32 q0, d23, d26
- vmlal.s32 q0, d24, d31
- vmlal.s32 q0, d19, d20
-- add r2, sp, #640
-+ add r2, sp, #608
- vld1.8 {d18-d19}, [r2, : 128]
- vmlal.s32 q2, d18, d7
-- vmlal.s32 q2, d19, d6
- vmlal.s32 q5, d18, d6
-- vmlal.s32 q5, d19, d21
- vmlal.s32 q1, d18, d21
-- vmlal.s32 q1, d19, d29
- vmlal.s32 q0, d18, d28
-- vmlal.s32 q0, d19, d9
- vmlal.s32 q6, d18, d29
-+ vmlal.s32 q2, d19, d6
-+ vmlal.s32 q5, d19, d21
-+ vmlal.s32 q1, d19, d29
-+ vmlal.s32 q0, d19, d9
- vmlal.s32 q6, d19, d28
-- add r2, sp, #592
-+ add r2, sp, #560
- vld1.8 {d18-d19}, [r2, : 128]
-- add r2, sp, #512
-+ add r2, sp, #480
- vld1.8 {d22-d23}, [r2, : 128]
- vmlal.s32 q5, d19, d7
- vmlal.s32 q0, d18, d21
- vmlal.s32 q0, d19, d29
- vmlal.s32 q6, d18, d6
-- add r2, sp, #528
-+ add r2, sp, #496
- vld1.8 {d6-d7}, [r2, : 128]
- vmlal.s32 q6, d19, d21
-- add r2, sp, #576
-+ add r2, sp, #544
- vld1.8 {d18-d19}, [r2, : 128]
- vmlal.s32 q0, d30, d8
-- add r2, sp, #672
-+ add r2, sp, #640
- vld1.8 {d20-d21}, [r2, : 128]
- vmlal.s32 q5, d30, d29
-- add r2, sp, #608
-+ add r2, sp, #576
- vld1.8 {d24-d25}, [r2, : 128]
- vmlal.s32 q1, d30, d28
- vadd.i64 q13, q0, q11
-@@ -1069,7 +1042,7 @@
- sub r4, r4, #24
- vst1.8 d0, [r2, : 64]
- vst1.8 d1, [r4, : 64]
-- add r2, sp, #544
-+ add r2, sp, #512
- add r4, r3, #144
- add r5, r3, #192
- vld1.8 {d0-d1}, [r2, : 128]
-@@ -1139,14 +1112,13 @@
- vmlal.s32 q0, d12, d8
- vmlal.s32 q0, d13, d17
- vmlal.s32 q0, d6, d6
-- add r2, sp, #512
-- vld1.8 {d18-d19}, [r2, : 128]
-+ add r2, sp, #480
-+ vld1.8 {d18-d19}, [r2, : 128]!
- vmull.s32 q3, d16, d7
- vmlal.s32 q3, d10, d15
- vmlal.s32 q3, d11, d14
- vmlal.s32 q3, d12, d9
- vmlal.s32 q3, d13, d8
-- add r2, sp, #528
- vld1.8 {d8-d9}, [r2, : 128]
- vadd.i64 q5, q12, q9
- vadd.i64 q6, q15, q9
-@@ -1295,22 +1267,19 @@
- vadd.i32 q5, q5, q0
- vtrn.32 q11, q14
- vadd.i32 q6, q6, q3
-- add r2, sp, #560
-+ add r2, sp, #528
- vadd.i32 q10, q10, q2
- vtrn.32 d24, d25
-- vst1.8 {d12-d13}, [r2, : 128]
-+ vst1.8 {d12-d13}, [r2, : 128]!
- vshl.i32 q6, q13, #1
-- add r2, sp, #576
-- vst1.8 {d20-d21}, [r2, : 128]
-+ vst1.8 {d20-d21}, [r2, : 128]!
- vshl.i32 q10, q14, #1
-- add r2, sp, #592
-- vst1.8 {d12-d13}, [r2, : 128]
-+ vst1.8 {d12-d13}, [r2, : 128]!
- vshl.i32 q15, q12, #1
- vadd.i32 q8, q8, q4
- vext.32 d10, d31, d30, #0
- vadd.i32 q7, q7, q1
-- add r2, sp, #608
-- vst1.8 {d16-d17}, [r2, : 128]
-+ vst1.8 {d16-d17}, [r2, : 128]!
- vmull.s32 q8, d18, d5
- vmlal.s32 q8, d26, d4
- vmlal.s32 q8, d19, d9
-@@ -1321,8 +1290,7 @@
- vmlal.s32 q8, d29, d1
- vmlal.s32 q8, d24, d6
- vmlal.s32 q8, d25, d0
-- add r2, sp, #624
-- vst1.8 {d14-d15}, [r2, : 128]
-+ vst1.8 {d14-d15}, [r2, : 128]!
- vmull.s32 q2, d18, d4
- vmlal.s32 q2, d12, d9
- vmlal.s32 q2, d13, d8
-@@ -1330,8 +1298,7 @@
- vmlal.s32 q2, d22, d2
- vmlal.s32 q2, d23, d1
- vmlal.s32 q2, d24, d0
-- add r2, sp, #640
-- vst1.8 {d20-d21}, [r2, : 128]
-+ vst1.8 {d20-d21}, [r2, : 128]!
- vmull.s32 q7, d18, d9
- vmlal.s32 q7, d26, d3
- vmlal.s32 q7, d19, d8
-@@ -1340,15 +1307,13 @@
- vmlal.s32 q7, d28, d1
- vmlal.s32 q7, d23, d6
- vmlal.s32 q7, d29, d0
-- add r2, sp, #656
-- vst1.8 {d10-d11}, [r2, : 128]
-+ vst1.8 {d10-d11}, [r2, : 128]!
- vmull.s32 q5, d18, d3
- vmlal.s32 q5, d19, d2
- vmlal.s32 q5, d22, d1
- vmlal.s32 q5, d23, d0
- vmlal.s32 q5, d12, d8
-- add r2, sp, #672
-- vst1.8 {d16-d17}, [r2, : 128]
-+ vst1.8 {d16-d17}, [r2, : 128]!
- vmull.s32 q4, d18, d8
- vmlal.s32 q4, d26, d2
- vmlal.s32 q4, d19, d7
-@@ -1359,7 +1324,7 @@
- vmlal.s32 q8, d26, d1
- vmlal.s32 q8, d19, d6
- vmlal.s32 q8, d27, d0
-- add r2, sp, #576
-+ add r2, sp, #544
- vld1.8 {d20-d21}, [r2, : 128]
- vmlal.s32 q7, d24, d21
- vmlal.s32 q7, d25, d20
-@@ -1368,32 +1333,30 @@
- vmlal.s32 q8, d22, d21
- vmlal.s32 q8, d28, d20
- vmlal.s32 q5, d24, d20
-- add r2, sp, #576
- vst1.8 {d14-d15}, [r2, : 128]
- vmull.s32 q7, d18, d6
- vmlal.s32 q7, d26, d0
-- add r2, sp, #656
-+ add r2, sp, #624
- vld1.8 {d30-d31}, [r2, : 128]
- vmlal.s32 q2, d30, d21
- vmlal.s32 q7, d19, d21
- vmlal.s32 q7, d27, d20
-- add r2, sp, #624
-+ add r2, sp, #592
- vld1.8 {d26-d27}, [r2, : 128]
- vmlal.s32 q4, d25, d27
- vmlal.s32 q8, d29, d27
- vmlal.s32 q8, d25, d26
- vmlal.s32 q7, d28, d27
- vmlal.s32 q7, d29, d26
-- add r2, sp, #608
-+ add r2, sp, #576
- vld1.8 {d28-d29}, [r2, : 128]
- vmlal.s32 q4, d24, d29
- vmlal.s32 q8, d23, d29
- vmlal.s32 q8, d24, d28
- vmlal.s32 q7, d22, d29
- vmlal.s32 q7, d23, d28
-- add r2, sp, #608
- vst1.8 {d8-d9}, [r2, : 128]
-- add r2, sp, #560
-+ add r2, sp, #528
- vld1.8 {d8-d9}, [r2, : 128]
- vmlal.s32 q7, d24, d9
- vmlal.s32 q7, d25, d31
-@@ -1414,36 +1377,36 @@
- vmlal.s32 q0, d23, d26
- vmlal.s32 q0, d24, d31
- vmlal.s32 q0, d19, d20
-- add r2, sp, #640
-+ add r2, sp, #608
- vld1.8 {d18-d19}, [r2, : 128]
- vmlal.s32 q2, d18, d7
-- vmlal.s32 q2, d19, d6
- vmlal.s32 q5, d18, d6
-- vmlal.s32 q5, d19, d21
- vmlal.s32 q1, d18, d21
-- vmlal.s32 q1, d19, d29
- vmlal.s32 q0, d18, d28
-- vmlal.s32 q0, d19, d9
- vmlal.s32 q6, d18, d29
-+ vmlal.s32 q2, d19, d6
-+ vmlal.s32 q5, d19, d21
-+ vmlal.s32 q1, d19, d29
-+ vmlal.s32 q0, d19, d9
- vmlal.s32 q6, d19, d28
-- add r2, sp, #592
-+ add r2, sp, #560
- vld1.8 {d18-d19}, [r2, : 128]
-- add r2, sp, #512
-+ add r2, sp, #480
- vld1.8 {d22-d23}, [r2, : 128]
- vmlal.s32 q5, d19, d7
- vmlal.s32 q0, d18, d21
- vmlal.s32 q0, d19, d29
- vmlal.s32 q6, d18, d6
-- add r2, sp, #528
-+ add r2, sp, #496
- vld1.8 {d6-d7}, [r2, : 128]
- vmlal.s32 q6, d19, d21
-- add r2, sp, #576
-+ add r2, sp, #544
- vld1.8 {d18-d19}, [r2, : 128]
- vmlal.s32 q0, d30, d8
-- add r2, sp, #672
-+ add r2, sp, #640
- vld1.8 {d20-d21}, [r2, : 128]
- vmlal.s32 q5, d30, d29
-- add r2, sp, #608
-+ add r2, sp, #576
- vld1.8 {d24-d25}, [r2, : 128]
- vmlal.s32 q1, d30, d28
- vadd.i64 q13, q0, q11
-@@ -1541,10 +1504,10 @@
- sub r4, r4, #24
- vst1.8 d0, [r2, : 64]
- vst1.8 d1, [r4, : 64]
-- ldr r2, [sp, #488]
-- ldr r4, [sp, #492]
-+ ldr r2, [sp, #456]
-+ ldr r4, [sp, #460]
- subs r5, r2, #1
-- bge ._mainloop
-+ bge .Lmainloop
- add r1, r3, #144
- add r2, r3, #336
- vld1.8 {d0-d1}, [r1, : 128]!
-@@ -1553,41 +1516,41 @@
- vst1.8 {d0-d1}, [r2, : 128]!
- vst1.8 {d2-d3}, [r2, : 128]!
- vst1.8 d4, [r2, : 64]
-- ldr r1, =0
--._invertloop:
-+ movw r1, #0
-+.Linvertloop:
- add r2, r3, #144
-- ldr r4, =0
-- ldr r5, =2
-+ movw r4, #0
-+ movw r5, #2
- cmp r1, #1
-- ldreq r5, =1
-+ moveq r5, #1
- addeq r2, r3, #336
- addeq r4, r3, #48
- cmp r1, #2
-- ldreq r5, =1
-+ moveq r5, #1
- addeq r2, r3, #48
- cmp r1, #3
-- ldreq r5, =5
-+ moveq r5, #5
- addeq r4, r3, #336
- cmp r1, #4
-- ldreq r5, =10
-+ moveq r5, #10
- cmp r1, #5
-- ldreq r5, =20
-+ moveq r5, #20
- cmp r1, #6
-- ldreq r5, =10
-+ moveq r5, #10
- addeq r2, r3, #336
- addeq r4, r3, #336
- cmp r1, #7
-- ldreq r5, =50
-+ moveq r5, #50
- cmp r1, #8
-- ldreq r5, =100
-+ moveq r5, #100
- cmp r1, #9
-- ldreq r5, =50
-+ moveq r5, #50
- addeq r2, r3, #336
- cmp r1, #10
-- ldreq r5, =5
-+ moveq r5, #5
- addeq r2, r3, #48
- cmp r1, #11
-- ldreq r5, =0
-+ moveq r5, #0
- addeq r2, r3, #96
- add r6, r3, #144
- add r7, r3, #288
-@@ -1598,8 +1561,8 @@
- vst1.8 {d2-d3}, [r7, : 128]!
- vst1.8 d4, [r7, : 64]
- cmp r5, #0
-- beq ._skipsquaringloop
--._squaringloop:
-+ beq .Lskipsquaringloop
-+.Lsquaringloop:
- add r6, r3, #288
- add r7, r3, #288
- add r8, r3, #288
-@@ -1611,7 +1574,7 @@
- vld1.8 {d6-d7}, [r7, : 128]!
- vld1.8 {d9}, [r7, : 64]
- vld1.8 {d10-d11}, [r6, : 128]!
-- add r7, sp, #416
-+ add r7, sp, #384
- vld1.8 {d12-d13}, [r6, : 128]!
- vmul.i32 q7, q2, q0
- vld1.8 {d8}, [r6, : 64]
-@@ -1726,7 +1689,7 @@
- vext.32 d10, d6, d6, #0
- vmov.i32 q1, #0xffffffff
- vshl.i64 q4, q1, #25
-- add r7, sp, #512
-+ add r7, sp, #480
- vld1.8 {d14-d15}, [r7, : 128]
- vadd.i64 q9, q2, q7
- vshl.i64 q1, q1, #26
-@@ -1735,7 +1698,7 @@
- vadd.i64 q5, q5, q10
- vand q9, q9, q1
- vld1.8 {d16}, [r6, : 64]!
-- add r6, sp, #528
-+ add r6, sp, #496
- vld1.8 {d20-d21}, [r6, : 128]
- vadd.i64 q11, q5, q10
- vsub.i64 q2, q2, q9
-@@ -1789,8 +1752,8 @@
- sub r6, r6, #32
- vst1.8 d4, [r6, : 64]
- subs r5, r5, #1
-- bhi ._squaringloop
--._skipsquaringloop:
-+ bhi .Lsquaringloop
-+.Lskipsquaringloop:
- mov r2, r2
- add r5, r3, #288
- add r6, r3, #144
-@@ -1802,7 +1765,7 @@
- vld1.8 {d6-d7}, [r5, : 128]!
- vld1.8 {d9}, [r5, : 64]
- vld1.8 {d10-d11}, [r2, : 128]!
-- add r5, sp, #416
-+ add r5, sp, #384
- vld1.8 {d12-d13}, [r2, : 128]!
- vmul.i32 q7, q2, q0
- vld1.8 {d8}, [r2, : 64]
-@@ -1917,7 +1880,7 @@
- vext.32 d10, d6, d6, #0
- vmov.i32 q1, #0xffffffff
- vshl.i64 q4, q1, #25
-- add r5, sp, #512
-+ add r5, sp, #480
- vld1.8 {d14-d15}, [r5, : 128]
- vadd.i64 q9, q2, q7
- vshl.i64 q1, q1, #26
-@@ -1926,7 +1889,7 @@
- vadd.i64 q5, q5, q10
- vand q9, q9, q1
- vld1.8 {d16}, [r2, : 64]!
-- add r2, sp, #528
-+ add r2, sp, #496
- vld1.8 {d20-d21}, [r2, : 128]
- vadd.i64 q11, q5, q10
- vsub.i64 q2, q2, q9
-@@ -1980,7 +1943,7 @@
- sub r2, r2, #32
- vst1.8 d4, [r2, : 64]
- cmp r4, #0
-- beq ._skippostcopy
-+ beq .Lskippostcopy
- add r2, r3, #144
- mov r4, r4
- vld1.8 {d0-d1}, [r2, : 128]!
-@@ -1989,9 +1952,9 @@
- vst1.8 {d0-d1}, [r4, : 128]!
- vst1.8 {d2-d3}, [r4, : 128]!
- vst1.8 d4, [r4, : 64]
--._skippostcopy:
-+.Lskippostcopy:
- cmp r1, #1
-- bne ._skipfinalcopy
-+ bne .Lskipfinalcopy
- add r2, r3, #288
- add r4, r3, #144
- vld1.8 {d0-d1}, [r2, : 128]!
-@@ -2000,10 +1963,10 @@
- vst1.8 {d0-d1}, [r4, : 128]!
- vst1.8 {d2-d3}, [r4, : 128]!
- vst1.8 d4, [r4, : 64]
--._skipfinalcopy:
-+.Lskipfinalcopy:
- add r1, r1, #1
- cmp r1, #12
-- blo ._invertloop
-+ blo .Linvertloop
- add r1, r3, #144
- ldr r2, [r1], #4
- ldr r3, [r1], #4
-@@ -2085,21 +2048,15 @@
- add r8, r8, r10, LSL #12
- mov r9, r10, LSR #20
- add r1, r9, r1, LSL #6
-- str r2, [r0], #4
-- str r3, [r0], #4
-- str r4, [r0], #4
-- str r5, [r0], #4
-- str r6, [r0], #4
-- str r7, [r0], #4
-- str r8, [r0], #4
-- str r1, [r0]
-- ldrd r4, [sp, #0]
-- ldrd r6, [sp, #8]
-- ldrd r8, [sp, #16]
-- ldrd r10, [sp, #24]
-- ldr r12, [sp, #480]
-- ldr r14, [sp, #484]
-- ldr r0, =0
-- mov sp, r12
-- vpop {q4, q5, q6, q7}
-- bx lr
-+ str r2, [r0]
-+ str r3, [r0, #4]
-+ str r4, [r0, #8]
-+ str r5, [r0, #12]
-+ str r6, [r0, #16]
-+ str r7, [r0, #20]
-+ str r8, [r0, #24]
-+ str r1, [r0, #28]
-+ movw r0, #0
-+ mov sp, ip
-+ pop {r4-r11, pc}
-+ENDPROC(curve25519_neon)
---- /dev/null
-+++ b/arch/arm/crypto/curve25519-glue.c
-@@ -0,0 +1,127 @@
-+// SPDX-License-Identifier: GPL-2.0 OR MIT
-+/*
-+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-+ *
-+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
-+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
-+ * manually reworked for use in kernel space.
-+ */
-+
-+#include <asm/hwcap.h>
-+#include <asm/neon.h>
-+#include <asm/simd.h>
-+#include <crypto/internal/kpp.h>
-+#include <crypto/internal/simd.h>
-+#include <linux/types.h>
-+#include <linux/module.h>
-+#include <linux/init.h>
-+#include <linux/jump_label.h>
-+#include <crypto/curve25519.h>
-+
-+asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
-+ const u8 secret[CURVE25519_KEY_SIZE],
-+ const u8 basepoint[CURVE25519_KEY_SIZE]);
-+
-+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-+
-+void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
-+ const u8 scalar[CURVE25519_KEY_SIZE],
-+ const u8 point[CURVE25519_KEY_SIZE])
-+{
-+ if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
-+ kernel_neon_begin();
-+ curve25519_neon(out, scalar, point);
-+ kernel_neon_end();
-+ } else {
-+ curve25519_generic(out, scalar, point);
-+ }
-+}
-+EXPORT_SYMBOL(curve25519_arch);
-+
-+static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
-+ unsigned int len)
-+{
-+ u8 *secret = kpp_tfm_ctx(tfm);
-+
-+ if (!len)
-+ curve25519_generate_secret(secret);
-+ else if (len == CURVE25519_KEY_SIZE &&
-+ crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
-+ memcpy(secret, buf, CURVE25519_KEY_SIZE);
-+ else
-+ return -EINVAL;
-+ return 0;
-+}
-+
-+static int curve25519_compute_value(struct kpp_request *req)
-+{
-+ struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
-+ const u8 *secret = kpp_tfm_ctx(tfm);
-+ u8 public_key[CURVE25519_KEY_SIZE];
-+ u8 buf[CURVE25519_KEY_SIZE];
-+ int copied, nbytes;
-+ u8 const *bp;
-+
-+ if (req->src) {
-+ copied = sg_copy_to_buffer(req->src,
-+ sg_nents_for_len(req->src,
-+ CURVE25519_KEY_SIZE),
-+ public_key, CURVE25519_KEY_SIZE);
-+ if (copied != CURVE25519_KEY_SIZE)
-+ return -EINVAL;
-+ bp = public_key;
-+ } else {
-+ bp = curve25519_base_point;
-+ }
-+
-+ curve25519_arch(buf, secret, bp);
-+
-+ /* might want less than we've got */
-+ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
-+ copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
-+ nbytes),
-+ buf, nbytes);
-+ if (copied != nbytes)
-+ return -EINVAL;
-+ return 0;
-+}
-+
-+static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
-+{
-+ return CURVE25519_KEY_SIZE;
-+}
-+
-+static struct kpp_alg curve25519_alg = {
-+ .base.cra_name = "curve25519",
-+ .base.cra_driver_name = "curve25519-neon",
-+ .base.cra_priority = 200,
-+ .base.cra_module = THIS_MODULE,
-+ .base.cra_ctxsize = CURVE25519_KEY_SIZE,
-+
-+ .set_secret = curve25519_set_secret,
-+ .generate_public_key = curve25519_compute_value,
-+ .compute_shared_secret = curve25519_compute_value,
-+ .max_size = curve25519_max_size,
-+};
-+
-+static int __init mod_init(void)
-+{
-+ if (elf_hwcap & HWCAP_NEON) {
-+ static_branch_enable(&have_neon);
-+ return crypto_register_kpp(&curve25519_alg);
-+ }
-+ return 0;
-+}
-+
-+static void __exit mod_exit(void)
-+{
-+ if (elf_hwcap & HWCAP_NEON)
-+ crypto_unregister_kpp(&curve25519_alg);
-+}
-+
-+module_init(mod_init);
-+module_exit(mod_exit);
-+
-+MODULE_ALIAS_CRYPTO("curve25519");
-+MODULE_ALIAS_CRYPTO("curve25519-neon");
-+MODULE_LICENSE("GPL v2");