path: root/target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch
author     Daniel Golle <daniel@makrotopia.org>	2022-03-21 01:16:48 +0000
committer  Daniel Golle <daniel@makrotopia.org>	2022-03-21 13:11:56 +0000
commit     786bf7fdaca4c75e7eba6e9aa3a8b5775fd21186 (patch)
tree       926fecb2b1f6ce1e42ba7ef4c7aab8e68dfd214c
parent     9470160c350d15f765c33d6c1db15d6c4709a64c (diff)
kernel: delete Linux 5.4 config and patches
As the upcoming release will be based on Linux 5.10 only, remove all
kernel configuration as well as patches for Linux 5.4. There were no
targets still actively using Linux 5.4.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
(cherry picked from commit 3a14580411adfb75f9a44eded9f41245b9e44606)
Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch')
-rw-r--r--  target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch  2776
1 file changed, 0 insertions(+), 2776 deletions(-)
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch b/target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch
deleted file mode 100644
index 367b20fc3a..0000000000
--- a/target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch
+++ /dev/null
@@ -1,2776 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Ard Biesheuvel <ardb@kernel.org>
-Date: Fri, 8 Nov 2019 13:22:25 +0100
-Subject: [PATCH] crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON
- implementation
-
-commit a6b803b3ddc793d6db0c16f12fc12d30d20fa9cc upstream.
-
-This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation
-for NEON authored by Andy Polyakov, and contributed by him to the OpenSSL
-project. The file 'poly1305-armv4.pl' is taken straight from this upstream
-GitHub repository [0] at commit ec55a08dc0244ce570c4fc7cade330c60798952f,
-and already contains all the changes required to build it as part of a
-Linux kernel module.
-
-[0] https://github.com/dot-asm/cryptogams
-
-Co-developed-by: Andy Polyakov <appro@cryptogams.org>
-Signed-off-by: Andy Polyakov <appro@cryptogams.org>
-Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
-Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
----
- arch/arm/crypto/Kconfig | 5 +
- arch/arm/crypto/Makefile | 12 +-
- arch/arm/crypto/poly1305-armv4.pl | 1236 +++++++++++++++++++++++
- arch/arm/crypto/poly1305-core.S_shipped | 1158 +++++++++++++++++++++
- arch/arm/crypto/poly1305-glue.c | 276 +++++
- lib/crypto/Kconfig | 2 +-
- 6 files changed, 2687 insertions(+), 2 deletions(-)
- create mode 100644 arch/arm/crypto/poly1305-armv4.pl
- create mode 100644 arch/arm/crypto/poly1305-core.S_shipped
- create mode 100644 arch/arm/crypto/poly1305-glue.c
-
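The diffstat lists a new glue file, arch/arm/crypto/poly1305-glue.c, whose contents fall outside this excerpt. As a rough sketch of the dispatch pattern such a glue layer follows: poly1305_blocks_arm and poly1305_blocks_neon are the .globl entry points defined in the assembly below, but the wrapper name, the void *state type and the exact argument lists here are illustrative assumptions, not the patch's actual code.

#include <asm/neon.h>
#include <crypto/internal/simd.h>
#include <linux/kernel.h>
#include <linux/linkage.h>
#include <linux/types.h>

/* Entry points provided by poly1305-core.S (real symbols; the argument
 * list shown here is an assumption made for the sake of the sketch). */
asmlinkage void poly1305_blocks_arm(void *state, const u8 *src,
                                    u32 len, u32 hibit);
asmlinkage void poly1305_blocks_neon(void *state, const u8 *src,
                                     u32 len, u32 hibit);

/* Use NEON only when it is built in and the current context may touch
 * SIMD registers; otherwise fall back to the scalar code. */
static void poly1305_do_blocks(void *state, const u8 *src, u32 len,
                               u32 hibit)
{
        if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && crypto_simd_usable()) {
                kernel_neon_begin();
                poly1305_blocks_neon(state, src, len, hibit);
                kernel_neon_end();
        } else {
                poly1305_blocks_arm(state, src, len, hibit);
        }
}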
---- a/arch/arm/crypto/Kconfig
-+++ b/arch/arm/crypto/Kconfig
-@@ -131,6 +131,11 @@ config CRYPTO_CHACHA20_NEON
- select CRYPTO_BLKCIPHER
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-+config CRYPTO_POLY1305_ARM
-+ tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
-+ select CRYPTO_HASH
-+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
-+
- config CRYPTO_NHPOLY1305_NEON
- tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
- depends on KERNEL_MODE_NEON
---- a/arch/arm/crypto/Makefile
-+++ b/arch/arm/crypto/Makefile
-@@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sh
- obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
- obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
- obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-+obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
- obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
-
- ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
-@@ -55,12 +56,16 @@ crct10dif-arm-ce-y := crct10dif-ce-core.
- crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
- chacha-neon-y := chacha-scalar-core.o chacha-glue.o
- chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
-+poly1305-arm-y := poly1305-core.o poly1305-glue.o
- nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
-
- ifdef REGENERATE_ARM_CRYPTO
- quiet_cmd_perl = PERL $@
- cmd_perl = $(PERL) $(<) > $(@)
-
-+$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
-+ $(call cmd,perl)
-+
- $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
- $(call cmd,perl)
-
-@@ -68,4 +73,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha
- $(call cmd,perl)
- endif
-
--clean-files += sha256-core.S sha512-core.S
-+clean-files += poly1305-core.S sha256-core.S sha512-core.S
-+
-+# massage the perlasm code a bit so we only get the NEON routine if we need it
-+poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
-+poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
-+AFLAGS_poly1305-core.o += $(poly1305-aflags-y)
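The AFLAGS override works because the generated assembly maps __ARM_MAX_ARCH__ to __LINUX_ARM_ARCH__ under __KERNEL__ (see the top of poly1305-core.S_shipped below), so redefining that macro on the assembler command line decides whether the NEON half of the file is assembled at all. A minimal sketch of the effect, with the real guards abbreviated:

/* poly1305-core.S as seen by the assembler's preprocessor (sketch) */
#define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__     /* set under __KERNEL__ */

#if __ARM_MAX_ARCH__ >= 7
/* poly1305_init_neon / poly1305_blocks_neon live here: emitted when
 * CONFIG_KERNEL_MODE_NEON forces -D__LINUX_ARM_ARCH__=7, compiled out
 * when a plain CONFIG_CPU_V7 build forces the value 5. */
#endif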
---- /dev/null
-+++ b/arch/arm/crypto/poly1305-armv4.pl
-@@ -0,0 +1,1236 @@
-+#!/usr/bin/env perl
-+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-+#
-+# ====================================================================
-+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-+# project.
-+# ====================================================================
-+#
-+# IALU(*)/gcc-4.4 NEON
-+#
-+# ARM11xx(ARMv6) 7.78/+100% -
-+# Cortex-A5 6.35/+130% 3.00
-+# Cortex-A8 6.25/+115% 2.36
-+# Cortex-A9 5.10/+95% 2.55
-+# Cortex-A15 3.85/+85% 1.25(**)
-+# Snapdragon S4 5.70/+100% 1.48(**)
-+#
-+# (*) this is for -march=armv6, i.e. with a bunch of ldrb instructions loading data;
-+# (**) these are trade-off results; they can be improved by ~8%, but at
-+# the cost of a 15/12% regression on Cortex-A5/A7. It's even possible
-+# to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
-+
-+$flavour = shift;
-+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-+
-+if ($flavour && $flavour ne "void") {
-+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
-+ die "can't locate arm-xlate.pl";
-+
-+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
-+} else {
-+ open STDOUT,">$output";
-+}
-+
-+($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
-+
-+$code.=<<___;
-+#ifndef __KERNEL__
-+# include "arm_arch.h"
-+#else
-+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-+# define poly1305_init poly1305_init_arm
-+# define poly1305_blocks poly1305_blocks_arm
-+# define poly1305_emit poly1305_emit_arm
-+.globl poly1305_blocks_neon
-+#endif
-+
-+#if defined(__thumb2__)
-+.syntax unified
-+.thumb
-+#else
-+.code 32
-+#endif
-+
-+.text
-+
-+.globl poly1305_emit
-+.globl poly1305_blocks
-+.globl poly1305_init
-+.type poly1305_init,%function
-+.align 5
-+poly1305_init:
-+.Lpoly1305_init:
-+ stmdb sp!,{r4-r11}
-+
-+ eor r3,r3,r3
-+ cmp $inp,#0
-+ str r3,[$ctx,#0] @ zero hash value
-+ str r3,[$ctx,#4]
-+ str r3,[$ctx,#8]
-+ str r3,[$ctx,#12]
-+ str r3,[$ctx,#16]
-+ str r3,[$ctx,#36] @ clear is_base2_26
-+ add $ctx,$ctx,#20
-+
-+#ifdef __thumb2__
-+ it eq
-+#endif
-+ moveq r0,#0
-+ beq .Lno_key
-+
-+#if __ARM_MAX_ARCH__>=7
-+ mov r3,#-1
-+ str r3,[$ctx,#28] @ impossible key power value
-+# ifndef __KERNEL__
-+ adr r11,.Lpoly1305_init
-+ ldr r12,.LOPENSSL_armcap
-+# endif
-+#endif
-+ ldrb r4,[$inp,#0]
-+ mov r10,#0x0fffffff
-+ ldrb r5,[$inp,#1]
-+ and r3,r10,#-4 @ 0x0ffffffc
-+ ldrb r6,[$inp,#2]
-+ ldrb r7,[$inp,#3]
-+ orr r4,r4,r5,lsl#8
-+ ldrb r5,[$inp,#4]
-+ orr r4,r4,r6,lsl#16
-+ ldrb r6,[$inp,#5]
-+ orr r4,r4,r7,lsl#24
-+ ldrb r7,[$inp,#6]
-+ and r4,r4,r10
-+
-+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-+# if !defined(_WIN32)
-+ ldr r12,[r11,r12] @ OPENSSL_armcap_P
-+# endif
-+# if defined(__APPLE__) || defined(_WIN32)
-+ ldr r12,[r12]
-+# endif
-+#endif
-+ ldrb r8,[$inp,#7]
-+ orr r5,r5,r6,lsl#8
-+ ldrb r6,[$inp,#8]
-+ orr r5,r5,r7,lsl#16
-+ ldrb r7,[$inp,#9]
-+ orr r5,r5,r8,lsl#24
-+ ldrb r8,[$inp,#10]
-+ and r5,r5,r3
-+
-+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-+ tst r12,#ARMV7_NEON @ check for NEON
-+# ifdef __thumb2__
-+ adr r9,.Lpoly1305_blocks_neon
-+ adr r11,.Lpoly1305_blocks
-+ it ne
-+ movne r11,r9
-+ adr r12,.Lpoly1305_emit
-+ orr r11,r11,#1 @ thumb-ify addresses
-+ orr r12,r12,#1
-+# else
-+ add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
-+ ite eq
-+ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
-+ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
-+# endif
-+#endif
-+ ldrb r9,[$inp,#11]
-+ orr r6,r6,r7,lsl#8
-+ ldrb r7,[$inp,#12]
-+ orr r6,r6,r8,lsl#16
-+ ldrb r8,[$inp,#13]
-+ orr r6,r6,r9,lsl#24
-+ ldrb r9,[$inp,#14]
-+ and r6,r6,r3
-+
-+ ldrb r10,[$inp,#15]
-+ orr r7,r7,r8,lsl#8
-+ str r4,[$ctx,#0]
-+ orr r7,r7,r9,lsl#16
-+ str r5,[$ctx,#4]
-+ orr r7,r7,r10,lsl#24
-+ str r6,[$ctx,#8]
-+ and r7,r7,r3
-+ str r7,[$ctx,#12]
-+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-+ stmia r2,{r11,r12} @ fill functions table
-+ mov r0,#1
-+#else
-+ mov r0,#0
-+#endif
-+.Lno_key:
-+ ldmia sp!,{r4-r11}
-+#if __ARM_ARCH__>=5
-+ ret @ bx lr
-+#else
-+ tst lr,#1
-+ moveq pc,lr @ be binary compatible with V4, yet
-+ bx lr @ interoperable with Thumb ISA:-)
-+#endif
-+.size poly1305_init,.-poly1305_init
-+___
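The ldrb/and sequence in poly1305_init above is the standard Poly1305 key clamp: the first 16 key bytes are read little-endian into four words, the first word is masked with 0x0fffffff (kept in r10) and the remaining three with 0x0ffffffc (r3 = r10 & -4). The same step restated in C as a minimal sketch, assuming a little-endian host:

#include <stdint.h>
#include <string.h>

/* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, word by word */
static void poly1305_clamp(uint32_t r[4], const uint8_t key[16])
{
        uint32_t w[4];

        memcpy(w, key, 16);             /* little-endian load, as the ldrb's do */
        r[0] = w[0] & 0x0fffffff;       /* mirrors: and r4,r4,r10 */
        r[1] = w[1] & 0x0ffffffc;       /* mirrors: and r5,r5,r3  */
        r[2] = w[2] & 0x0ffffffc;
        r[3] = w[3] & 0x0ffffffc;
}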
-+{
-+my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
-+my ($s1,$s2,$s3)=($r1,$r2,$r3);
-+
-+$code.=<<___;
-+.type poly1305_blocks,%function
-+.align 5
-+poly1305_blocks:
-+.Lpoly1305_blocks:
-+ stmdb sp!,{r3-r11,lr}
-+
-+ ands $len,$len,#-16
-+ beq .Lno_data
-+
-+ add $len,$len,$inp @ end pointer
-+ sub sp,sp,#32
-+
-+#if __ARM_ARCH__<7
-+ ldmia $ctx,{$h0-$r3} @ load context
-+ add $ctx,$ctx,#20
-+ str $len,[sp,#16] @ offload stuff
-+ str $ctx,[sp,#12]
-+#else
-+ ldr lr,[$ctx,#36] @ is_base2_26
-+ ldmia $ctx!,{$h0-$h4} @ load hash value
-+ str $len,[sp,#16] @ offload stuff
-+ str $ctx,[sp,#12]
-+
-+ adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
-+ mov $r1,$h1,lsr#6
-+ adcs $r1,$r1,$h2,lsl#20
-+ mov $r2,$h2,lsr#12
-+ adcs $r2,$r2,$h3,lsl#14
-+ mov $r3,$h3,lsr#18
-+ adcs $r3,$r3,$h4,lsl#8
-+ mov $len,#0
-+ teq lr,#0
-+ str $len,[$ctx,#16] @ clear is_base2_26
-+ adc $len,$len,$h4,lsr#24
-+
-+ itttt ne
-+ movne $h0,$r0 @ choose between radixes
-+ movne $h1,$r1
-+ movne $h2,$r2
-+ movne $h3,$r3
-+ ldmia $ctx,{$r0-$r3} @ load key
-+ it ne
-+ movne $h4,$len
-+#endif
-+
-+ mov lr,$inp
-+ cmp $padbit,#0
-+ str $r1,[sp,#20]
-+ str $r2,[sp,#24]
-+ str $r3,[sp,#28]
-+ b .Loop
-+
-+.align 4
-+.Loop:
-+#if __ARM_ARCH__<7
-+ ldrb r0,[lr],#16 @ load input
-+# ifdef __thumb2__
-+ it hi
-+# endif
-+ addhi $h4,$h4,#1 @ 1<<128
-+ ldrb r1,[lr,#-15]
-+ ldrb r2,[lr,#-14]
-+ ldrb r3,[lr,#-13]
-+ orr r1,r0,r1,lsl#8
-+ ldrb r0,[lr,#-12]
-+ orr r2,r1,r2,lsl#16
-+ ldrb r1,[lr,#-11]
-+ orr r3,r2,r3,lsl#24
-+ ldrb r2,[lr,#-10]
-+ adds $h0,$h0,r3 @ accumulate input
-+
-+ ldrb r3,[lr,#-9]
-+ orr r1,r0,r1,lsl#8
-+ ldrb r0,[lr,#-8]
-+ orr r2,r1,r2,lsl#16
-+ ldrb r1,[lr,#-7]
-+ orr r3,r2,r3,lsl#24
-+ ldrb r2,[lr,#-6]
-+ adcs $h1,$h1,r3
-+
-+ ldrb r3,[lr,#-5]
-+ orr r1,r0,r1,lsl#8
-+ ldrb r0,[lr,#-4]
-+ orr r2,r1,r2,lsl#16
-+ ldrb r1,[lr,#-3]
-+ orr r3,r2,r3,lsl#24
-+ ldrb r2,[lr,#-2]
-+ adcs $h2,$h2,r3
-+
-+ ldrb r3,[lr,#-1]
-+ orr r1,r0,r1,lsl#8
-+ str lr,[sp,#8] @ offload input pointer
-+ orr r2,r1,r2,lsl#16
-+ add $s1,$r1,$r1,lsr#2
-+ orr r3,r2,r3,lsl#24
-+#else
-+ ldr r0,[lr],#16 @ load input
-+ it hi
-+ addhi $h4,$h4,#1 @ padbit
-+ ldr r1,[lr,#-12]
-+ ldr r2,[lr,#-8]
-+ ldr r3,[lr,#-4]
-+# ifdef __ARMEB__
-+ rev r0,r0
-+ rev r1,r1
-+ rev r2,r2
-+ rev r3,r3
-+# endif
-+ adds $h0,$h0,r0 @ accumulate input
-+ str lr,[sp,#8] @ offload input pointer
-+ adcs $h1,$h1,r1
-+ add $s1,$r1,$r1,lsr#2
-+ adcs $h2,$h2,r2
-+#endif
-+ add $s2,$r2,$r2,lsr#2
-+ adcs $h3,$h3,r3
-+ add $s3,$r3,$r3,lsr#2
-+
-+ umull r2,r3,$h1,$r0
-+ adc $h4,$h4,#0
-+ umull r0,r1,$h0,$r0
-+ umlal r2,r3,$h4,$s1
-+ umlal r0,r1,$h3,$s1
-+ ldr $r1,[sp,#20] @ reload $r1
-+ umlal r2,r3,$h2,$s3
-+ umlal r0,r1,$h1,$s3
-+ umlal r2,r3,$h3,$s2
-+ umlal r0,r1,$h2,$s2
-+ umlal r2,r3,$h0,$r1
-+ str r0,[sp,#0] @ future $h0
-+ mul r0,$s2,$h4
-+ ldr $r2,[sp,#24] @ reload $r2
-+ adds r2,r2,r1 @ d1+=d0>>32
-+ eor r1,r1,r1
-+ adc lr,r3,#0 @ future $h2
-+ str r2,[sp,#4] @ future $h1
-+
-+ mul r2,$s3,$h4
-+ eor r3,r3,r3
-+ umlal r0,r1,$h3,$s3
-+ ldr $r3,[sp,#28] @ reload $r3
-+ umlal r2,r3,$h3,$r0
-+ umlal r0,r1,$h2,$r0
-+ umlal r2,r3,$h2,$r1
-+ umlal r0,r1,$h1,$r1
-+ umlal r2,r3,$h1,$r2
-+ umlal r0,r1,$h0,$r2
-+ umlal r2,r3,$h0,$r3
-+ ldr $h0,[sp,#0]
-+ mul $h4,$r0,$h4
-+ ldr $h1,[sp,#4]
-+
-+ adds $h2,lr,r0 @ d2+=d1>>32
-+ ldr lr,[sp,#8] @ reload input pointer
-+ adc r1,r1,#0
-+ adds $h3,r2,r1 @ d3+=d2>>32
-+ ldr r0,[sp,#16] @ reload end pointer
-+ adc r3,r3,#0
-+ add $h4,$h4,r3 @ h4+=d3>>32
-+
-+ and r1,$h4,#-4
-+ and $h4,$h4,#3
-+ add r1,r1,r1,lsr#2 @ *=5
-+ adds $h0,$h0,r1
-+ adcs $h1,$h1,#0
-+ adcs $h2,$h2,#0
-+ adcs $h3,$h3,#0
-+ adc $h4,$h4,#0
-+
-+ cmp r0,lr @ done yet?
-+ bhi .Loop
-+
-+ ldr $ctx,[sp,#12]
-+ add sp,sp,#32
-+ stmdb $ctx,{$h0-$h4} @ store the result
-+
-+.Lno_data:
-+#if __ARM_ARCH__>=5
-+ ldmia sp!,{r3-r11,pc}
-+#else
-+ ldmia sp!,{r3-r11,lr}
-+ tst lr,#1
-+ moveq pc,lr @ be binary compatible with V4, yet
-+ bx lr @ interoperable with Thumb ISA:-)
-+#endif
-+.size poly1305_blocks,.-poly1305_blocks
-+___
-+}
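The adds/adcs chain at the top of poly1305_blocks converts a hash that the NEON code left in base 2^26 (five 26-bit limbs at bit offsets 0, 26, 52, 78 and 104) back into the scalar code's base 2^32. The same recombination in portable C, a sketch that uses 64-bit intermediates in place of the carry flag:

#include <stdint.h>

/* h[0..4]: 26-bit limbs (h[1], h[4] may be slightly wider);
 * out[0..3]: 32-bit words, out[4]: the few bits above 2^128 */
static void base26_to_base32(uint32_t out[5], const uint32_t h[5])
{
        uint64_t t;

        t = (uint64_t)h[0] | ((uint64_t)h[1] << 26);
        out[0] = (uint32_t)t;
        t = (t >> 32) | ((uint64_t)h[2] << (52 - 32));
        out[1] = (uint32_t)t;
        t = (t >> 32) | ((uint64_t)h[3] << (78 - 64));
        out[2] = (uint32_t)t;
        t = (t >> 32) | ((uint64_t)h[4] << (104 - 96));
        out[3] = (uint32_t)t;
        out[4] = (uint32_t)(t >> 32);   /* matches: adc $len,$len,$h4,lsr#24 */
}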
-+{
-+my ($ctx,$mac,$nonce)=map("r$_",(0..2));
-+my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
-+my $g4=$ctx;
-+
-+$code.=<<___;
-+.type poly1305_emit,%function
-+.align 5
-+poly1305_emit:
-+.Lpoly1305_emit:
-+ stmdb sp!,{r4-r11}
-+
-+ ldmia $ctx,{$h0-$h4}
-+
-+#if __ARM_ARCH__>=7
-+ ldr ip,[$ctx,#36] @ is_base2_26
-+
-+ adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
-+ mov $g1,$h1,lsr#6
-+ adcs $g1,$g1,$h2,lsl#20
-+ mov $g2,$h2,lsr#12
-+ adcs $g2,$g2,$h3,lsl#14
-+ mov $g3,$h3,lsr#18
-+ adcs $g3,$g3,$h4,lsl#8
-+ mov $g4,#0
-+ adc $g4,$g4,$h4,lsr#24
-+
-+ tst ip,ip
-+ itttt ne
-+ movne $h0,$g0
-+ movne $h1,$g1
-+ movne $h2,$g2
-+ movne $h3,$g3
-+ it ne
-+ movne $h4,$g4
-+#endif
-+
-+ adds $g0,$h0,#5 @ compare to modulus
-+ adcs $g1,$h1,#0
-+ adcs $g2,$h2,#0
-+ adcs $g3,$h3,#0
-+ adc $g4,$h4,#0
-+ tst $g4,#4 @ did it carry/borrow?
-+
-+#ifdef __thumb2__
-+ it ne
-+#endif
-+ movne $h0,$g0
-+ ldr $g0,[$nonce,#0]
-+#ifdef __thumb2__
-+ it ne
-+#endif
-+ movne $h1,$g1
-+ ldr $g1,[$nonce,#4]
-+#ifdef __thumb2__
-+ it ne
-+#endif
-+ movne $h2,$g2
-+ ldr $g2,[$nonce,#8]
-+#ifdef __thumb2__
-+ it ne
-+#endif
-+ movne $h3,$g3
-+ ldr $g3,[$nonce,#12]
-+
-+ adds $h0,$h0,$g0
-+ adcs $h1,$h1,$g1
-+ adcs $h2,$h2,$g2
-+ adc $h3,$h3,$g3
-+
-+#if __ARM_ARCH__>=7
-+# ifdef __ARMEB__
-+ rev $h0,$h0
-+ rev $h1,$h1
-+ rev $h2,$h2
-+ rev $h3,$h3
-+# endif
-+ str $h0,[$mac,#0]
-+ str $h1,[$mac,#4]
-+ str $h2,[$mac,#8]
-+ str $h3,[$mac,#12]
-+#else
-+ strb $h0,[$mac,#0]
-+ mov $h0,$h0,lsr#8
-+ strb $h1,[$mac,#4]
-+ mov $h1,$h1,lsr#8
-+ strb $h2,[$mac,#8]
-+ mov $h2,$h2,lsr#8
-+ strb $h3,[$mac,#12]
-+ mov $h3,$h3,lsr#8
-+
-+ strb $h0,[$mac,#1]
-+ mov $h0,$h0,lsr#8
-+ strb $h1,[$mac,#5]
-+ mov $h1,$h1,lsr#8
-+ strb $h2,[$mac,#9]
-+ mov $h2,$h2,lsr#8
-+ strb $h3,[$mac,#13]
-+ mov $h3,$h3,lsr#8
-+
-+ strb $h0,[$mac,#2]
-+ mov $h0,$h0,lsr#8
-+ strb $h1,[$mac,#6]
-+ mov $h1,$h1,lsr#8
-+ strb $h2,[$mac,#10]
-+ mov $h2,$h2,lsr#8
-+ strb $h3,[$mac,#14]
-+ mov $h3,$h3,lsr#8
-+
-+ strb $h0,[$mac,#3]
-+ strb $h1,[$mac,#7]
-+ strb $h2,[$mac,#11]
-+ strb $h3,[$mac,#15]
-+#endif
-+ ldmia sp!,{r4-r11}
-+#if __ARM_ARCH__>=5
-+ ret @ bx lr
-+#else
-+ tst lr,#1
-+ moveq pc,lr @ be binary compatible with V4, yet
-+ bx lr @ interoperable with Thumb ISA:-)
-+#endif
-+.size poly1305_emit,.-poly1305_emit
-+___
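poly1305_emit above performs the standard final reduction: it computes g = h + 5 and tests whether the sum reached 2^130 (the "tst $g4,#4" on the top word); if it did, h was at least 2^130-5 and g is the reduced value, after which the 128-bit nonce is added and the low 128 bits are written out. A C sketch of the same logic, branching where the assembly selects with conditional moves, and again assuming a little-endian host:

#include <stdint.h>
#include <string.h>

static void poly1305_emit_sketch(uint32_t h[5], uint8_t mac[16],
                                 const uint32_t nonce[4])
{
        uint32_t g[4], w;
        uint64_t t = 5;
        int i;

        for (i = 0; i < 4; i++) {       /* g = h + 5 over the low 128 bits */
                t += h[i];
                g[i] = (uint32_t)t;
                t >>= 32;
        }
        if ((t + h[4]) & 4)             /* carry reached 2^130: h >= 2^130-5 */
                memcpy(h, g, sizeof(g));

        t = 0;
        for (i = 0; i < 4; i++) {       /* mac = (h + nonce) mod 2^128 */
                t += (uint64_t)h[i] + nonce[i];
                w = (uint32_t)t;
                memcpy(mac + 4 * i, &w, 4);
                t >>= 32;
        }
}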
-+{
-+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
-+my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
-+my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
-+
-+my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
-+
-+$code.=<<___;
-+#if __ARM_MAX_ARCH__>=7
-+.fpu neon
-+
-+.type poly1305_init_neon,%function
-+.align 5
-+poly1305_init_neon:
-+.Lpoly1305_init_neon:
-+ ldr r3,[$ctx,#48] @ first table element
-+ cmp r3,#-1 @ is value impossible?
-+ bne .Lno_init_neon
-+
-+ ldr r4,[$ctx,#20] @ load key base 2^32
-+ ldr r5,[$ctx,#24]
-+ ldr r6,[$ctx,#28]
-+ ldr r7,[$ctx,#32]
-+
-+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
-+ mov r3,r4,lsr#26
-+ mov r4,r5,lsr#20
-+ orr r3,r3,r5,lsl#6
-+ mov r5,r6,lsr#14
-+ orr r4,r4,r6,lsl#12
-+ mov r6,r7,lsr#8
-+ orr r5,r5,r7,lsl#18
-+ and r3,r3,#0x03ffffff
-+ and r4,r4,#0x03ffffff
-+ and r5,r5,#0x03ffffff
-+
-+ vdup.32 $R0,r2 @ r^1 in both lanes
-+ add r2,r3,r3,lsl#2 @ *5
-+ vdup.32 $R1,r3
-+ add r3,r4,r4,lsl#2
-+ vdup.32 $S1,r2
-+ vdup.32 $R2,r4
-+ add r4,r5,r5,lsl#2
-+ vdup.32 $S2,r3
-+ vdup.32 $R3,r5
-+ add r5,r6,r6,lsl#2
-+ vdup.32 $S3,r4
-+ vdup.32 $R4,r6
-+ vdup.32 $S4,r5
-+
-+ mov $zeros,#2 @ counter
-+
-+.Lsquare_neon:
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
-+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
-+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
-+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-+
-+ vmull.u32 $D0,$R0,${R0}[1]
-+ vmull.u32 $D1,$R1,${R0}[1]
-+ vmull.u32 $D2,$R2,${R0}[1]
-+ vmull.u32 $D3,$R3,${R0}[1]
-+ vmull.u32 $D4,$R4,${R0}[1]
-+
-+ vmlal.u32 $D0,$R4,${S1}[1]
-+ vmlal.u32 $D1,$R0,${R1}[1]
-+ vmlal.u32 $D2,$R1,${R1}[1]
-+ vmlal.u32 $D3,$R2,${R1}[1]
-+ vmlal.u32 $D4,$R3,${R1}[1]
-+
-+ vmlal.u32 $D0,$R3,${S2}[1]
-+ vmlal.u32 $D1,$R4,${S2}[1]
-+ vmlal.u32 $D3,$R1,${R2}[1]
-+ vmlal.u32 $D2,$R0,${R2}[1]
-+ vmlal.u32 $D4,$R2,${R2}[1]
-+
-+ vmlal.u32 $D0,$R2,${S3}[1]
-+ vmlal.u32 $D3,$R0,${R3}[1]
-+ vmlal.u32 $D1,$R3,${S3}[1]
-+ vmlal.u32 $D2,$R4,${S3}[1]
-+ vmlal.u32 $D4,$R1,${R3}[1]
-+
-+ vmlal.u32 $D3,$R4,${S4}[1]
-+ vmlal.u32 $D0,$R1,${S4}[1]
-+ vmlal.u32 $D1,$R2,${S4}[1]
-+ vmlal.u32 $D2,$R3,${S4}[1]
-+ vmlal.u32 $D4,$R0,${R4}[1]
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-+ @ and P. Schwabe
-+ @
-+ @ H0>>+H1>>+H2>>+H3>>+H4
-+ @ H3>>+H4>>*5+H0>>+H1
-+ @
-+ @ Trivia.
-+ @
-+ @ Result of multiplication of n-bit number by m-bit number is
-+ @ n+m bits wide. However! Even though 2^n is an n+1-bit number,
-+ @ m-bit number multiplied by 2^n is still n+m bits wide.
-+ @
-+ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
-+ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
-+ @ one is n+1 bits wide.
-+ @
-+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
-+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
-+ @ can be 27. However! In cases when their width exceeds 26 bits
-+ @ they are limited by 2^26+2^6. This in turn means that *sum*
-+ @ of the products with these values can still be viewed as sum
-+ @ of 52-bit numbers as long as the amount of addends is not a
-+ @ power of 2. For example,
-+ @
-+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
-+ @
-+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
-+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
-+ @ 8 * (2^52) or 2^55. However, the value is then multiplied
-+ @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
-+ @ which is less than 32 * (2^52) or 2^57. And when processing
-+ @ data we are looking at three times as many addends...
-+ @
-+ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
-+ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
-+ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
-+ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? The vmlal.u32
-+ @ instruction accepts 2x32-bit input and writes a 2x64-bit result.
-+ @ This means that the result of reduction has to be compressed upon
-+ @ loop wrap-around. This can be done in the process of reduction
-+ @ to minimize amount of instructions [as well as amount of
-+ @ 128-bit instructions, which benefits low-end processors], but
-+ @ one has to watch for H2 (which is narrower than H0) and 5*H4
-+ @ not being wider than 58 bits, so that result of right shift
-+ @ by 26 bits fits in 32 bits. This is also useful on x86,
-+ @ because it allows using paddd in place of paddq, which
-+ @ benefits Atom, where paddq is ridiculously slow.
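Restating the comment's key-setup bound in conventional notation (same numbers as above, nothing new):

\[
\begin{aligned}
H_4 &\le 5\,(2^{26}+2^{6})^{2} = 5\,(2^{52}+2^{33}+2^{12}) < 2^{55},\\
5H_4 &\le 25\,(2^{52}+2^{33}+2^{12}) < 2^{57} \quad\text{(key setup)},\\
5H_4 &\le 75\,(2^{52}+2^{33}+2^{12}) < 2^{59} \quad\text{(hashing, three times the addends)}.
\end{aligned}
\]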
-+
-+ vshr.u64 $T0,$D3,#26
-+ vmovn.i64 $D3#lo,$D3
-+ vshr.u64 $T1,$D0,#26
-+ vmovn.i64 $D0#lo,$D0
-+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
-+ vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
-+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
-+ vbic.i32 $D0#lo,#0xfc000000
-+
-+ vshrn.u64 $T0#lo,$D4,#26
-+ vmovn.i64 $D4#lo,$D4
-+ vshr.u64 $T1,$D1,#26
-+ vmovn.i64 $D1#lo,$D1
-+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
-+ vbic.i32 $D4#lo,#0xfc000000
-+ vbic.i32 $D1#lo,#0xfc000000
-+
-+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
-+ vshl.u32 $T0#lo,$T0#lo,#2
-+ vshrn.u64 $T1#lo,$D2,#26
-+ vmovn.i64 $D2#lo,$D2
-+ vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
-+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
-+ vbic.i32 $D2#lo,#0xfc000000
-+
-+ vshr.u32 $T0#lo,$D0#lo,#26
-+ vbic.i32 $D0#lo,#0xfc000000
-+ vshr.u32 $T1#lo,$D3#lo,#26
-+ vbic.i32 $D3#lo,#0xfc000000
-+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
-+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
-+
-+ subs $zeros,$zeros,#1
-+ beq .Lsquare_break_neon
-+
-+ add $tbl0,$ctx,#(48+0*9*4)
-+ add $tbl1,$ctx,#(48+1*9*4)
-+
-+ vtrn.32 $R0,$D0#lo @ r^2:r^1
-+ vtrn.32 $R2,$D2#lo
-+ vtrn.32 $R3,$D3#lo
-+ vtrn.32 $R1,$D1#lo
-+ vtrn.32 $R4,$D4#lo
-+
-+ vshl.u32 $S2,$R2,#2 @ *5
-+ vshl.u32 $S3,$R3,#2
-+ vshl.u32 $S1,$R1,#2
-+ vshl.u32 $S4,$R4,#2
-+ vadd.i32 $S2,$S2,$R2
-+ vadd.i32 $S1,$S1,$R1
-+ vadd.i32 $S3,$S3,$R3
-+ vadd.i32 $S4,$S4,$R4
-+
-+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
-+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
-+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-+ vst1.32 {${S4}[0]},[$tbl0,:32]
-+ vst1.32 {${S4}[1]},[$tbl1,:32]
-+
-+ b .Lsquare_neon
-+
-+.align 4
-+.Lsquare_break_neon:
-+ add $tbl0,$ctx,#(48+2*4*9)
-+ add $tbl1,$ctx,#(48+3*4*9)
-+
-+ vmov $R0,$D0#lo @ r^4:r^3
-+ vshl.u32 $S1,$D1#lo,#2 @ *5
-+ vmov $R1,$D1#lo
-+ vshl.u32 $S2,$D2#lo,#2
-+ vmov $R2,$D2#lo
-+ vshl.u32 $S3,$D3#lo,#2
-+ vmov $R3,$D3#lo
-+ vshl.u32 $S4,$D4#lo,#2
-+ vmov $R4,$D4#lo
-+ vadd.i32 $S1,$S1,$D1#lo
-+ vadd.i32 $S2,$S2,$D2#lo
-+ vadd.i32 $S3,$S3,$D3#lo
-+ vadd.i32 $S4,$S4,$D4#lo
-+
-+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
-+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
-+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-+ vst1.32 {${S4}[0]},[$tbl0]
-+ vst1.32 {${S4}[1]},[$tbl1]
-+
-+.Lno_init_neon:
-+ ret @ bx lr
-+.size poly1305_init_neon,.-poly1305_init_neon
-+
-+.type poly1305_blocks_neon,%function
-+.align 5
-+poly1305_blocks_neon:
-+.Lpoly1305_blocks_neon:
-+ ldr ip,[$ctx,#36] @ is_base2_26
-+
-+ cmp $len,#64
-+ blo .Lpoly1305_blocks
-+
-+ stmdb sp!,{r4-r7}
-+ vstmdb sp!,{d8-d15} @ ABI specification says so
-+
-+ tst ip,ip @ is_base2_26?
-+ bne .Lbase2_26_neon
-+
-+ stmdb sp!,{r1-r3,lr}
-+ bl .Lpoly1305_init_neon
-+
-+ ldr r4,[$ctx,#0] @ load hash value base 2^32
-+ ldr r5,[$ctx,#4]
-+ ldr r6,[$ctx,#8]
-+ ldr r7,[$ctx,#12]
-+ ldr ip,[$ctx,#16]
-+
-+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
-+ mov r3,r4,lsr#26
-+ veor $D0#lo,$D0#lo,$D0#lo
-+ mov r4,r5,lsr#20
-+ orr r3,r3,r5,lsl#6
-+ veor $D1#lo,$D1#lo,$D1#lo
-+ mov r5,r6,lsr#14
-+ orr r4,r4,r6,lsl#12
-+ veor $D2#lo,$D2#lo,$D2#lo
-+ mov r6,r7,lsr#8
-+ orr r5,r5,r7,lsl#18
-+ veor $D3#lo,$D3#lo,$D3#lo
-+ and r3,r3,#0x03ffffff
-+ orr r6,r6,ip,lsl#24
-+ veor $D4#lo,$D4#lo,$D4#lo
-+ and r4,r4,#0x03ffffff
-+ mov r1,#1
-+ and r5,r5,#0x03ffffff
-+ str r1,[$ctx,#36] @ set is_base2_26
-+
-+ vmov.32 $D0#lo[0],r2
-+ vmov.32 $D1#lo[0],r3
-+ vmov.32 $D2#lo[0],r4
-+ vmov.32 $D3#lo[0],r5
-+ vmov.32 $D4#lo[0],r6
-+ adr $zeros,.Lzeros
-+
-+ ldmia sp!,{r1-r3,lr}
-+ b .Lhash_loaded
-+
-+.align 4
-+.Lbase2_26_neon:
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ load hash value
-+
-+ veor $D0#lo,$D0#lo,$D0#lo
-+ veor $D1#lo,$D1#lo,$D1#lo
-+ veor $D2#lo,$D2#lo,$D2#lo
-+ veor $D3#lo,$D3#lo,$D3#lo
-+ veor $D4#lo,$D4#lo,$D4#lo
-+ vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
-+ adr $zeros,.Lzeros
-+ vld1.32 {$D4#lo[0]},[$ctx]
-+ sub $ctx,$ctx,#16 @ rewind
-+
-+.Lhash_loaded:
-+ add $in2,$inp,#32
-+ mov $padbit,$padbit,lsl#24
-+ tst $len,#31
-+ beq .Leven
-+
-+ vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
-+ vmov.32 $H4#lo[0],$padbit
-+ sub $len,$len,#16
-+ add $in2,$inp,#32
-+
-+# ifdef __ARMEB__
-+ vrev32.8 $H0,$H0
-+ vrev32.8 $H3,$H3
-+ vrev32.8 $H1,$H1
-+ vrev32.8 $H2,$H2
-+# endif
-+ vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
-+ vshl.u32 $H3#lo,$H3#lo,#18
-+
-+ vsri.u32 $H3#lo,$H2#lo,#14
-+ vshl.u32 $H2#lo,$H2#lo,#12
-+ vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
-+
-+ vbic.i32 $H3#lo,#0xfc000000
-+ vsri.u32 $H2#lo,$H1#lo,#20
-+ vshl.u32 $H1#lo,$H1#lo,#6
-+
-+ vbic.i32 $H2#lo,#0xfc000000
-+ vsri.u32 $H1#lo,$H0#lo,#26
-+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
-+
-+ vbic.i32 $H0#lo,#0xfc000000
-+ vbic.i32 $H1#lo,#0xfc000000
-+ vadd.i32 $H2#hi,$H2#lo,$D2#lo
-+
-+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
-+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
-+
-+ mov $tbl1,$zeros
-+ add $tbl0,$ctx,#48
-+
-+ cmp $len,$len
-+ b .Long_tail
-+
-+.align 4
-+.Leven:
-+ subs $len,$len,#64
-+ it lo
-+ movlo $in2,$zeros
-+
-+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
-+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
-+ add $inp,$inp,#64
-+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
-+ add $in2,$in2,#64
-+ itt hi
-+ addhi $tbl1,$ctx,#(48+1*9*4)
-+ addhi $tbl0,$ctx,#(48+3*9*4)
-+
-+# ifdef __ARMEB__
-+ vrev32.8 $H0,$H0
-+ vrev32.8 $H3,$H3
-+ vrev32.8 $H1,$H1
-+ vrev32.8 $H2,$H2
-+# endif
-+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
-+ vshl.u32 $H3,$H3,#18
-+
-+ vsri.u32 $H3,$H2,#14
-+ vshl.u32 $H2,$H2,#12
-+
-+ vbic.i32 $H3,#0xfc000000
-+ vsri.u32 $H2,$H1,#20
-+ vshl.u32 $H1,$H1,#6
-+
-+ vbic.i32 $H2,#0xfc000000
-+ vsri.u32 $H1,$H0,#26
-+
-+ vbic.i32 $H0,#0xfc000000
-+ vbic.i32 $H1,#0xfc000000
-+
-+ bls .Lskip_loop
-+
-+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
-+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
-+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-+ b .Loop_neon
-+
-+.align 5
-+.Loop_neon:
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
-+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
-+ @ \___________________/
-+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
-+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
-+ @ \___________________/ \____________________/
-+ @
-+ @ Note that we start with inp[2:3]*r^2. This is because it
-+ @ doesn't depend on reduction in previous iteration.
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
-+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
-+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
-+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ inp[2:3]*r^2
-+
-+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
-+ vmull.u32 $D2,$H2#hi,${R0}[1]
-+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
-+ vmull.u32 $D0,$H0#hi,${R0}[1]
-+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
-+ vmull.u32 $D3,$H3#hi,${R0}[1]
-+ vmlal.u32 $D2,$H1#hi,${R1}[1]
-+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
-+ vmull.u32 $D1,$H1#hi,${R0}[1]
-+
-+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
-+ vmull.u32 $D4,$H4#hi,${R0}[1]
-+ subs $len,$len,#64
-+ vmlal.u32 $D0,$H4#hi,${S1}[1]
-+ it lo
-+ movlo $in2,$zeros
-+ vmlal.u32 $D3,$H2#hi,${R1}[1]
-+ vld1.32 ${S4}[1],[$tbl1,:32]
-+ vmlal.u32 $D1,$H0#hi,${R1}[1]
-+ vmlal.u32 $D4,$H3#hi,${R1}[1]
-+
-+ vmlal.u32 $D0,$H3#hi,${S2}[1]
-+ vmlal.u32 $D3,$H1#hi,${R2}[1]
-+ vmlal.u32 $D4,$H2#hi,${R2}[1]
-+ vmlal.u32 $D1,$H4#hi,${S2}[1]
-+ vmlal.u32 $D2,$H0#hi,${R2}[1]
-+
-+ vmlal.u32 $D3,$H0#hi,${R3}[1]
-+ vmlal.u32 $D0,$H2#hi,${S3}[1]
-+ vmlal.u32 $D4,$H1#hi,${R3}[1]
-+ vmlal.u32 $D1,$H3#hi,${S3}[1]
-+ vmlal.u32 $D2,$H4#hi,${S3}[1]
-+
-+ vmlal.u32 $D3,$H4#hi,${S4}[1]
-+ vmlal.u32 $D0,$H1#hi,${S4}[1]
-+ vmlal.u32 $D4,$H0#hi,${R4}[1]
-+ vmlal.u32 $D1,$H2#hi,${S4}[1]
-+ vmlal.u32 $D2,$H3#hi,${S4}[1]
-+
-+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
-+ add $in2,$in2,#64
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ (hash+inp[0:1])*r^4 and accumulate
-+
-+ vmlal.u32 $D3,$H3#lo,${R0}[0]
-+ vmlal.u32 $D0,$H0#lo,${R0}[0]
-+ vmlal.u32 $D4,$H4#lo,${R0}[0]
-+ vmlal.u32 $D1,$H1#lo,${R0}[0]
-+ vmlal.u32 $D2,$H2#lo,${R0}[0]
-+ vld1.32 ${S4}[0],[$tbl0,:32]
-+
-+ vmlal.u32 $D3,$H2#lo,${R1}[0]
-+ vmlal.u32 $D0,$H4#lo,${S1}[0]
-+ vmlal.u32 $D4,$H3#lo,${R1}[0]
-+ vmlal.u32 $D1,$H0#lo,${R1}[0]
-+ vmlal.u32 $D2,$H1#lo,${R1}[0]
-+
-+ vmlal.u32 $D3,$H1#lo,${R2}[0]
-+ vmlal.u32 $D0,$H3#lo,${S2}[0]
-+ vmlal.u32 $D4,$H2#lo,${R2}[0]
-+ vmlal.u32 $D1,$H4#lo,${S2}[0]
-+ vmlal.u32 $D2,$H0#lo,${R2}[0]
-+
-+ vmlal.u32 $D3,$H0#lo,${R3}[0]
-+ vmlal.u32 $D0,$H2#lo,${S3}[0]
-+ vmlal.u32 $D4,$H1#lo,${R3}[0]
-+ vmlal.u32 $D1,$H3#lo,${S3}[0]
-+ vmlal.u32 $D3,$H4#lo,${S4}[0]
-+
-+ vmlal.u32 $D2,$H4#lo,${S3}[0]
-+ vmlal.u32 $D0,$H1#lo,${S4}[0]
-+ vmlal.u32 $D4,$H0#lo,${R4}[0]
-+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
-+ vmlal.u32 $D1,$H2#lo,${S4}[0]
-+ vmlal.u32 $D2,$H3#lo,${S4}[0]
-+
-+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
-+ add $inp,$inp,#64
-+# ifdef __ARMEB__
-+ vrev32.8 $H0,$H0
-+ vrev32.8 $H1,$H1
-+ vrev32.8 $H2,$H2
-+ vrev32.8 $H3,$H3
-+# endif
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
-+ @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
-+
-+ vshr.u64 $T0,$D3,#26
-+ vmovn.i64 $D3#lo,$D3
-+ vshr.u64 $T1,$D0,#26
-+ vmovn.i64 $D0#lo,$D0
-+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
-+ vbic.i32 $D3#lo,#0xfc000000
-+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
-+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
-+ vshl.u32 $H3,$H3,#18
-+ vbic.i32 $D0#lo,#0xfc000000
-+
-+ vshrn.u64 $T0#lo,$D4,#26
-+ vmovn.i64 $D4#lo,$D4
-+ vshr.u64 $T1,$D1,#26
-+ vmovn.i64 $D1#lo,$D1
-+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
-+ vsri.u32 $H3,$H2,#14
-+ vbic.i32 $D4#lo,#0xfc000000
-+ vshl.u32 $H2,$H2,#12
-+ vbic.i32 $D1#lo,#0xfc000000
-+
-+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
-+ vshl.u32 $T0#lo,$T0#lo,#2
-+ vbic.i32 $H3,#0xfc000000
-+ vshrn.u64 $T1#lo,$D2,#26
-+ vmovn.i64 $D2#lo,$D2
-+ vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
-+ vsri.u32 $H2,$H1,#20
-+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
-+ vshl.u32 $H1,$H1,#6
-+ vbic.i32 $D2#lo,#0xfc000000
-+ vbic.i32 $H2,#0xfc000000
-+
-+ vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
-+ vmovn.i64 $D0#lo,$D0
-+ vsri.u32 $H1,$H0,#26
-+ vbic.i32 $H0,#0xfc000000
-+ vshr.u32 $T1#lo,$D3#lo,#26
-+ vbic.i32 $D3#lo,#0xfc000000
-+ vbic.i32 $D0#lo,#0xfc000000
-+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
-+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
-+ vbic.i32 $H1,#0xfc000000
-+
-+ bhi .Loop_neon
-+
-+.Lskip_loop:
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-+
-+ add $tbl1,$ctx,#(48+0*9*4)
-+ add $tbl0,$ctx,#(48+1*9*4)
-+ adds $len,$len,#32
-+ it ne
-+ movne $len,#0
-+ bne .Long_tail
-+
-+ vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
-+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
-+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
-+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
-+ vadd.i32 $H4#hi,$H4#lo,$D4#lo
-+
-+.Long_tail:
-+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
-+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
-+
-+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
-+ vmull.u32 $D2,$H2#hi,$R0
-+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
-+ vmull.u32 $D0,$H0#hi,$R0
-+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
-+ vmull.u32 $D3,$H3#hi,$R0
-+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
-+ vmull.u32 $D1,$H1#hi,$R0
-+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
-+ vmull.u32 $D4,$H4#hi,$R0
-+
-+ vmlal.u32 $D0,$H4#hi,$S1
-+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-+ vmlal.u32 $D3,$H2#hi,$R1
-+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-+ vmlal.u32 $D1,$H0#hi,$R1
-+ vmlal.u32 $D4,$H3#hi,$R1
-+ vmlal.u32 $D2,$H1#hi,$R1
-+
-+ vmlal.u32 $D3,$H1#hi,$R2
-+ vld1.32 ${S4}[1],[$tbl1,:32]
-+ vmlal.u32 $D0,$H3#hi,$S2
-+ vld1.32 ${S4}[0],[$tbl0,:32]
-+ vmlal.u32 $D4,$H2#hi,$R2
-+ vmlal.u32 $D1,$H4#hi,$S2
-+ vmlal.u32 $D2,$H0#hi,$R2
-+
-+ vmlal.u32 $D3,$H0#hi,$R3
-+ it ne
-+ addne $tbl1,$ctx,#(48+2*9*4)
-+ vmlal.u32 $D0,$H2#hi,$S3
-+ it ne
-+ addne $tbl0,$ctx,#(48+3*9*4)
-+ vmlal.u32 $D4,$H1#hi,$R3
-+ vmlal.u32 $D1,$H3#hi,$S3
-+ vmlal.u32 $D2,$H4#hi,$S3
-+
-+ vmlal.u32 $D3,$H4#hi,$S4
-+ vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
-+ vmlal.u32 $D0,$H1#hi,$S4
-+ vshr.u64 $MASK,$MASK,#38
-+ vmlal.u32 $D4,$H0#hi,$R4
-+ vmlal.u32 $D1,$H2#hi,$S4
-+ vmlal.u32 $D2,$H3#hi,$S4
-+
-+ beq .Lshort_tail
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
-+
-+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
-+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
-+
-+ vmlal.u32 $D2,$H2#lo,$R0
-+ vmlal.u32 $D0,$H0#lo,$R0
-+ vmlal.u32 $D3,$H3#lo,$R0
-+ vmlal.u32 $D1,$H1#lo,$R0
-+ vmlal.u32 $D4,$H4#lo,$R0
-+
-+ vmlal.u32 $D0,$H4#lo,$S1
-+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-+ vmlal.u32 $D3,$H2#lo,$R1
-+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-+ vmlal.u32 $D1,$H0#lo,$R1
-+ vmlal.u32 $D4,$H3#lo,$R1
-+ vmlal.u32 $D2,$H1#lo,$R1
-+
-+ vmlal.u32 $D3,$H1#lo,$R2
-+ vld1.32 ${S4}[1],[$tbl1,:32]
-+ vmlal.u32 $D0,$H3#lo,$S2
-+ vld1.32 ${S4}[0],[$tbl0,:32]
-+ vmlal.u32 $D4,$H2#lo,$R2
-+ vmlal.u32 $D1,$H4#lo,$S2
-+ vmlal.u32 $D2,$H0#lo,$R2
-+
-+ vmlal.u32 $D3,$H0#lo,$R3
-+ vmlal.u32 $D0,$H2#lo,$S3
-+ vmlal.u32 $D4,$H1#lo,$R3
-+ vmlal.u32 $D1,$H3#lo,$S3
-+ vmlal.u32 $D2,$H4#lo,$S3
-+
-+ vmlal.u32 $D3,$H4#lo,$S4
-+ vorn $MASK,$MASK,$MASK @ all-ones
-+ vmlal.u32 $D0,$H1#lo,$S4
-+ vshr.u64 $MASK,$MASK,#38
-+ vmlal.u32 $D4,$H0#lo,$R4
-+ vmlal.u32 $D1,$H2#lo,$S4
-+ vmlal.u32 $D2,$H3#lo,$S4
-+
-+.Lshort_tail:
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ horizontal addition
-+
-+ vadd.i64 $D3#lo,$D3#lo,$D3#hi
-+ vadd.i64 $D0#lo,$D0#lo,$D0#hi
-+ vadd.i64 $D4#lo,$D4#lo,$D4#hi
-+ vadd.i64 $D1#lo,$D1#lo,$D1#hi
-+ vadd.i64 $D2#lo,$D2#lo,$D2#hi
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ lazy reduction, but without narrowing
-+
-+ vshr.u64 $T0,$D3,#26
-+ vand.i64 $D3,$D3,$MASK
-+ vshr.u64 $T1,$D0,#26
-+ vand.i64 $D0,$D0,$MASK
-+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
-+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
-+
-+ vshr.u64 $T0,$D4,#26
-+ vand.i64 $D4,$D4,$MASK
-+ vshr.u64 $T1,$D1,#26
-+ vand.i64 $D1,$D1,$MASK
-+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
-+
-+ vadd.i64 $D0,$D0,$T0
-+ vshl.u64 $T0,$T0,#2
-+ vshr.u64 $T1,$D2,#26
-+ vand.i64 $D2,$D2,$MASK
-+ vadd.i64 $D0,$D0,$T0 @ h4 -> h0
-+ vadd.i64 $D3,$D3,$T1 @ h2 -> h3
-+
-+ vshr.u64 $T0,$D0,#26
-+ vand.i64 $D0,$D0,$MASK
-+ vshr.u64 $T1,$D3,#26
-+ vand.i64 $D3,$D3,$MASK
-+ vadd.i64 $D1,$D1,$T0 @ h0 -> h1
-+ vadd.i64 $D4,$D4,$T1 @ h3 -> h4
-+
-+ cmp $len,#0
-+ bne .Leven
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ store hash value
-+
-+ vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
-+ vst1.32 {$D4#lo[0]},[$ctx]
-+
-+ vldmia sp!,{d8-d15} @ epilogue
-+ ldmia sp!,{r4-r7}
-+ ret @ bx lr
-+.size poly1305_blocks_neon,.-poly1305_blocks_neon
-+
-+.align 5
-+.Lzeros:
-+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-+#ifndef __KERNEL__
-+.LOPENSSL_armcap:
-+# ifdef _WIN32
-+.word OPENSSL_armcap_P
-+# else
-+.word OPENSSL_armcap_P-.Lpoly1305_init
-+# endif
-+.comm OPENSSL_armcap_P,4,4
-+.hidden OPENSSL_armcap_P
-+#endif
-+#endif
-+___
-+} }
-+$code.=<<___;
-+.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
-+.align 2
-+___
-+
-+foreach (split("\n",$code)) {
-+ s/\`([^\`]*)\`/eval $1/geo;
-+
-+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
-+ s/\bret\b/bx lr/go or
-+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-+
-+ print $_,"\n";
-+}
-+close STDOUT; # enforce flush
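The foreach loop above post-processes the generated code: the first substitution turns the qN#lo / qN#hi pseudo-operands into concrete 64-bit D registers (each 128-bit Q register aliases the pair d2N/d2N+1), and the other two rewrite "ret"/"bx lr" so the output still assembles with -march=armv4. The shipped file that follows is exactly this output. The register mapping as a one-line sketch:

/* qN#lo -> d(2N), qN#hi -> d(2N+1); e.g. q5#lo -> d10, q5#hi -> d11 */
static int q_to_d(int q, int hi)
{
        return 2 * q + (hi ? 1 : 0);
}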
---- /dev/null
-+++ b/arch/arm/crypto/poly1305-core.S_shipped
-@@ -0,0 +1,1158 @@
-+#ifndef __KERNEL__
-+# include "arm_arch.h"
-+#else
-+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-+# define poly1305_init poly1305_init_arm
-+# define poly1305_blocks poly1305_blocks_arm
-+# define poly1305_emit poly1305_emit_arm
-+.globl poly1305_blocks_neon
-+#endif
-+
-+#if defined(__thumb2__)
-+.syntax unified
-+.thumb
-+#else
-+.code 32
-+#endif
-+
-+.text
-+
-+.globl poly1305_emit
-+.globl poly1305_blocks
-+.globl poly1305_init
-+.type poly1305_init,%function
-+.align 5
-+poly1305_init:
-+.Lpoly1305_init:
-+ stmdb sp!,{r4-r11}
-+
-+ eor r3,r3,r3
-+ cmp r1,#0
-+ str r3,[r0,#0] @ zero hash value
-+ str r3,[r0,#4]
-+ str r3,[r0,#8]
-+ str r3,[r0,#12]
-+ str r3,[r0,#16]
-+ str r3,[r0,#36] @ clear is_base2_26
-+ add r0,r0,#20
-+
-+#ifdef __thumb2__
-+ it eq
-+#endif
-+ moveq r0,#0
-+ beq .Lno_key
-+
-+#if __ARM_MAX_ARCH__>=7
-+ mov r3,#-1
-+ str r3,[r0,#28] @ impossible key power value
-+# ifndef __KERNEL__
-+ adr r11,.Lpoly1305_init
-+ ldr r12,.LOPENSSL_armcap
-+# endif
-+#endif
-+ ldrb r4,[r1,#0]
-+ mov r10,#0x0fffffff
-+ ldrb r5,[r1,#1]
-+ and r3,r10,#-4 @ 0x0ffffffc
-+ ldrb r6,[r1,#2]
-+ ldrb r7,[r1,#3]
-+ orr r4,r4,r5,lsl#8
-+ ldrb r5,[r1,#4]
-+ orr r4,r4,r6,lsl#16
-+ ldrb r6,[r1,#5]
-+ orr r4,r4,r7,lsl#24
-+ ldrb r7,[r1,#6]
-+ and r4,r4,r10
-+
-+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-+# if !defined(_WIN32)
-+ ldr r12,[r11,r12] @ OPENSSL_armcap_P
-+# endif
-+# if defined(__APPLE__) || defined(_WIN32)
-+ ldr r12,[r12]
-+# endif
-+#endif
-+ ldrb r8,[r1,#7]
-+ orr r5,r5,r6,lsl#8
-+ ldrb r6,[r1,#8]
-+ orr r5,r5,r7,lsl#16
-+ ldrb r7,[r1,#9]
-+ orr r5,r5,r8,lsl#24
-+ ldrb r8,[r1,#10]
-+ and r5,r5,r3
-+
-+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-+ tst r12,#ARMV7_NEON @ check for NEON
-+# ifdef __thumb2__
-+ adr r9,.Lpoly1305_blocks_neon
-+ adr r11,.Lpoly1305_blocks
-+ it ne
-+ movne r11,r9
-+ adr r12,.Lpoly1305_emit
-+ orr r11,r11,#1 @ thumb-ify addresses
-+ orr r12,r12,#1
-+# else
-+ add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
-+ ite eq
-+ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
-+ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
-+# endif
-+#endif
-+ ldrb r9,[r1,#11]
-+ orr r6,r6,r7,lsl#8
-+ ldrb r7,[r1,#12]
-+ orr r6,r6,r8,lsl#16
-+ ldrb r8,[r1,#13]
-+ orr r6,r6,r9,lsl#24
-+ ldrb r9,[r1,#14]
-+ and r6,r6,r3
-+
-+ ldrb r10,[r1,#15]
-+ orr r7,r7,r8,lsl#8
-+ str r4,[r0,#0]
-+ orr r7,r7,r9,lsl#16
-+ str r5,[r0,#4]
-+ orr r7,r7,r10,lsl#24
-+ str r6,[r0,#8]
-+ and r7,r7,r3
-+ str r7,[r0,#12]
-+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-+ stmia r2,{r11,r12} @ fill functions table
-+ mov r0,#1
-+#else
-+ mov r0,#0
-+#endif
-+.Lno_key:
-+ ldmia sp!,{r4-r11}
-+#if __ARM_ARCH__>=5
-+ bx lr @ bx lr
-+#else
-+ tst lr,#1
-+ moveq pc,lr @ be binary compatible with V4, yet
-+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-+#endif
-+.size poly1305_init,.-poly1305_init
-+.type poly1305_blocks,%function
-+.align 5
-+poly1305_blocks:
-+.Lpoly1305_blocks:
-+ stmdb sp!,{r3-r11,lr}
-+
-+ ands r2,r2,#-16
-+ beq .Lno_data
-+
-+ add r2,r2,r1 @ end pointer
-+ sub sp,sp,#32
-+
-+#if __ARM_ARCH__<7
-+ ldmia r0,{r4-r12} @ load context
-+ add r0,r0,#20
-+ str r2,[sp,#16] @ offload stuff
-+ str r0,[sp,#12]
-+#else
-+ ldr lr,[r0,#36] @ is_base2_26
-+ ldmia r0!,{r4-r8} @ load hash value
-+ str r2,[sp,#16] @ offload stuff
-+ str r0,[sp,#12]
-+
-+ adds r9,r4,r5,lsl#26 @ base 2^26 -> base 2^32
-+ mov r10,r5,lsr#6
-+ adcs r10,r10,r6,lsl#20
-+ mov r11,r6,lsr#12
-+ adcs r11,r11,r7,lsl#14
-+ mov r12,r7,lsr#18
-+ adcs r12,r12,r8,lsl#8
-+ mov r2,#0
-+ teq lr,#0
-+ str r2,[r0,#16] @ clear is_base2_26
-+ adc r2,r2,r8,lsr#24
-+
-+ itttt ne
-+ movne r4,r9 @ choose between radixes
-+ movne r5,r10
-+ movne r6,r11
-+ movne r7,r12
-+ ldmia r0,{r9-r12} @ load key
-+ it ne
-+ movne r8,r2
-+#endif
-+
-+ mov lr,r1
-+ cmp r3,#0
-+ str r10,[sp,#20]
-+ str r11,[sp,#24]
-+ str r12,[sp,#28]
-+ b .Loop
-+
-+.align 4
-+.Loop:
-+#if __ARM_ARCH__<7
-+ ldrb r0,[lr],#16 @ load input
-+# ifdef __thumb2__
-+ it hi
-+# endif
-+ addhi r8,r8,#1 @ 1<<128
-+ ldrb r1,[lr,#-15]
-+ ldrb r2,[lr,#-14]
-+ ldrb r3,[lr,#-13]
-+ orr r1,r0,r1,lsl#8
-+ ldrb r0,[lr,#-12]
-+ orr r2,r1,r2,lsl#16
-+ ldrb r1,[lr,#-11]
-+ orr r3,r2,r3,lsl#24
-+ ldrb r2,[lr,#-10]
-+ adds r4,r4,r3 @ accumulate input
-+
-+ ldrb r3,[lr,#-9]
-+ orr r1,r0,r1,lsl#8
-+ ldrb r0,[lr,#-8]
-+ orr r2,r1,r2,lsl#16
-+ ldrb r1,[lr,#-7]
-+ orr r3,r2,r3,lsl#24
-+ ldrb r2,[lr,#-6]
-+ adcs r5,r5,r3
-+
-+ ldrb r3,[lr,#-5]
-+ orr r1,r0,r1,lsl#8
-+ ldrb r0,[lr,#-4]
-+ orr r2,r1,r2,lsl#16
-+ ldrb r1,[lr,#-3]
-+ orr r3,r2,r3,lsl#24
-+ ldrb r2,[lr,#-2]
-+ adcs r6,r6,r3
-+
-+ ldrb r3,[lr,#-1]
-+ orr r1,r0,r1,lsl#8
-+ str lr,[sp,#8] @ offload input pointer
-+ orr r2,r1,r2,lsl#16
-+ add r10,r10,r10,lsr#2
-+ orr r3,r2,r3,lsl#24
-+#else
-+ ldr r0,[lr],#16 @ load input
-+ it hi
-+ addhi r8,r8,#1 @ padbit
-+ ldr r1,[lr,#-12]
-+ ldr r2,[lr,#-8]
-+ ldr r3,[lr,#-4]
-+# ifdef __ARMEB__
-+ rev r0,r0
-+ rev r1,r1
-+ rev r2,r2
-+ rev r3,r3
-+# endif
-+ adds r4,r4,r0 @ accumulate input
-+ str lr,[sp,#8] @ offload input pointer
-+ adcs r5,r5,r1
-+ add r10,r10,r10,lsr#2
-+ adcs r6,r6,r2
-+#endif
-+ add r11,r11,r11,lsr#2
-+ adcs r7,r7,r3
-+ add r12,r12,r12,lsr#2
-+
-+ umull r2,r3,r5,r9
-+ adc r8,r8,#0
-+ umull r0,r1,r4,r9
-+ umlal r2,r3,r8,r10
-+ umlal r0,r1,r7,r10
-+ ldr r10,[sp,#20] @ reload r10
-+ umlal r2,r3,r6,r12
-+ umlal r0,r1,r5,r12
-+ umlal r2,r3,r7,r11
-+ umlal r0,r1,r6,r11
-+ umlal r2,r3,r4,r10
-+ str r0,[sp,#0] @ future r4
-+ mul r0,r11,r8
-+ ldr r11,[sp,#24] @ reload r11
-+ adds r2,r2,r1 @ d1+=d0>>32
-+ eor r1,r1,r1
-+ adc lr,r3,#0 @ future r6
-+ str r2,[sp,#4] @ future r5
-+
-+ mul r2,r12,r8
-+ eor r3,r3,r3
-+ umlal r0,r1,r7,r12
-+ ldr r12,[sp,#28] @ reload r12
-+ umlal r2,r3,r7,r9
-+ umlal r0,r1,r6,r9
-+ umlal r2,r3,r6,r10
-+ umlal r0,r1,r5,r10
-+ umlal r2,r3,r5,r11
-+ umlal r0,r1,r4,r11
-+ umlal r2,r3,r4,r12
-+ ldr r4,[sp,#0]
-+ mul r8,r9,r8
-+ ldr r5,[sp,#4]
-+
-+ adds r6,lr,r0 @ d2+=d1>>32
-+ ldr lr,[sp,#8] @ reload input pointer
-+ adc r1,r1,#0
-+ adds r7,r2,r1 @ d3+=d2>>32
-+ ldr r0,[sp,#16] @ reload end pointer
-+ adc r3,r3,#0
-+ add r8,r8,r3 @ h4+=d3>>32
-+
-+ and r1,r8,#-4
-+ and r8,r8,#3
-+ add r1,r1,r1,lsr#2 @ *=5
-+ adds r4,r4,r1
-+ adcs r5,r5,#0
-+ adcs r6,r6,#0
-+ adcs r7,r7,#0
-+ adc r8,r8,#0
-+
-+ cmp r0,lr @ done yet?
-+ bhi .Loop
-+
-+ ldr r0,[sp,#12]
-+ add sp,sp,#32
-+ stmdb r0,{r4-r8} @ store the result
-+
-+.Lno_data:
-+#if __ARM_ARCH__>=5
-+ ldmia sp!,{r3-r11,pc}
-+#else
-+ ldmia sp!,{r3-r11,lr}
-+ tst lr,#1
-+ moveq pc,lr @ be binary compatible with V4, yet
-+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-+#endif
-+.size poly1305_blocks,.-poly1305_blocks
-+.type poly1305_emit,%function
-+.align 5
-+poly1305_emit:
-+.Lpoly1305_emit:
-+ stmdb sp!,{r4-r11}
-+
-+ ldmia r0,{r3-r7}
-+
-+#if __ARM_ARCH__>=7
-+ ldr ip,[r0,#36] @ is_base2_26
-+
-+ adds r8,r3,r4,lsl#26 @ base 2^26 -> base 2^32
-+ mov r9,r4,lsr#6
-+ adcs r9,r9,r5,lsl#20
-+ mov r10,r5,lsr#12
-+ adcs r10,r10,r6,lsl#14
-+ mov r11,r6,lsr#18
-+ adcs r11,r11,r7,lsl#8
-+ mov r0,#0
-+ adc r0,r0,r7,lsr#24
-+
-+ tst ip,ip
-+ itttt ne
-+ movne r3,r8
-+ movne r4,r9
-+ movne r5,r10
-+ movne r6,r11
-+ it ne
-+ movne r7,r0
-+#endif
-+
-+ adds r8,r3,#5 @ compare to modulus
-+ adcs r9,r4,#0
-+ adcs r10,r5,#0
-+ adcs r11,r6,#0
-+ adc r0,r7,#0
-+ tst r0,#4 @ did it carry/borrow?
-+
-+#ifdef __thumb2__
-+ it ne
-+#endif
-+ movne r3,r8
-+ ldr r8,[r2,#0]
-+#ifdef __thumb2__
-+ it ne
-+#endif
-+ movne r4,r9
-+ ldr r9,[r2,#4]
-+#ifdef __thumb2__
-+ it ne
-+#endif
-+ movne r5,r10
-+ ldr r10,[r2,#8]
-+#ifdef __thumb2__
-+ it ne
-+#endif
-+ movne r6,r11
-+ ldr r11,[r2,#12]
-+
-+ adds r3,r3,r8
-+ adcs r4,r4,r9
-+ adcs r5,r5,r10
-+ adc r6,r6,r11
-+
-+#if __ARM_ARCH__>=7
-+# ifdef __ARMEB__
-+ rev r3,r3
-+ rev r4,r4
-+ rev r5,r5
-+ rev r6,r6
-+# endif
-+ str r3,[r1,#0]
-+ str r4,[r1,#4]
-+ str r5,[r1,#8]
-+ str r6,[r1,#12]
-+#else
-+ strb r3,[r1,#0]
-+ mov r3,r3,lsr#8
-+ strb r4,[r1,#4]
-+ mov r4,r4,lsr#8
-+ strb r5,[r1,#8]
-+ mov r5,r5,lsr#8
-+ strb r6,[r1,#12]
-+ mov r6,r6,lsr#8
-+
-+ strb r3,[r1,#1]
-+ mov r3,r3,lsr#8
-+ strb r4,[r1,#5]
-+ mov r4,r4,lsr#8
-+ strb r5,[r1,#9]
-+ mov r5,r5,lsr#8
-+ strb r6,[r1,#13]
-+ mov r6,r6,lsr#8
-+
-+ strb r3,[r1,#2]
-+ mov r3,r3,lsr#8
-+ strb r4,[r1,#6]
-+ mov r4,r4,lsr#8
-+ strb r5,[r1,#10]
-+ mov r5,r5,lsr#8
-+ strb r6,[r1,#14]
-+ mov r6,r6,lsr#8
-+
-+ strb r3,[r1,#3]
-+ strb r4,[r1,#7]
-+ strb r5,[r1,#11]
-+ strb r6,[r1,#15]
-+#endif
-+ ldmia sp!,{r4-r11}
-+#if __ARM_ARCH__>=5
-+ bx lr @ bx lr
-+#else
-+ tst lr,#1
-+ moveq pc,lr @ be binary compatible with V4, yet
-+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-+#endif
-+.size poly1305_emit,.-poly1305_emit
-+#if __ARM_MAX_ARCH__>=7
-+.fpu neon
-+
-+.type poly1305_init_neon,%function
-+.align 5
-+poly1305_init_neon:
-+.Lpoly1305_init_neon:
-+ ldr r3,[r0,#48] @ first table element
-+ cmp r3,#-1 @ is value impossible?
-+ bne .Lno_init_neon
-+
-+ ldr r4,[r0,#20] @ load key base 2^32
-+ ldr r5,[r0,#24]
-+ ldr r6,[r0,#28]
-+ ldr r7,[r0,#32]
-+
-+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
-+ mov r3,r4,lsr#26
-+ mov r4,r5,lsr#20
-+ orr r3,r3,r5,lsl#6
-+ mov r5,r6,lsr#14
-+ orr r4,r4,r6,lsl#12
-+ mov r6,r7,lsr#8
-+ orr r5,r5,r7,lsl#18
-+ and r3,r3,#0x03ffffff
-+ and r4,r4,#0x03ffffff
-+ and r5,r5,#0x03ffffff
-+
-+ vdup.32 d0,r2 @ r^1 in both lanes
-+ add r2,r3,r3,lsl#2 @ *5
-+ vdup.32 d1,r3
-+ add r3,r4,r4,lsl#2
-+ vdup.32 d2,r2
-+ vdup.32 d3,r4
-+ add r4,r5,r5,lsl#2
-+ vdup.32 d4,r3
-+ vdup.32 d5,r5
-+ add r5,r6,r6,lsl#2
-+ vdup.32 d6,r4
-+ vdup.32 d7,r6
-+ vdup.32 d8,r5
-+
-+ mov r5,#2 @ counter
-+
-+.Lsquare_neon:
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
-+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
-+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
-+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-+
-+ vmull.u32 q5,d0,d0[1]
-+ vmull.u32 q6,d1,d0[1]
-+ vmull.u32 q7,d3,d0[1]
-+ vmull.u32 q8,d5,d0[1]
-+ vmull.u32 q9,d7,d0[1]
-+
-+ vmlal.u32 q5,d7,d2[1]
-+ vmlal.u32 q6,d0,d1[1]
-+ vmlal.u32 q7,d1,d1[1]
-+ vmlal.u32 q8,d3,d1[1]
-+ vmlal.u32 q9,d5,d1[1]
-+
-+ vmlal.u32 q5,d5,d4[1]
-+ vmlal.u32 q6,d7,d4[1]
-+ vmlal.u32 q8,d1,d3[1]
-+ vmlal.u32 q7,d0,d3[1]
-+ vmlal.u32 q9,d3,d3[1]
-+
-+ vmlal.u32 q5,d3,d6[1]
-+ vmlal.u32 q8,d0,d5[1]
-+ vmlal.u32 q6,d5,d6[1]
-+ vmlal.u32 q7,d7,d6[1]
-+ vmlal.u32 q9,d1,d5[1]
-+
-+ vmlal.u32 q8,d7,d8[1]
-+ vmlal.u32 q5,d1,d8[1]
-+ vmlal.u32 q6,d3,d8[1]
-+ vmlal.u32 q7,d5,d8[1]
-+ vmlal.u32 q9,d0,d7[1]
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-+ @ and P. Schwabe
-+ @
-+ @ H0>>+H1>>+H2>>+H3>>+H4
-+ @ H3>>+H4>>*5+H0>>+H1
-+ @
-+ @ Trivia.
-+ @
-+ @ Result of multiplication of n-bit number by m-bit number is
-+ @ n+m bits wide. However! Even though 2^n is an n+1-bit number,
-+ @ m-bit number multiplied by 2^n is still n+m bits wide.
-+ @
-+ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
-+ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
-+ @ one is n+1 bits wide.
-+ @
-+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
-+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
-+ @ can be 27. However! In cases when their width exceeds 26 bits
-+ @ they are limited by 2^26+2^6. This in turn means that *sum*
-+ @ of the products with these values can still be viewed as sum
-+ @ of 52-bit numbers as long as the amount of addends is not a
-+ @ power of 2. For example,
-+ @
-+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
-+ @
-+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
-+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
-+ @ 8 * (2^52) or 2^55. However, the value is then multiplied
-+ @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
-+ @ which is less than 32 * (2^52) or 2^57. And when processing
-+ @ data we are looking at three times as many addends...
-+ @
-+ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
-+ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
-+ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
-+ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? The vmlal.u32
-+ @ instruction accepts 2x32-bit input and writes a 2x64-bit result.
-+ @ This means that the result of reduction has to be compressed upon
-+ @ loop wrap-around. This can be done in the process of reduction
-+ @ to minimize amount of instructions [as well as amount of
-+ @ 128-bit instructions, which benefits low-end processors], but
-+ @ one has to watch for H2 (which is narrower than H0) and 5*H4
-+ @ not being wider than 58 bits, so that result of right shift
-+ @ by 26 bits fits in 32 bits. This is also useful on x86,
-+ @ because it allows using paddd in place of paddq, which
-+ @ benefits Atom, where paddq is ridiculously slow.
-+
-+ vshr.u64 q15,q8,#26
-+ vmovn.i64 d16,q8
-+ vshr.u64 q4,q5,#26
-+ vmovn.i64 d10,q5
-+ vadd.i64 q9,q9,q15 @ h3 -> h4
-+ vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
-+ vadd.i64 q6,q6,q4 @ h0 -> h1
-+ vbic.i32 d10,#0xfc000000
-+
-+ vshrn.u64 d30,q9,#26
-+ vmovn.i64 d18,q9
-+ vshr.u64 q4,q6,#26
-+ vmovn.i64 d12,q6
-+ vadd.i64 q7,q7,q4 @ h1 -> h2
-+ vbic.i32 d18,#0xfc000000
-+ vbic.i32 d12,#0xfc000000
-+
-+ vadd.i32 d10,d10,d30
-+ vshl.u32 d30,d30,#2
-+ vshrn.u64 d8,q7,#26
-+ vmovn.i64 d14,q7
-+ vadd.i32 d10,d10,d30 @ h4 -> h0
-+ vadd.i32 d16,d16,d8 @ h2 -> h3
-+ vbic.i32 d14,#0xfc000000
-+
-+ vshr.u32 d30,d10,#26
-+ vbic.i32 d10,#0xfc000000
-+ vshr.u32 d8,d16,#26
-+ vbic.i32 d16,#0xfc000000
-+ vadd.i32 d12,d12,d30 @ h0 -> h1
-+ vadd.i32 d18,d18,d8 @ h3 -> h4
-+
-+ subs r5,r5,#1
-+ beq .Lsquare_break_neon
-+
-+ add r6,r0,#(48+0*9*4)
-+ add r7,r0,#(48+1*9*4)
-+
-+ vtrn.32 d0,d10 @ r^2:r^1
-+ vtrn.32 d3,d14
-+ vtrn.32 d5,d16
-+ vtrn.32 d1,d12
-+ vtrn.32 d7,d18
-+
-+ vshl.u32 d4,d3,#2 @ *5
-+ vshl.u32 d6,d5,#2
-+ vshl.u32 d2,d1,#2
-+ vshl.u32 d8,d7,#2
-+ vadd.i32 d4,d4,d3
-+ vadd.i32 d2,d2,d1
-+ vadd.i32 d6,d6,d5
-+ vadd.i32 d8,d8,d7
-+
-+ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
-+ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
-+ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
-+ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
-+ vst1.32 {d8[0]},[r6,:32]
-+ vst1.32 {d8[1]},[r7,:32]
-+
-+ b .Lsquare_neon
-+
-+.align 4
-+.Lsquare_break_neon:
-+ add r6,r0,#(48+2*4*9)
-+ add r7,r0,#(48+3*4*9)
-+
-+ vmov d0,d10 @ r^4:r^3
-+ vshl.u32 d2,d12,#2 @ *5
-+ vmov d1,d12
-+ vshl.u32 d4,d14,#2
-+ vmov d3,d14
-+ vshl.u32 d6,d16,#2
-+ vmov d5,d16
-+ vshl.u32 d8,d18,#2
-+ vmov d7,d18
-+ vadd.i32 d2,d2,d12
-+ vadd.i32 d4,d4,d14
-+ vadd.i32 d6,d6,d16
-+ vadd.i32 d8,d8,d18
-+
-+ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
-+ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
-+ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
-+ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
-+ vst1.32 {d8[0]},[r6]
-+ vst1.32 {d8[1]},[r7]
-+
-+.Lno_init_neon:
-+ bx lr @ bx lr
-+.size poly1305_init_neon,.-poly1305_init_neon
-+
-+.type poly1305_blocks_neon,%function
-+.align 5
-+poly1305_blocks_neon:
-+.Lpoly1305_blocks_neon:
-+ ldr ip,[r0,#36] @ is_base2_26
-+
-+ cmp r2,#64
-+ blo .Lpoly1305_blocks
-+
-+ stmdb sp!,{r4-r7}
-+ vstmdb sp!,{d8-d15} @ ABI specification says so
-+
-+ tst ip,ip @ is_base2_26?
-+ bne .Lbase2_26_neon
-+
-+ stmdb sp!,{r1-r3,lr}
-+ bl .Lpoly1305_init_neon
-+
-+ ldr r4,[r0,#0] @ load hash value base 2^32
-+ ldr r5,[r0,#4]
-+ ldr r6,[r0,#8]
-+ ldr r7,[r0,#12]
-+ ldr ip,[r0,#16]
-+
-+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
-+ mov r3,r4,lsr#26
-+ veor d10,d10,d10
-+ mov r4,r5,lsr#20
-+ orr r3,r3,r5,lsl#6
-+ veor d12,d12,d12
-+ mov r5,r6,lsr#14
-+ orr r4,r4,r6,lsl#12
-+ veor d14,d14,d14
-+ mov r6,r7,lsr#8
-+ orr r5,r5,r7,lsl#18
-+ veor d16,d16,d16
-+ and r3,r3,#0x03ffffff
-+ orr r6,r6,ip,lsl#24
-+ veor d18,d18,d18
-+ and r4,r4,#0x03ffffff
-+ mov r1,#1
-+ and r5,r5,#0x03ffffff
-+ str r1,[r0,#36] @ set is_base2_26
-+
-+ vmov.32 d10[0],r2
-+ vmov.32 d12[0],r3
-+ vmov.32 d14[0],r4
-+ vmov.32 d16[0],r5
-+ vmov.32 d18[0],r6
-+ adr r5,.Lzeros
-+
-+ ldmia sp!,{r1-r3,lr}
-+ b .Lhash_loaded
-+
-+.align 4
-+.Lbase2_26_neon:
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ load hash value
-+
-+ veor d10,d10,d10
-+ veor d12,d12,d12
-+ veor d14,d14,d14
-+ veor d16,d16,d16
-+ veor d18,d18,d18
-+ vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
-+ adr r5,.Lzeros
-+ vld1.32 {d18[0]},[r0]
-+ sub r0,r0,#16 @ rewind
-+
-+.Lhash_loaded:
-+ add r4,r1,#32
-+ mov r3,r3,lsl#24
-+ tst r2,#31
-+ beq .Leven
-+
-+ vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
-+ vmov.32 d28[0],r3
-+ sub r2,r2,#16
-+ add r4,r1,#32
-+
-+# ifdef __ARMEB__
-+ vrev32.8 q10,q10
-+ vrev32.8 q13,q13
-+ vrev32.8 q11,q11
-+ vrev32.8 q12,q12
-+# endif
-+ vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
-+ vshl.u32 d26,d26,#18
-+
-+ vsri.u32 d26,d24,#14
-+ vshl.u32 d24,d24,#12
-+ vadd.i32 d29,d28,d18 @ add hash value and move to #hi
-+
-+ vbic.i32 d26,#0xfc000000
-+ vsri.u32 d24,d22,#20
-+ vshl.u32 d22,d22,#6
-+
-+ vbic.i32 d24,#0xfc000000
-+ vsri.u32 d22,d20,#26
-+ vadd.i32 d27,d26,d16
-+
-+ vbic.i32 d20,#0xfc000000
-+ vbic.i32 d22,#0xfc000000
-+ vadd.i32 d25,d24,d14
-+
-+ vadd.i32 d21,d20,d10
-+ vadd.i32 d23,d22,d12
-+
-+ mov r7,r5
-+ add r6,r0,#48
-+
-+ cmp r2,r2
-+ b .Long_tail
-+
-+.align 4
-+.Leven:
-+ subs r2,r2,#64
-+ it lo
-+ movlo r4,r5
-+
-+ vmov.i32 q14,#1<<24 @ padbit, yes, always
-+ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
-+ add r1,r1,#64
-+ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
-+ add r4,r4,#64
-+ itt hi
-+ addhi r7,r0,#(48+1*9*4)
-+ addhi r6,r0,#(48+3*9*4)
-+
-+# ifdef __ARMEB__
-+ vrev32.8 q10,q10
-+ vrev32.8 q13,q13
-+ vrev32.8 q11,q11
-+ vrev32.8 q12,q12
-+# endif
-+ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
-+ vshl.u32 q13,q13,#18
-+
-+ vsri.u32 q13,q12,#14
-+ vshl.u32 q12,q12,#12
-+
-+ vbic.i32 q13,#0xfc000000
-+ vsri.u32 q12,q11,#20
-+ vshl.u32 q11,q11,#6
-+
-+ vbic.i32 q12,#0xfc000000
-+ vsri.u32 q11,q10,#26
-+
-+ vbic.i32 q10,#0xfc000000
-+ vbic.i32 q11,#0xfc000000
-+
-+ bls .Lskip_loop
-+
-+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
-+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
-+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
-+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
-+ b .Loop_neon
-+
-+.align 5
-+.Loop_neon:
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
-+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
-+ @ ___________________/
-+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
-+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
-+ @ ___________________/ ____________________/
-+ @
-+ @ Note that we start with inp[2:3]*r^2. This is because it
-+ @ doesn't depend on reduction in previous iteration.
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
-+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
-+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
-+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ inp[2:3]*r^2
-+
-+ vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
-+ vmull.u32 q7,d25,d0[1]
-+ vadd.i32 d20,d20,d10
-+ vmull.u32 q5,d21,d0[1]
-+ vadd.i32 d26,d26,d16
-+ vmull.u32 q8,d27,d0[1]
-+ vmlal.u32 q7,d23,d1[1]
-+ vadd.i32 d22,d22,d12
-+ vmull.u32 q6,d23,d0[1]
-+
-+ vadd.i32 d28,d28,d18
-+ vmull.u32 q9,d29,d0[1]
-+ subs r2,r2,#64
-+ vmlal.u32 q5,d29,d2[1]
-+ it lo
-+ movlo r4,r5
-+ vmlal.u32 q8,d25,d1[1]
-+ vld1.32 d8[1],[r7,:32]
-+ vmlal.u32 q6,d21,d1[1]
-+ vmlal.u32 q9,d27,d1[1]
-+
-+ vmlal.u32 q5,d27,d4[1]
-+ vmlal.u32 q8,d23,d3[1]
-+ vmlal.u32 q9,d25,d3[1]
-+ vmlal.u32 q6,d29,d4[1]
-+ vmlal.u32 q7,d21,d3[1]
-+
-+ vmlal.u32 q8,d21,d5[1]
-+ vmlal.u32 q5,d25,d6[1]
-+ vmlal.u32 q9,d23,d5[1]
-+ vmlal.u32 q6,d27,d6[1]
-+ vmlal.u32 q7,d29,d6[1]
-+
-+ vmlal.u32 q8,d29,d8[1]
-+ vmlal.u32 q5,d23,d8[1]
-+ vmlal.u32 q9,d21,d7[1]
-+ vmlal.u32 q6,d25,d8[1]
-+ vmlal.u32 q7,d27,d8[1]
-+
-+ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
-+ add r4,r4,#64
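-+	@ the next iteration's inp[2:3] is fetched here, mid-iteration,
-+	@ presumably so its load latency hides under the second multiply pass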
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ (hash+inp[0:1])*r^4 and accumulate
-+
-+ vmlal.u32 q8,d26,d0[0]
-+ vmlal.u32 q5,d20,d0[0]
-+ vmlal.u32 q9,d28,d0[0]
-+ vmlal.u32 q6,d22,d0[0]
-+ vmlal.u32 q7,d24,d0[0]
-+ vld1.32 d8[0],[r6,:32]
-+
-+ vmlal.u32 q8,d24,d1[0]
-+ vmlal.u32 q5,d28,d2[0]
-+ vmlal.u32 q9,d26,d1[0]
-+ vmlal.u32 q6,d20,d1[0]
-+ vmlal.u32 q7,d22,d1[0]
-+
-+ vmlal.u32 q8,d22,d3[0]
-+ vmlal.u32 q5,d26,d4[0]
-+ vmlal.u32 q9,d24,d3[0]
-+ vmlal.u32 q6,d28,d4[0]
-+ vmlal.u32 q7,d20,d3[0]
-+
-+ vmlal.u32 q8,d20,d5[0]
-+ vmlal.u32 q5,d24,d6[0]
-+ vmlal.u32 q9,d22,d5[0]
-+ vmlal.u32 q6,d26,d6[0]
-+ vmlal.u32 q8,d28,d8[0]
-+
-+ vmlal.u32 q7,d28,d6[0]
-+ vmlal.u32 q5,d22,d8[0]
-+ vmlal.u32 q9,d20,d7[0]
-+ vmov.i32 q14,#1<<24 @ padbit, yes, always
-+ vmlal.u32 q6,d24,d8[0]
-+ vmlal.u32 q7,d26,d8[0]
-+
-+ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
-+ add r1,r1,#64
-+# ifdef __ARMEB__
-+ vrev32.8 q10,q10
-+ vrev32.8 q11,q11
-+ vrev32.8 q12,q12
-+ vrev32.8 q13,q13
-+# endif
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
-+ @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
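-+	@ the h4 carry wraps around to h0 multiplied by 5 (c*5 = c + (c<<2),
-+	@ the vadd/vshl pair on d30 below), again courtesy of 2^130 == 5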
-+
-+ vshr.u64 q15,q8,#26
-+ vmovn.i64 d16,q8
-+ vshr.u64 q4,q5,#26
-+ vmovn.i64 d10,q5
-+ vadd.i64 q9,q9,q15 @ h3 -> h4
-+ vbic.i32 d16,#0xfc000000
-+ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
-+ vadd.i64 q6,q6,q4 @ h0 -> h1
-+ vshl.u32 q13,q13,#18
-+ vbic.i32 d10,#0xfc000000
-+
-+ vshrn.u64 d30,q9,#26
-+ vmovn.i64 d18,q9
-+ vshr.u64 q4,q6,#26
-+ vmovn.i64 d12,q6
-+ vadd.i64 q7,q7,q4 @ h1 -> h2
-+ vsri.u32 q13,q12,#14
-+ vbic.i32 d18,#0xfc000000
-+ vshl.u32 q12,q12,#12
-+ vbic.i32 d12,#0xfc000000
-+
-+ vadd.i32 d10,d10,d30
-+ vshl.u32 d30,d30,#2
-+ vbic.i32 q13,#0xfc000000
-+ vshrn.u64 d8,q7,#26
-+ vmovn.i64 d14,q7
-+ vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
-+ vsri.u32 q12,q11,#20
-+ vadd.i32 d16,d16,d8 @ h2 -> h3
-+ vshl.u32 q11,q11,#6
-+ vbic.i32 d14,#0xfc000000
-+ vbic.i32 q12,#0xfc000000
-+
-+ vshrn.u64 d30,q5,#26 @ re-narrow
-+ vmovn.i64 d10,q5
-+ vsri.u32 q11,q10,#26
-+ vbic.i32 q10,#0xfc000000
-+ vshr.u32 d8,d16,#26
-+ vbic.i32 d16,#0xfc000000
-+ vbic.i32 d10,#0xfc000000
-+ vadd.i32 d12,d12,d30 @ h0 -> h1
-+ vadd.i32 d18,d18,d8 @ h3 -> h4
-+ vbic.i32 q11,#0xfc000000
-+
-+ bhi .Loop_neon
-+
-+.Lskip_loop:
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-+
-+ add r7,r0,#(48+0*9*4)
-+ add r6,r0,#(48+1*9*4)
-+ adds r2,r2,#32
-+ it ne
-+ movne r2,#0
-+ bne .Long_tail
-+
-+ vadd.i32 d25,d24,d14 @ add hash value and move to #hi
-+ vadd.i32 d21,d20,d10
-+ vadd.i32 d27,d26,d16
-+ vadd.i32 d23,d22,d12
-+ vadd.i32 d29,d28,d18
-+
-+.Long_tail:
-+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
-+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
-+
-+ vadd.i32 d24,d24,d14 @ can be redundant
-+ vmull.u32 q7,d25,d0
-+ vadd.i32 d20,d20,d10
-+ vmull.u32 q5,d21,d0
-+ vadd.i32 d26,d26,d16
-+ vmull.u32 q8,d27,d0
-+ vadd.i32 d22,d22,d12
-+ vmull.u32 q6,d23,d0
-+ vadd.i32 d28,d28,d18
-+ vmull.u32 q9,d29,d0
-+
-+ vmlal.u32 q5,d29,d2
-+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
-+ vmlal.u32 q8,d25,d1
-+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
-+ vmlal.u32 q6,d21,d1
-+ vmlal.u32 q9,d27,d1
-+ vmlal.u32 q7,d23,d1
-+
-+ vmlal.u32 q8,d23,d3
-+ vld1.32 d8[1],[r7,:32]
-+ vmlal.u32 q5,d27,d4
-+ vld1.32 d8[0],[r6,:32]
-+ vmlal.u32 q9,d25,d3
-+ vmlal.u32 q6,d29,d4
-+ vmlal.u32 q7,d21,d3
-+
-+ vmlal.u32 q8,d21,d5
-+ it ne
-+ addne r7,r0,#(48+2*9*4)
-+ vmlal.u32 q5,d25,d6
-+ it ne
-+ addne r6,r0,#(48+3*9*4)
-+ vmlal.u32 q9,d23,d5
-+ vmlal.u32 q6,d27,d6
-+ vmlal.u32 q7,d29,d6
-+
-+ vmlal.u32 q8,d29,d8
-+ vorn q0,q0,q0 @ all-ones, can be redundant
-+ vmlal.u32 q5,d23,d8
-+ vshr.u64 q0,q0,#38
-+ vmlal.u32 q9,d21,d7
-+ vmlal.u32 q6,d25,d8
-+ vmlal.u32 q7,d27,d8
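-+	@ q0 now holds 2^26-1 in each 64-bit lane (all-ones shifted right
-+	@ by 38), the limb mask used by the final reduction further down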
-+
-+ beq .Lshort_tail
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
-+
-+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
-+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
-+
-+ vmlal.u32 q7,d24,d0
-+ vmlal.u32 q5,d20,d0
-+ vmlal.u32 q8,d26,d0
-+ vmlal.u32 q6,d22,d0
-+ vmlal.u32 q9,d28,d0
-+
-+ vmlal.u32 q5,d28,d2
-+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
-+ vmlal.u32 q8,d24,d1
-+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
-+ vmlal.u32 q6,d20,d1
-+ vmlal.u32 q9,d26,d1
-+ vmlal.u32 q7,d22,d1
-+
-+ vmlal.u32 q8,d22,d3
-+ vld1.32 d8[1],[r7,:32]
-+ vmlal.u32 q5,d26,d4
-+ vld1.32 d8[0],[r6,:32]
-+ vmlal.u32 q9,d24,d3
-+ vmlal.u32 q6,d28,d4
-+ vmlal.u32 q7,d20,d3
-+
-+ vmlal.u32 q8,d20,d5
-+ vmlal.u32 q5,d24,d6
-+ vmlal.u32 q9,d22,d5
-+ vmlal.u32 q6,d26,d6
-+ vmlal.u32 q7,d28,d6
-+
-+ vmlal.u32 q8,d28,d8
-+ vorn q0,q0,q0 @ all-ones
-+ vmlal.u32 q5,d22,d8
-+ vshr.u64 q0,q0,#38
-+ vmlal.u32 q9,d20,d7
-+ vmlal.u32 q6,d24,d8
-+ vmlal.u32 q7,d26,d8
-+
-+.Lshort_tail:
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ horizontal addition
-+
-+ vadd.i64 d16,d16,d17
-+ vadd.i64 d10,d10,d11
-+ vadd.i64 d18,d18,d19
-+ vadd.i64 d12,d12,d13
-+ vadd.i64 d14,d14,d15
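-+	@ each q accumulator held two interleaved lanes of the hash; the
-+	@ pairwise vadd.i64 above folds them into a single set of limbs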
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ lazy reduction, but without narrowing
-+
-+ vshr.u64 q15,q8,#26
-+ vand.i64 q8,q8,q0
-+ vshr.u64 q4,q5,#26
-+ vand.i64 q5,q5,q0
-+ vadd.i64 q9,q9,q15 @ h3 -> h4
-+ vadd.i64 q6,q6,q4 @ h0 -> h1
-+
-+ vshr.u64 q15,q9,#26
-+ vand.i64 q9,q9,q0
-+ vshr.u64 q4,q6,#26
-+ vand.i64 q6,q6,q0
-+ vadd.i64 q7,q7,q4 @ h1 -> h2
-+
-+ vadd.i64 q5,q5,q15
-+ vshl.u64 q15,q15,#2
-+ vshr.u64 q4,q7,#26
-+ vand.i64 q7,q7,q0
-+ vadd.i64 q5,q5,q15 @ h4 -> h0
-+ vadd.i64 q8,q8,q4 @ h2 -> h3
-+
-+ vshr.u64 q15,q5,#26
-+ vand.i64 q5,q5,q0
-+ vshr.u64 q4,q8,#26
-+ vand.i64 q8,q8,q0
-+ vadd.i64 q6,q6,q15 @ h0 -> h1
-+ vadd.i64 q9,q9,q4 @ h3 -> h4
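-+	@ two carry passes leave the limbs reduced far enough either to
-+	@ re-enter .Leven or to be narrowed and stored below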
-+
-+ cmp r2,#0
-+ bne .Leven
-+
-+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-+ @ store hash value
-+
-+ vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
-+ vst1.32 {d18[0]},[r0]
-+
-+ vldmia sp!,{d8-d15} @ epilogue
-+ ldmia sp!,{r4-r7}
-+ bx lr @ bx lr
-+.size poly1305_blocks_neon,.-poly1305_blocks_neon
-+
-+.align 5
-+.Lzeros:
-+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-+#ifndef __KERNEL__
-+.LOPENSSL_armcap:
-+# ifdef _WIN32
-+.word OPENSSL_armcap_P
-+# else
-+.word OPENSSL_armcap_P-.Lpoly1305_init
-+# endif
-+.comm OPENSSL_armcap_P,4,4
-+.hidden OPENSSL_armcap_P
-+#endif
-+#endif
-+.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by @dot-asm"
-+.align 2
---- /dev/null
-+++ b/arch/arm/crypto/poly1305-glue.c
-@@ -0,0 +1,276 @@
-+// SPDX-License-Identifier: GPL-2.0
-+/*
-+ * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
-+ *
-+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
-+ */
-+
-+#include <asm/hwcap.h>
-+#include <asm/neon.h>
-+#include <asm/simd.h>
-+#include <asm/unaligned.h>
-+#include <crypto/algapi.h>
-+#include <crypto/internal/hash.h>
-+#include <crypto/internal/poly1305.h>
-+#include <crypto/internal/simd.h>
-+#include <linux/cpufeature.h>
-+#include <linux/crypto.h>
-+#include <linux/jump_label.h>
-+#include <linux/module.h>
-+
-+void poly1305_init_arm(void *state, const u8 *key);
-+void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
-+void poly1305_emit_arm(void *state, __le32 *digest, const u32 *nonce);
-+
-+void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
-+{
-+}
-+
-+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-+
-+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
-+{
-+ poly1305_init_arm(&dctx->h, key);
-+ dctx->s[0] = get_unaligned_le32(key + 16);
-+ dctx->s[1] = get_unaligned_le32(key + 20);
-+ dctx->s[2] = get_unaligned_le32(key + 24);
-+ dctx->s[3] = get_unaligned_le32(key + 28);
-+ dctx->buflen = 0;
-+}
-+EXPORT_SYMBOL(poly1305_init_arch);
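-+
-+/*
-+ * The asm init consumes only the first half of the key (the clamped r);
-+ * the second half, s, is stashed here and folded in at the very end,
-+ * since mac = (h + s) mod 2^128.
-+ */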
-+
-+static int arm_poly1305_init(struct shash_desc *desc)
-+{
-+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-+
-+ dctx->buflen = 0;
-+ dctx->rset = 0;
-+ dctx->sset = false;
-+
-+ return 0;
-+}
-+
-+static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
-+ u32 len, u32 hibit, bool do_neon)
-+{
-+ if (unlikely(!dctx->sset)) {
-+ if (!dctx->rset) {
-+ poly1305_init_arm(&dctx->h, src);
-+ src += POLY1305_BLOCK_SIZE;
-+ len -= POLY1305_BLOCK_SIZE;
-+ dctx->rset = 1;
-+ }
-+ if (len >= POLY1305_BLOCK_SIZE) {
-+ dctx->s[0] = get_unaligned_le32(src + 0);
-+ dctx->s[1] = get_unaligned_le32(src + 4);
-+ dctx->s[2] = get_unaligned_le32(src + 8);
-+ dctx->s[3] = get_unaligned_le32(src + 12);
-+ src += POLY1305_BLOCK_SIZE;
-+ len -= POLY1305_BLOCK_SIZE;
-+ dctx->sset = true;
-+ }
-+ if (len < POLY1305_BLOCK_SIZE)
-+ return;
-+ }
-+
-+ len &= ~(POLY1305_BLOCK_SIZE - 1);
-+
-+ if (static_branch_likely(&have_neon) && likely(do_neon))
-+ poly1305_blocks_neon(&dctx->h, src, len, hibit);
-+ else
-+ poly1305_blocks_arm(&dctx->h, src, len, hibit);
-+}
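-+
-+/*
-+ * For the shash interface the 32-byte key arrives prepended to the
-+ * data, so the first two blocks seen above are consumed as r and s
-+ * (tracked via rset/sset) before any real message bytes are hashed.
-+ */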
-+
-+static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
-+ const u8 *src, u32 len, bool do_neon)
-+{
-+ if (unlikely(dctx->buflen)) {
-+ u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
-+
-+ memcpy(dctx->buf + dctx->buflen, src, bytes);
-+ src += bytes;
-+ len -= bytes;
-+ dctx->buflen += bytes;
-+
-+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-+ arm_poly1305_blocks(dctx, dctx->buf,
-+ POLY1305_BLOCK_SIZE, 1, false);
-+ dctx->buflen = 0;
-+ }
-+ }
-+
-+ if (likely(len >= POLY1305_BLOCK_SIZE)) {
-+ arm_poly1305_blocks(dctx, src, len, 1, do_neon);
-+ src += round_down(len, POLY1305_BLOCK_SIZE);
-+ len %= POLY1305_BLOCK_SIZE;
-+ }
-+
-+ if (unlikely(len)) {
-+ dctx->buflen = len;
-+ memcpy(dctx->buf, src, len);
-+ }
-+}
-+
-+static int arm_poly1305_update(struct shash_desc *desc,
-+ const u8 *src, unsigned int srclen)
-+{
-+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-+
-+ arm_poly1305_do_update(dctx, src, srclen, false);
-+ return 0;
-+}
-+
-+static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
-+ const u8 *src,
-+ unsigned int srclen)
-+{
-+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-+ bool do_neon = crypto_simd_usable() && srclen > 128;
-+
-+ if (static_branch_likely(&have_neon) && do_neon)
-+ kernel_neon_begin();
-+ arm_poly1305_do_update(dctx, src, srclen, do_neon);
-+ if (static_branch_likely(&have_neon) && do_neon)
-+ kernel_neon_end();
-+ return 0;
-+}
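-+
-+/*
-+ * NEON is only engaged for inputs over 128 bytes, presumably because
-+ * kernel_neon_begin()/kernel_neon_end() cost more than the scalar code
-+ * saves on short messages.
-+ */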
-+
-+void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
-+ unsigned int nbytes)
-+{
-+ bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-+ crypto_simd_usable();
-+
-+ if (unlikely(dctx->buflen)) {
-+ u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
-+
-+ memcpy(dctx->buf + dctx->buflen, src, bytes);
-+ src += bytes;
-+ nbytes -= bytes;
-+ dctx->buflen += bytes;
-+
-+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-+ poly1305_blocks_arm(&dctx->h, dctx->buf,
-+ POLY1305_BLOCK_SIZE, 1);
-+ dctx->buflen = 0;
-+ }
-+ }
-+
-+ if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
-+ unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
-+
-+ if (static_branch_likely(&have_neon) && do_neon) {
-+ kernel_neon_begin();
-+ poly1305_blocks_neon(&dctx->h, src, len, 1);
-+ kernel_neon_end();
-+ } else {
-+ poly1305_blocks_arm(&dctx->h, src, len, 1);
-+ }
-+ src += len;
-+ nbytes %= POLY1305_BLOCK_SIZE;
-+ }
-+
-+ if (unlikely(nbytes)) {
-+ dctx->buflen = nbytes;
-+ memcpy(dctx->buf, src, nbytes);
-+ }
-+}
-+EXPORT_SYMBOL(poly1305_update_arch);
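-+
-+/*
-+ * Minimal sketch of the exported library interface (an illustrative
-+ * caller only, not part of this patch; "key" is any 32-byte one-time
-+ * key, "msg"/"len" an arbitrary buffer):
-+ *
-+ *	struct poly1305_desc_ctx ctx;
-+ *	u8 mac[POLY1305_DIGEST_SIZE];
-+ *
-+ *	poly1305_init_arch(&ctx, key);
-+ *	poly1305_update_arch(&ctx, msg, len);	// may be called repeatedly
-+ *	poly1305_final_arch(&ctx, mac);		// emits 16-byte tag, wipes ctx
-+ */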
-+
-+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
-+{
-+ __le32 digest[4];
-+ u64 f = 0;
-+
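-+	/*
-+	 * A trailing partial block gets the spec's padding: a single 1 byte
-+	 * appended, then zero-fill, then a blocks call with hibit = 0,
-+	 * since that explicit 1 stands in for the usual implicit 2^128.
-+	 */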
-+ if (unlikely(dctx->buflen)) {
-+ dctx->buf[dctx->buflen++] = 1;
-+ memset(dctx->buf + dctx->buflen, 0,
-+ POLY1305_BLOCK_SIZE - dctx->buflen);
-+ poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
-+ }
-+
-+ poly1305_emit_arm(&dctx->h, digest, dctx->s);
-+
-+ /* mac = (h + s) % (2^128) */
-+ f = (f >> 32) + le32_to_cpu(digest[0]);
-+ put_unaligned_le32(f, dst);
-+ f = (f >> 32) + le32_to_cpu(digest[1]);
-+ put_unaligned_le32(f, dst + 4);
-+ f = (f >> 32) + le32_to_cpu(digest[2]);
-+ put_unaligned_le32(f, dst + 8);
-+ f = (f >> 32) + le32_to_cpu(digest[3]);
-+ put_unaligned_le32(f, dst + 12);
-+
-+ *dctx = (struct poly1305_desc_ctx){};
-+}
-+EXPORT_SYMBOL(poly1305_final_arch);
-+
-+static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
-+{
-+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-+
-+ if (unlikely(!dctx->sset))
-+ return -ENOKEY;
-+
-+ poly1305_final_arch(dctx, dst);
-+ return 0;
-+}
-+
-+static struct shash_alg arm_poly1305_algs[] = {{
-+ .init = arm_poly1305_init,
-+ .update = arm_poly1305_update,
-+ .final = arm_poly1305_final,
-+ .digestsize = POLY1305_DIGEST_SIZE,
-+ .descsize = sizeof(struct poly1305_desc_ctx),
-+
-+ .base.cra_name = "poly1305",
-+ .base.cra_driver_name = "poly1305-arm",
-+ .base.cra_priority = 150,
-+ .base.cra_blocksize = POLY1305_BLOCK_SIZE,
-+ .base.cra_module = THIS_MODULE,
-+#ifdef CONFIG_KERNEL_MODE_NEON
-+}, {
-+ .init = arm_poly1305_init,
-+ .update = arm_poly1305_update_neon,
-+ .final = arm_poly1305_final,
-+ .digestsize = POLY1305_DIGEST_SIZE,
-+ .descsize = sizeof(struct poly1305_desc_ctx),
-+
-+ .base.cra_name = "poly1305",
-+ .base.cra_driver_name = "poly1305-neon",
-+ .base.cra_priority = 200,
-+ .base.cra_blocksize = POLY1305_BLOCK_SIZE,
-+ .base.cra_module = THIS_MODULE,
-+#endif
-+}};
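-+
-+/*
-+ * The NEON variant's higher cra_priority (200 vs 150) makes the crypto
-+ * API prefer it over the scalar one whenever both are registered.
-+ */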
-+
-+static int __init arm_poly1305_mod_init(void)
-+{
-+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-+ (elf_hwcap & HWCAP_NEON))
-+ static_branch_enable(&have_neon);
-+ else
-+ /* register only the first entry */
-+ return crypto_register_shash(&arm_poly1305_algs[0]);
-+
-+ return crypto_register_shashes(arm_poly1305_algs,
-+ ARRAY_SIZE(arm_poly1305_algs));
-+}
-+
-+static void __exit arm_poly1305_mod_exit(void)
-+{
-+ if (!static_branch_likely(&have_neon)) {
-+ crypto_unregister_shash(&arm_poly1305_algs[0]);
-+ return;
-+ }
-+ crypto_unregister_shashes(arm_poly1305_algs,
-+ ARRAY_SIZE(arm_poly1305_algs));
-+}
-+
-+module_init(arm_poly1305_mod_init);
-+module_exit(arm_poly1305_mod_exit);
-+
-+MODULE_LICENSE("GPL v2");
-+MODULE_ALIAS_CRYPTO("poly1305");
-+MODULE_ALIAS_CRYPTO("poly1305-arm");
-+MODULE_ALIAS_CRYPTO("poly1305-neon");
---- a/lib/crypto/Kconfig
-+++ b/lib/crypto/Kconfig
-@@ -40,7 +40,7 @@ config CRYPTO_LIB_DES
- config CRYPTO_LIB_POLY1305_RSIZE
- int
- default 4 if X86_64
-- default 9 if ARM64
-+ default 9 if ARM || ARM64
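-+	# nine u32s per cached power of r on ARM/arm64: five 26-bit limbs
-+	# plus four premultiplied 5*r values (cf. the 9*4-byte strides in
-+	# the NEON code above)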
- default 1
-
- config CRYPTO_ARCH_HAVE_LIB_POLY1305