diff --git a/target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch b/target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch
new file mode 100644
index 0000000000..d48235ca94
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch
@@ -0,0 +1,2776 @@
+From 588765ccad76f9f65f09e1dcadc464d22441c889 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:25 +0100
+Subject: [PATCH 019/124] crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS
+ NEON implementation
+
+commit a6b803b3ddc793d6db0c16f12fc12d30d20fa9cc upstream.
+
+This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation
+for NEON authored by Andy Polyakov, and contributed by him to the OpenSSL
+project. The file 'poly1305-armv4.pl' is taken straight from this upstream
+GitHub repository [0] at commit ec55a08dc0244ce570c4fc7cade330c60798952f,
+and already contains all the changes required to build it as part of a
+Linux kernel module.
+
+[0] https://github.com/dot-asm/cryptogams
+
+Co-developed-by: Andy Polyakov <appro@cryptogams.org>
+Signed-off-by: Andy Polyakov <appro@cryptogams.org>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/Kconfig | 5 +
+ arch/arm/crypto/Makefile | 12 +-
+ arch/arm/crypto/poly1305-armv4.pl | 1236 +++++++++++++++++++++++
+ arch/arm/crypto/poly1305-core.S_shipped | 1158 +++++++++++++++++++++
+ arch/arm/crypto/poly1305-glue.c | 276 +++++
+ lib/crypto/Kconfig | 2 +-
+ 6 files changed, 2687 insertions(+), 2 deletions(-)
+ create mode 100644 arch/arm/crypto/poly1305-armv4.pl
+ create mode 100644 arch/arm/crypto/poly1305-core.S_shipped
+ create mode 100644 arch/arm/crypto/poly1305-glue.c
+
+--- a/arch/arm/crypto/Kconfig
++++ b/arch/arm/crypto/Kconfig
+@@ -131,6 +131,11 @@ config CRYPTO_CHACHA20_NEON
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
++config CRYPTO_POLY1305_ARM
++ tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
++ select CRYPTO_HASH
++ select CRYPTO_ARCH_HAVE_LIB_POLY1305
++
+ config CRYPTO_NHPOLY1305_NEON
+ tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
+ depends on KERNEL_MODE_NEON
+--- a/arch/arm/crypto/Makefile
++++ b/arch/arm/crypto/Makefile
+@@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sh
+ obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
+ obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
+ obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
++obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
+ obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
+
+ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
+@@ -55,12 +56,16 @@ crct10dif-arm-ce-y := crct10dif-ce-core.
+ crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+ chacha-neon-y := chacha-scalar-core.o chacha-glue.o
+ chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
++poly1305-arm-y := poly1305-core.o poly1305-glue.o
+ nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
+
+ ifdef REGENERATE_ARM_CRYPTO
+ quiet_cmd_perl = PERL $@
+ cmd_perl = $(PERL) $(<) > $(@)
+
++$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
++ $(call cmd,perl)
++
+ $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+ $(call cmd,perl)
+
+@@ -68,4 +73,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha
+ $(call cmd,perl)
+ endif
+
+-clean-files += sha256-core.S sha512-core.S
++clean-files += poly1305-core.S sha256-core.S sha512-core.S
++
++# massage the perlasm code a bit so we only get the NEON routine if we need it
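++# (the perlasm guards the NEON code with __ARM_MAX_ARCH__, which the
++# kernel build maps to __LINUX_ARM_ARCH__: forcing it to 7 assembles
++# the NEON routines, forcing it to 5 leaves only the scalar code)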
++poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
++poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
++AFLAGS_poly1305-core.o += $(poly1305-aflags-y)
+--- /dev/null
++++ b/arch/arm/crypto/poly1305-armv4.pl
+@@ -0,0 +1,1236 @@
++#!/usr/bin/env perl
++# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
++#
++# ====================================================================
++# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
++# project.
++# ====================================================================
++#
++# IALU(*)/gcc-4.4 NEON
++#
++# ARM11xx(ARMv6) 7.78/+100% -
++# Cortex-A5 6.35/+130% 3.00
++# Cortex-A8 6.25/+115% 2.36
++# Cortex-A9 5.10/+95% 2.55
++# Cortex-A15 3.85/+85% 1.25(**)
++# Snapdragon S4 5.70/+100% 1.48(**)
++#
++# (*)	this is for -march=armv6, i.e. with a bunch of ldrb loading data;
++# (**) these are trade-off results, they can be improved by ~8% but at
++# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
++#	to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
++
++$flavour = shift;
++if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
++else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
++
++if ($flavour && $flavour ne "void") {
++ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++ die "can't locate arm-xlate.pl";
++
++ open STDOUT,"| \"$^X\" $xlate $flavour $output";
++} else {
++ open STDOUT,">$output";
++}
++
++($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
++
++$code.=<<___;
++#ifndef __KERNEL__
++# include "arm_arch.h"
++#else
++# define __ARM_ARCH__ __LINUX_ARM_ARCH__
++# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
++# define poly1305_init poly1305_init_arm
++# define poly1305_blocks poly1305_blocks_arm
++# define poly1305_emit poly1305_emit_arm
++.globl poly1305_blocks_neon
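++@ the Poly1305 library API already owns the poly1305_init/blocks/emit
++@ names in the kernel, so the asm entry points are renamed with an
++@ _arm suffix; poly1305_blocks_neon stays global so the C glue can
++@ call it directly when NEON is usable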
++#endif
++
++#if defined(__thumb2__)
++.syntax unified
++.thumb
++#else
++.code 32
++#endif
++
++.text
++
++.globl poly1305_emit
++.globl poly1305_blocks
++.globl poly1305_init
++.type poly1305_init,%function
++.align 5
++poly1305_init:
++.Lpoly1305_init:
++ stmdb sp!,{r4-r11}
++
++ eor r3,r3,r3
++ cmp $inp,#0
++ str r3,[$ctx,#0] @ zero hash value
++ str r3,[$ctx,#4]
++ str r3,[$ctx,#8]
++ str r3,[$ctx,#12]
++ str r3,[$ctx,#16]
++ str r3,[$ctx,#36] @ clear is_base2_26
++ add $ctx,$ctx,#20
++
++#ifdef __thumb2__
++ it eq
++#endif
++ moveq r0,#0
++ beq .Lno_key
++
++#if __ARM_MAX_ARCH__>=7
++ mov r3,#-1
++ str r3,[$ctx,#28] @ impossible key power value
++# ifndef __KERNEL__
++ adr r11,.Lpoly1305_init
++ ldr r12,.LOPENSSL_armcap
++# endif
++#endif
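++	@ load the 16-byte key r and clamp it as the Poly1305 spec demands:
++	@ the top 4 bits of every 32-bit word and the low 2 bits of the
++	@ upper three words are cleared (masks 0x0fffffff and 0x0ffffffc)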
++ ldrb r4,[$inp,#0]
++ mov r10,#0x0fffffff
++ ldrb r5,[$inp,#1]
++ and r3,r10,#-4 @ 0x0ffffffc
++ ldrb r6,[$inp,#2]
++ ldrb r7,[$inp,#3]
++ orr r4,r4,r5,lsl#8
++ ldrb r5,[$inp,#4]
++ orr r4,r4,r6,lsl#16
++ ldrb r6,[$inp,#5]
++ orr r4,r4,r7,lsl#24
++ ldrb r7,[$inp,#6]
++ and r4,r4,r10
++
++#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++# if !defined(_WIN32)
++ ldr r12,[r11,r12] @ OPENSSL_armcap_P
++# endif
++# if defined(__APPLE__) || defined(_WIN32)
++ ldr r12,[r12]
++# endif
++#endif
++ ldrb r8,[$inp,#7]
++ orr r5,r5,r6,lsl#8
++ ldrb r6,[$inp,#8]
++ orr r5,r5,r7,lsl#16
++ ldrb r7,[$inp,#9]
++ orr r5,r5,r8,lsl#24
++ ldrb r8,[$inp,#10]
++ and r5,r5,r3
++
++#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++ tst r12,#ARMV7_NEON @ check for NEON
++# ifdef __thumb2__
++ adr r9,.Lpoly1305_blocks_neon
++ adr r11,.Lpoly1305_blocks
++ it ne
++ movne r11,r9
++ adr r12,.Lpoly1305_emit
++ orr r11,r11,#1 @ thumb-ify addresses
++ orr r12,r12,#1
++# else
++ add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
++ ite eq
++ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
++ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
++# endif
++#endif
++ ldrb r9,[$inp,#11]
++ orr r6,r6,r7,lsl#8
++ ldrb r7,[$inp,#12]
++ orr r6,r6,r8,lsl#16
++ ldrb r8,[$inp,#13]
++ orr r6,r6,r9,lsl#24
++ ldrb r9,[$inp,#14]
++ and r6,r6,r3
++
++ ldrb r10,[$inp,#15]
++ orr r7,r7,r8,lsl#8
++ str r4,[$ctx,#0]
++ orr r7,r7,r9,lsl#16
++ str r5,[$ctx,#4]
++ orr r7,r7,r10,lsl#24
++ str r6,[$ctx,#8]
++ and r7,r7,r3
++ str r7,[$ctx,#12]
++#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++ stmia r2,{r11,r12} @ fill functions table
++ mov r0,#1
++#else
++ mov r0,#0
++#endif
++.Lno_key:
++ ldmia sp!,{r4-r11}
++#if __ARM_ARCH__>=5
++ ret @ bx lr
++#else
++ tst lr,#1
++ moveq pc,lr @ be binary compatible with V4, yet
++ bx lr @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_init,.-poly1305_init
++___
++{
++my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
++my ($s1,$s2,$s3)=($r1,$r2,$r3);
++
++$code.=<<___;
++.type poly1305_blocks,%function
++.align 5
++poly1305_blocks:
++.Lpoly1305_blocks:
++ stmdb sp!,{r3-r11,lr}
++
++ ands $len,$len,#-16
++ beq .Lno_data
++
++ add $len,$len,$inp @ end pointer
++ sub sp,sp,#32
++
++#if __ARM_ARCH__<7
++ ldmia $ctx,{$h0-$r3} @ load context
++ add $ctx,$ctx,#20
++ str $len,[sp,#16] @ offload stuff
++ str $ctx,[sp,#12]
++#else
++ ldr lr,[$ctx,#36] @ is_base2_26
++ ldmia $ctx!,{$h0-$h4} @ load hash value
++ str $len,[sp,#16] @ offload stuff
++ str $ctx,[sp,#12]
++
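++	@ the hash may be stored as five base 2^26 limbs (set by the NEON
++	@ path); repack h0..h4 into base 2^32 words: w0=h0|h1<<26,
++	@ w1=h1>>6|h2<<20, w2=h2>>12|h3<<14, w3=h3>>18|h4<<8, carry=h4>>24,
++	@ with adcs propagating carries since the limbs need not be
++	@ fully reduced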
++ adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
++ mov $r1,$h1,lsr#6
++ adcs $r1,$r1,$h2,lsl#20
++ mov $r2,$h2,lsr#12
++ adcs $r2,$r2,$h3,lsl#14
++ mov $r3,$h3,lsr#18
++ adcs $r3,$r3,$h4,lsl#8
++ mov $len,#0
++ teq lr,#0
++ str $len,[$ctx,#16] @ clear is_base2_26
++ adc $len,$len,$h4,lsr#24
++
++ itttt ne
++ movne $h0,$r0 @ choose between radixes
++ movne $h1,$r1
++ movne $h2,$r2
++ movne $h3,$r3
++ ldmia $ctx,{$r0-$r3} @ load key
++ it ne
++ movne $h4,$len
++#endif
++
++ mov lr,$inp
++ cmp $padbit,#0
++ str $r1,[sp,#20]
++ str $r2,[sp,#24]
++ str $r3,[sp,#28]
++ b .Loop
++
++.align 4
++.Loop:
++#if __ARM_ARCH__<7
++ ldrb r0,[lr],#16 @ load input
++# ifdef __thumb2__
++ it hi
++# endif
++ addhi $h4,$h4,#1 @ 1<<128
++ ldrb r1,[lr,#-15]
++ ldrb r2,[lr,#-14]
++ ldrb r3,[lr,#-13]
++ orr r1,r0,r1,lsl#8
++ ldrb r0,[lr,#-12]
++ orr r2,r1,r2,lsl#16
++ ldrb r1,[lr,#-11]
++ orr r3,r2,r3,lsl#24
++ ldrb r2,[lr,#-10]
++ adds $h0,$h0,r3 @ accumulate input
++
++ ldrb r3,[lr,#-9]
++ orr r1,r0,r1,lsl#8
++ ldrb r0,[lr,#-8]
++ orr r2,r1,r2,lsl#16
++ ldrb r1,[lr,#-7]
++ orr r3,r2,r3,lsl#24
++ ldrb r2,[lr,#-6]
++ adcs $h1,$h1,r3
++
++ ldrb r3,[lr,#-5]
++ orr r1,r0,r1,lsl#8
++ ldrb r0,[lr,#-4]
++ orr r2,r1,r2,lsl#16
++ ldrb r1,[lr,#-3]
++ orr r3,r2,r3,lsl#24
++ ldrb r2,[lr,#-2]
++ adcs $h2,$h2,r3
++
++ ldrb r3,[lr,#-1]
++ orr r1,r0,r1,lsl#8
++ str lr,[sp,#8] @ offload input pointer
++ orr r2,r1,r2,lsl#16
++ add $s1,$r1,$r1,lsr#2
++ orr r3,r2,r3,lsl#24
++#else
++ ldr r0,[lr],#16 @ load input
++ it hi
++ addhi $h4,$h4,#1 @ padbit
++ ldr r1,[lr,#-12]
++ ldr r2,[lr,#-8]
++ ldr r3,[lr,#-4]
++# ifdef __ARMEB__
++ rev r0,r0
++ rev r1,r1
++ rev r2,r2
++ rev r3,r3
++# endif
++ adds $h0,$h0,r0 @ accumulate input
++ str lr,[sp,#8] @ offload input pointer
++ adcs $h1,$h1,r1
++ add $s1,$r1,$r1,lsr#2
++ adcs $h2,$h2,r2
++#endif
++ add $s2,$r2,$r2,lsr#2
++ adcs $h3,$h3,r3
++ add $s3,$r3,$r3,lsr#2
++
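++	@ h *= r mod 2^130-5 on 32-bit limbs: cross terms of weight 2^128
++	@ and above fold back via 2^128 == 5/4 (mod 2^130-5); the clamped
++	@ r1,r2,r3 are multiples of 4, so sN = rN + (rN>>2) is exactly
++	@ 5*rN/4 and a product hJ*sN lands back at weight 2^(32*(J+N-4))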
++ umull r2,r3,$h1,$r0
++ adc $h4,$h4,#0
++ umull r0,r1,$h0,$r0
++ umlal r2,r3,$h4,$s1
++ umlal r0,r1,$h3,$s1
++ ldr $r1,[sp,#20] @ reload $r1
++ umlal r2,r3,$h2,$s3
++ umlal r0,r1,$h1,$s3
++ umlal r2,r3,$h3,$s2
++ umlal r0,r1,$h2,$s2
++ umlal r2,r3,$h0,$r1
++ str r0,[sp,#0] @ future $h0
++ mul r0,$s2,$h4
++ ldr $r2,[sp,#24] @ reload $r2
++ adds r2,r2,r1 @ d1+=d0>>32
++ eor r1,r1,r1
++ adc lr,r3,#0 @ future $h2
++ str r2,[sp,#4] @ future $h1
++
++ mul r2,$s3,$h4
++ eor r3,r3,r3
++ umlal r0,r1,$h3,$s3
++ ldr $r3,[sp,#28] @ reload $r3
++ umlal r2,r3,$h3,$r0
++ umlal r0,r1,$h2,$r0
++ umlal r2,r3,$h2,$r1
++ umlal r0,r1,$h1,$r1
++ umlal r2,r3,$h1,$r2
++ umlal r0,r1,$h0,$r2
++ umlal r2,r3,$h0,$r3
++ ldr $h0,[sp,#0]
++ mul $h4,$r0,$h4
++ ldr $h1,[sp,#4]
++
++ adds $h2,lr,r0 @ d2+=d1>>32
++ ldr lr,[sp,#8] @ reload input pointer
++ adc r1,r1,#0
++ adds $h3,r2,r1 @ d3+=d2>>32
++ ldr r0,[sp,#16] @ reload end pointer
++ adc r3,r3,#0
++ add $h4,$h4,r3 @ h4+=d3>>32
++
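++	@ partial reduction: bits 2 and up of the top word weigh 2^130
++	@ apiece and 2^130 == 5 (mod 2^130-5), so the carry c = h4>>2 is
++	@ folded back in as 5*c, computed as (c*4) + ((c*4)>>2) below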
++ and r1,$h4,#-4
++ and $h4,$h4,#3
++ add r1,r1,r1,lsr#2 @ *=5
++ adds $h0,$h0,r1
++ adcs $h1,$h1,#0
++ adcs $h2,$h2,#0
++ adcs $h3,$h3,#0
++ adc $h4,$h4,#0
++
++ cmp r0,lr @ done yet?
++ bhi .Loop
++
++ ldr $ctx,[sp,#12]
++ add sp,sp,#32
++ stmdb $ctx,{$h0-$h4} @ store the result
++
++.Lno_data:
++#if __ARM_ARCH__>=5
++ ldmia sp!,{r3-r11,pc}
++#else
++ ldmia sp!,{r3-r11,lr}
++ tst lr,#1
++ moveq pc,lr @ be binary compatible with V4, yet
++ bx lr @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_blocks,.-poly1305_blocks
++___
++}
++{
++my ($ctx,$mac,$nonce)=map("r$_",(0..2));
++my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
++my $g4=$ctx;
++
++$code.=<<___;
++.type poly1305_emit,%function
++.align 5
++poly1305_emit:
++.Lpoly1305_emit:
++ stmdb sp!,{r4-r11}
++
++ ldmia $ctx,{$h0-$h4}
++
++#if __ARM_ARCH__>=7
++ ldr ip,[$ctx,#36] @ is_base2_26
++
++ adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
++ mov $g1,$h1,lsr#6
++ adcs $g1,$g1,$h2,lsl#20
++ mov $g2,$h2,lsr#12
++ adcs $g2,$g2,$h3,lsl#14
++ mov $g3,$h3,lsr#18
++ adcs $g3,$g3,$h4,lsl#8
++ mov $g4,#0
++ adc $g4,$g4,$h4,lsr#24
++
++ tst ip,ip
++ itttt ne
++ movne $h0,$g0
++ movne $h1,$g1
++ movne $h2,$g2
++ movne $h3,$g3
++ it ne
++ movne $h4,$g4
++#endif
++
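++	@ final conditional subtraction: compute g = h + 5; if that carries
++	@ into bit 130 (bit 2 of the top word) then h >= 2^130-5 and the
++	@ reduced value is g mod 2^130, otherwise it is h itself; the
++	@ 128-bit nonce is then added mod 2^128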
++ adds $g0,$h0,#5 @ compare to modulus
++ adcs $g1,$h1,#0
++ adcs $g2,$h2,#0
++ adcs $g3,$h3,#0
++ adc $g4,$h4,#0
++ tst $g4,#4 @ did it carry/borrow?
++
++#ifdef __thumb2__
++ it ne
++#endif
++ movne $h0,$g0
++ ldr $g0,[$nonce,#0]
++#ifdef __thumb2__
++ it ne
++#endif
++ movne $h1,$g1
++ ldr $g1,[$nonce,#4]
++#ifdef __thumb2__
++ it ne
++#endif
++ movne $h2,$g2
++ ldr $g2,[$nonce,#8]
++#ifdef __thumb2__
++ it ne
++#endif
++ movne $h3,$g3
++ ldr $g3,[$nonce,#12]
++
++ adds $h0,$h0,$g0
++ adcs $h1,$h1,$g1
++ adcs $h2,$h2,$g2
++ adc $h3,$h3,$g3
++
++#if __ARM_ARCH__>=7
++# ifdef __ARMEB__
++ rev $h0,$h0
++ rev $h1,$h1
++ rev $h2,$h2
++ rev $h3,$h3
++# endif
++ str $h0,[$mac,#0]
++ str $h1,[$mac,#4]
++ str $h2,[$mac,#8]
++ str $h3,[$mac,#12]
++#else
++ strb $h0,[$mac,#0]
++ mov $h0,$h0,lsr#8
++ strb $h1,[$mac,#4]
++ mov $h1,$h1,lsr#8
++ strb $h2,[$mac,#8]
++ mov $h2,$h2,lsr#8
++ strb $h3,[$mac,#12]
++ mov $h3,$h3,lsr#8
++
++ strb $h0,[$mac,#1]
++ mov $h0,$h0,lsr#8
++ strb $h1,[$mac,#5]
++ mov $h1,$h1,lsr#8
++ strb $h2,[$mac,#9]
++ mov $h2,$h2,lsr#8
++ strb $h3,[$mac,#13]
++ mov $h3,$h3,lsr#8
++
++ strb $h0,[$mac,#2]
++ mov $h0,$h0,lsr#8
++ strb $h1,[$mac,#6]
++ mov $h1,$h1,lsr#8
++ strb $h2,[$mac,#10]
++ mov $h2,$h2,lsr#8
++ strb $h3,[$mac,#14]
++ mov $h3,$h3,lsr#8
++
++ strb $h0,[$mac,#3]
++ strb $h1,[$mac,#7]
++ strb $h2,[$mac,#11]
++ strb $h3,[$mac,#15]
++#endif
++ ldmia sp!,{r4-r11}
++#if __ARM_ARCH__>=5
++ ret @ bx lr
++#else
++ tst lr,#1
++ moveq pc,lr @ be binary compatible with V4, yet
++ bx lr @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_emit,.-poly1305_emit
++___
++{
++my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
++my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
++my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
++
++my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
++
++$code.=<<___;
++#if __ARM_MAX_ARCH__>=7
++.fpu neon
++
++.type poly1305_init_neon,%function
++.align 5
++poly1305_init_neon:
++.Lpoly1305_init_neon:
++ ldr r3,[$ctx,#48] @ first table element
++ cmp r3,#-1 @ is value impossible?
++ bne .Lno_init_neon
++
++ ldr r4,[$ctx,#20] @ load key base 2^32
++ ldr r5,[$ctx,#24]
++ ldr r6,[$ctx,#28]
++ ldr r7,[$ctx,#32]
++
++ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
++ mov r3,r4,lsr#26
++ mov r4,r5,lsr#20
++ orr r3,r3,r5,lsl#6
++ mov r5,r6,lsr#14
++ orr r4,r4,r6,lsl#12
++ mov r6,r7,lsr#8
++ orr r5,r5,r7,lsl#18
++ and r3,r3,#0x03ffffff
++ and r4,r4,#0x03ffffff
++ and r5,r5,#0x03ffffff
++
++ vdup.32 $R0,r2 @ r^1 in both lanes
++ add r2,r3,r3,lsl#2 @ *5
++ vdup.32 $R1,r3
++ add r3,r4,r4,lsl#2
++ vdup.32 $S1,r2
++ vdup.32 $R2,r4
++ add r4,r5,r5,lsl#2
++ vdup.32 $S2,r3
++ vdup.32 $R3,r5
++ add r5,r6,r6,lsl#2
++ vdup.32 $S3,r4
++ vdup.32 $R4,r6
++ vdup.32 $S4,r5
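++	@ the sN lanes hold 5*rN (rN + rN<<2 above): in base 2^26, product
++	@ terms whose weight reaches 2^130 are folded back through
++	@ 2^130 == 5 (mod 2^130-5), so the multiply uses hJ*5*rN wherever
++	@ J+N >= 5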
++
++ mov $zeros,#2 @ counter
++
++.Lsquare_neon:
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
++ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
++ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
++ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
++
++ vmull.u32 $D0,$R0,${R0}[1]
++ vmull.u32 $D1,$R1,${R0}[1]
++ vmull.u32 $D2,$R2,${R0}[1]
++ vmull.u32 $D3,$R3,${R0}[1]
++ vmull.u32 $D4,$R4,${R0}[1]
++
++ vmlal.u32 $D0,$R4,${S1}[1]
++ vmlal.u32 $D1,$R0,${R1}[1]
++ vmlal.u32 $D2,$R1,${R1}[1]
++ vmlal.u32 $D3,$R2,${R1}[1]
++ vmlal.u32 $D4,$R3,${R1}[1]
++
++ vmlal.u32 $D0,$R3,${S2}[1]
++ vmlal.u32 $D1,$R4,${S2}[1]
++ vmlal.u32 $D3,$R1,${R2}[1]
++ vmlal.u32 $D2,$R0,${R2}[1]
++ vmlal.u32 $D4,$R2,${R2}[1]
++
++ vmlal.u32 $D0,$R2,${S3}[1]
++ vmlal.u32 $D3,$R0,${R3}[1]
++ vmlal.u32 $D1,$R3,${S3}[1]
++ vmlal.u32 $D2,$R4,${S3}[1]
++ vmlal.u32 $D4,$R1,${R3}[1]
++
++ vmlal.u32 $D3,$R4,${S4}[1]
++ vmlal.u32 $D0,$R1,${S4}[1]
++ vmlal.u32 $D1,$R2,${S4}[1]
++ vmlal.u32 $D2,$R3,${S4}[1]
++ vmlal.u32 $D4,$R0,${R4}[1]
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
++ @ and P. Schwabe
++ @
++ @ H0>>+H1>>+H2>>+H3>>+H4
++ @ H3>>+H4>>*5+H0>>+H1
++ @
++ @ Trivia.
++ @
++ @ Result of multiplication of n-bit number by m-bit number is
++	@ n+m bits wide. However! Even though 2^n is an n+1-bit number,
++	@ an m-bit number multiplied by 2^n is still n+m bits wide.
++ @
++ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
++ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
++ @ one is n+1 bits wide.
++ @
++ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
++ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
++ @ can be 27. However! In cases when their width exceeds 26 bits
++ @ they are limited by 2^26+2^6. This in turn means that *sum*
++ @ of the products with these values can still be viewed as sum
++ @ of 52-bit numbers as long as the amount of addends is not a
++ @ power of 2. For example,
++ @
++ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
++ @
++ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
++ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
++	@ 8 * (2^52) or 2^55. However, the value is then multiplied
++	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
++	@ which is less than 32 * (2^52) or 2^57. And when processing
++	@ data we are looking at three times as many addends...
++ @
++ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
++ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
++ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
++ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
++ @ instruction accepts 2x32-bit input and writes 2x64-bit result.
++	@ This means that the result of reduction has to be compressed upon
++ @ loop wrap-around. This can be done in the process of reduction
++ @ to minimize amount of instructions [as well as amount of
++ @ 128-bit instructions, which benefits low-end processors], but
++ @ one has to watch for H2 (which is narrower than H0) and 5*H4
++ @ not being wider than 58 bits, so that result of right shift
++ @ by 26 bits fits in 32 bits. This is also useful on x86,
++	@ because it allows using paddd in place of paddq, which
++ @ benefits Atom, where paddq is ridiculously slow.
++
++ vshr.u64 $T0,$D3,#26
++ vmovn.i64 $D3#lo,$D3
++ vshr.u64 $T1,$D0,#26
++ vmovn.i64 $D0#lo,$D0
++ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
++ vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
++ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
++ vbic.i32 $D0#lo,#0xfc000000
++
++ vshrn.u64 $T0#lo,$D4,#26
++ vmovn.i64 $D4#lo,$D4
++ vshr.u64 $T1,$D1,#26
++ vmovn.i64 $D1#lo,$D1
++ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
++ vbic.i32 $D4#lo,#0xfc000000
++ vbic.i32 $D1#lo,#0xfc000000
++
++ vadd.i32 $D0#lo,$D0#lo,$T0#lo
++ vshl.u32 $T0#lo,$T0#lo,#2
++ vshrn.u64 $T1#lo,$D2,#26
++ vmovn.i64 $D2#lo,$D2
++ vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
++ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
++ vbic.i32 $D2#lo,#0xfc000000
++
++ vshr.u32 $T0#lo,$D0#lo,#26
++ vbic.i32 $D0#lo,#0xfc000000
++ vshr.u32 $T1#lo,$D3#lo,#26
++ vbic.i32 $D3#lo,#0xfc000000
++ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
++ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
++
++ subs $zeros,$zeros,#1
++ beq .Lsquare_break_neon
++
++ add $tbl0,$ctx,#(48+0*9*4)
++ add $tbl1,$ctx,#(48+1*9*4)
++
++ vtrn.32 $R0,$D0#lo @ r^2:r^1
++ vtrn.32 $R2,$D2#lo
++ vtrn.32 $R3,$D3#lo
++ vtrn.32 $R1,$D1#lo
++ vtrn.32 $R4,$D4#lo
++
++ vshl.u32 $S2,$R2,#2 @ *5
++ vshl.u32 $S3,$R3,#2
++ vshl.u32 $S1,$R1,#2
++ vshl.u32 $S4,$R4,#2
++ vadd.i32 $S2,$S2,$R2
++ vadd.i32 $S1,$S1,$R1
++ vadd.i32 $S3,$S3,$R3
++ vadd.i32 $S4,$S4,$R4
++
++ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
++ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
++ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
++ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
++ vst1.32 {${S4}[0]},[$tbl0,:32]
++ vst1.32 {${S4}[1]},[$tbl1,:32]
++
++ b .Lsquare_neon
++
++.align 4
++.Lsquare_break_neon:
++ add $tbl0,$ctx,#(48+2*4*9)
++ add $tbl1,$ctx,#(48+3*4*9)
++
++ vmov $R0,$D0#lo @ r^4:r^3
++ vshl.u32 $S1,$D1#lo,#2 @ *5
++ vmov $R1,$D1#lo
++ vshl.u32 $S2,$D2#lo,#2
++ vmov $R2,$D2#lo
++ vshl.u32 $S3,$D3#lo,#2
++ vmov $R3,$D3#lo
++ vshl.u32 $S4,$D4#lo,#2
++ vmov $R4,$D4#lo
++ vadd.i32 $S1,$S1,$D1#lo
++ vadd.i32 $S2,$S2,$D2#lo
++ vadd.i32 $S3,$S3,$D3#lo
++ vadd.i32 $S4,$S4,$D4#lo
++
++ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
++ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
++ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
++ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
++ vst1.32 {${S4}[0]},[$tbl0]
++ vst1.32 {${S4}[1]},[$tbl1]
++
++.Lno_init_neon:
++ ret @ bx lr
++.size poly1305_init_neon,.-poly1305_init_neon
++
++.type poly1305_blocks_neon,%function
++.align 5
++poly1305_blocks_neon:
++.Lpoly1305_blocks_neon:
++ ldr ip,[$ctx,#36] @ is_base2_26
++
++ cmp $len,#64
++ blo .Lpoly1305_blocks
++
++ stmdb sp!,{r4-r7}
++ vstmdb sp!,{d8-d15} @ ABI specification says so
++
++ tst ip,ip @ is_base2_26?
++ bne .Lbase2_26_neon
++
++ stmdb sp!,{r1-r3,lr}
++ bl .Lpoly1305_init_neon
++
++ ldr r4,[$ctx,#0] @ load hash value base 2^32
++ ldr r5,[$ctx,#4]
++ ldr r6,[$ctx,#8]
++ ldr r7,[$ctx,#12]
++ ldr ip,[$ctx,#16]
++
++ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
++ mov r3,r4,lsr#26
++ veor $D0#lo,$D0#lo,$D0#lo
++ mov r4,r5,lsr#20
++ orr r3,r3,r5,lsl#6
++ veor $D1#lo,$D1#lo,$D1#lo
++ mov r5,r6,lsr#14
++ orr r4,r4,r6,lsl#12
++ veor $D2#lo,$D2#lo,$D2#lo
++ mov r6,r7,lsr#8
++ orr r5,r5,r7,lsl#18
++ veor $D3#lo,$D3#lo,$D3#lo
++ and r3,r3,#0x03ffffff
++ orr r6,r6,ip,lsl#24
++ veor $D4#lo,$D4#lo,$D4#lo
++ and r4,r4,#0x03ffffff
++ mov r1,#1
++ and r5,r5,#0x03ffffff
++ str r1,[$ctx,#36] @ set is_base2_26
++
++ vmov.32 $D0#lo[0],r2
++ vmov.32 $D1#lo[0],r3
++ vmov.32 $D2#lo[0],r4
++ vmov.32 $D3#lo[0],r5
++ vmov.32 $D4#lo[0],r6
++ adr $zeros,.Lzeros
++
++ ldmia sp!,{r1-r3,lr}
++ b .Lhash_loaded
++
++.align 4
++.Lbase2_26_neon:
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ load hash value
++
++ veor $D0#lo,$D0#lo,$D0#lo
++ veor $D1#lo,$D1#lo,$D1#lo
++ veor $D2#lo,$D2#lo,$D2#lo
++ veor $D3#lo,$D3#lo,$D3#lo
++ veor $D4#lo,$D4#lo,$D4#lo
++ vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
++ adr $zeros,.Lzeros
++ vld1.32 {$D4#lo[0]},[$ctx]
++ sub $ctx,$ctx,#16 @ rewind
++
++.Lhash_loaded:
++ add $in2,$inp,#32
++ mov $padbit,$padbit,lsl#24
++ tst $len,#31
++ beq .Leven
++
++ vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
++ vmov.32 $H4#lo[0],$padbit
++ sub $len,$len,#16
++ add $in2,$inp,#32
++
++# ifdef __ARMEB__
++ vrev32.8 $H0,$H0
++ vrev32.8 $H3,$H3
++ vrev32.8 $H1,$H1
++ vrev32.8 $H2,$H2
++# endif
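++	@ vector base 2^32 -> base 2^26: vshl positions the low bits of a
++	@ word while vsri (shift right and insert) pulls in the bits of the
++	@ word below, yielding h0=w0, h1=w0>>26|w1<<6, h2=w1>>20|w2<<12,
++	@ h3=w2>>14|w3<<18, h4=w3>>8|padbit<<24, with the lower four
++	@ limbs masked to 26 bits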
++ vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
++ vshl.u32 $H3#lo,$H3#lo,#18
++
++ vsri.u32 $H3#lo,$H2#lo,#14
++ vshl.u32 $H2#lo,$H2#lo,#12
++ vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
++
++ vbic.i32 $H3#lo,#0xfc000000
++ vsri.u32 $H2#lo,$H1#lo,#20
++ vshl.u32 $H1#lo,$H1#lo,#6
++
++ vbic.i32 $H2#lo,#0xfc000000
++ vsri.u32 $H1#lo,$H0#lo,#26
++ vadd.i32 $H3#hi,$H3#lo,$D3#lo
++
++ vbic.i32 $H0#lo,#0xfc000000
++ vbic.i32 $H1#lo,#0xfc000000
++ vadd.i32 $H2#hi,$H2#lo,$D2#lo
++
++ vadd.i32 $H0#hi,$H0#lo,$D0#lo
++ vadd.i32 $H1#hi,$H1#lo,$D1#lo
++
++ mov $tbl1,$zeros
++ add $tbl0,$ctx,#48
++
++ cmp $len,$len
++ b .Long_tail
++
++.align 4
++.Leven:
++ subs $len,$len,#64
++ it lo
++ movlo $in2,$zeros
++
++ vmov.i32 $H4,#1<<24 @ padbit, yes, always
++ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
++ add $inp,$inp,#64
++ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
++ add $in2,$in2,#64
++ itt hi
++ addhi $tbl1,$ctx,#(48+1*9*4)
++ addhi $tbl0,$ctx,#(48+3*9*4)
++
++# ifdef __ARMEB__
++ vrev32.8 $H0,$H0
++ vrev32.8 $H3,$H3
++ vrev32.8 $H1,$H1
++ vrev32.8 $H2,$H2
++# endif
++ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
++ vshl.u32 $H3,$H3,#18
++
++ vsri.u32 $H3,$H2,#14
++ vshl.u32 $H2,$H2,#12
++
++ vbic.i32 $H3,#0xfc000000
++ vsri.u32 $H2,$H1,#20
++ vshl.u32 $H1,$H1,#6
++
++ vbic.i32 $H2,#0xfc000000
++ vsri.u32 $H1,$H0,#26
++
++ vbic.i32 $H0,#0xfc000000
++ vbic.i32 $H1,#0xfc000000
++
++ bls .Lskip_loop
++
++ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
++ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
++ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
++ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
++ b .Loop_neon
++
++.align 5
++.Loop_neon:
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
++ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
++ @ \___________________/
++ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
++ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
++ @ \___________________/ \____________________/
++ @
++ @ Note that we start with inp[2:3]*r^2. This is because it
++ @ doesn't depend on reduction in previous iteration.
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
++ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
++ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
++ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
++ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ inp[2:3]*r^2
++
++ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
++ vmull.u32 $D2,$H2#hi,${R0}[1]
++ vadd.i32 $H0#lo,$H0#lo,$D0#lo
++ vmull.u32 $D0,$H0#hi,${R0}[1]
++ vadd.i32 $H3#lo,$H3#lo,$D3#lo
++ vmull.u32 $D3,$H3#hi,${R0}[1]
++ vmlal.u32 $D2,$H1#hi,${R1}[1]
++ vadd.i32 $H1#lo,$H1#lo,$D1#lo
++ vmull.u32 $D1,$H1#hi,${R0}[1]
++
++ vadd.i32 $H4#lo,$H4#lo,$D4#lo
++ vmull.u32 $D4,$H4#hi,${R0}[1]
++ subs $len,$len,#64
++ vmlal.u32 $D0,$H4#hi,${S1}[1]
++ it lo
++ movlo $in2,$zeros
++ vmlal.u32 $D3,$H2#hi,${R1}[1]
++ vld1.32 ${S4}[1],[$tbl1,:32]
++ vmlal.u32 $D1,$H0#hi,${R1}[1]
++ vmlal.u32 $D4,$H3#hi,${R1}[1]
++
++ vmlal.u32 $D0,$H3#hi,${S2}[1]
++ vmlal.u32 $D3,$H1#hi,${R2}[1]
++ vmlal.u32 $D4,$H2#hi,${R2}[1]
++ vmlal.u32 $D1,$H4#hi,${S2}[1]
++ vmlal.u32 $D2,$H0#hi,${R2}[1]
++
++ vmlal.u32 $D3,$H0#hi,${R3}[1]
++ vmlal.u32 $D0,$H2#hi,${S3}[1]
++ vmlal.u32 $D4,$H1#hi,${R3}[1]
++ vmlal.u32 $D1,$H3#hi,${S3}[1]
++ vmlal.u32 $D2,$H4#hi,${S3}[1]
++
++ vmlal.u32 $D3,$H4#hi,${S4}[1]
++ vmlal.u32 $D0,$H1#hi,${S4}[1]
++ vmlal.u32 $D4,$H0#hi,${R4}[1]
++ vmlal.u32 $D1,$H2#hi,${S4}[1]
++ vmlal.u32 $D2,$H3#hi,${S4}[1]
++
++ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
++ add $in2,$in2,#64
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ (hash+inp[0:1])*r^4 and accumulate
++
++ vmlal.u32 $D3,$H3#lo,${R0}[0]
++ vmlal.u32 $D0,$H0#lo,${R0}[0]
++ vmlal.u32 $D4,$H4#lo,${R0}[0]
++ vmlal.u32 $D1,$H1#lo,${R0}[0]
++ vmlal.u32 $D2,$H2#lo,${R0}[0]
++ vld1.32 ${S4}[0],[$tbl0,:32]
++
++ vmlal.u32 $D3,$H2#lo,${R1}[0]
++ vmlal.u32 $D0,$H4#lo,${S1}[0]
++ vmlal.u32 $D4,$H3#lo,${R1}[0]
++ vmlal.u32 $D1,$H0#lo,${R1}[0]
++ vmlal.u32 $D2,$H1#lo,${R1}[0]
++
++ vmlal.u32 $D3,$H1#lo,${R2}[0]
++ vmlal.u32 $D0,$H3#lo,${S2}[0]
++ vmlal.u32 $D4,$H2#lo,${R2}[0]
++ vmlal.u32 $D1,$H4#lo,${S2}[0]
++ vmlal.u32 $D2,$H0#lo,${R2}[0]
++
++ vmlal.u32 $D3,$H0#lo,${R3}[0]
++ vmlal.u32 $D0,$H2#lo,${S3}[0]
++ vmlal.u32 $D4,$H1#lo,${R3}[0]
++ vmlal.u32 $D1,$H3#lo,${S3}[0]
++ vmlal.u32 $D3,$H4#lo,${S4}[0]
++
++ vmlal.u32 $D2,$H4#lo,${S3}[0]
++ vmlal.u32 $D0,$H1#lo,${S4}[0]
++ vmlal.u32 $D4,$H0#lo,${R4}[0]
++ vmov.i32 $H4,#1<<24 @ padbit, yes, always
++ vmlal.u32 $D1,$H2#lo,${S4}[0]
++ vmlal.u32 $D2,$H3#lo,${S4}[0]
++
++ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
++ add $inp,$inp,#64
++# ifdef __ARMEB__
++ vrev32.8 $H0,$H0
++ vrev32.8 $H1,$H1
++ vrev32.8 $H2,$H2
++ vrev32.8 $H3,$H3
++# endif
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
++ @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
++
++ vshr.u64 $T0,$D3,#26
++ vmovn.i64 $D3#lo,$D3
++ vshr.u64 $T1,$D0,#26
++ vmovn.i64 $D0#lo,$D0
++ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
++ vbic.i32 $D3#lo,#0xfc000000
++ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
++ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
++ vshl.u32 $H3,$H3,#18
++ vbic.i32 $D0#lo,#0xfc000000
++
++ vshrn.u64 $T0#lo,$D4,#26
++ vmovn.i64 $D4#lo,$D4
++ vshr.u64 $T1,$D1,#26
++ vmovn.i64 $D1#lo,$D1
++ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
++ vsri.u32 $H3,$H2,#14
++ vbic.i32 $D4#lo,#0xfc000000
++ vshl.u32 $H2,$H2,#12
++ vbic.i32 $D1#lo,#0xfc000000
++
++ vadd.i32 $D0#lo,$D0#lo,$T0#lo
++ vshl.u32 $T0#lo,$T0#lo,#2
++ vbic.i32 $H3,#0xfc000000
++ vshrn.u64 $T1#lo,$D2,#26
++ vmovn.i64 $D2#lo,$D2
++ vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
++ vsri.u32 $H2,$H1,#20
++ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
++ vshl.u32 $H1,$H1,#6
++ vbic.i32 $D2#lo,#0xfc000000
++ vbic.i32 $H2,#0xfc000000
++
++ vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
++ vmovn.i64 $D0#lo,$D0
++ vsri.u32 $H1,$H0,#26
++ vbic.i32 $H0,#0xfc000000
++ vshr.u32 $T1#lo,$D3#lo,#26
++ vbic.i32 $D3#lo,#0xfc000000
++ vbic.i32 $D0#lo,#0xfc000000
++ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
++ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
++ vbic.i32 $H1,#0xfc000000
++
++ bhi .Loop_neon
++
++.Lskip_loop:
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
++
++ add $tbl1,$ctx,#(48+0*9*4)
++ add $tbl0,$ctx,#(48+1*9*4)
++ adds $len,$len,#32
++ it ne
++ movne $len,#0
++ bne .Long_tail
++
++ vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
++ vadd.i32 $H0#hi,$H0#lo,$D0#lo
++ vadd.i32 $H3#hi,$H3#lo,$D3#lo
++ vadd.i32 $H1#hi,$H1#lo,$D1#lo
++ vadd.i32 $H4#hi,$H4#lo,$D4#lo
++
++.Long_tail:
++ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
++ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
++
++ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
++ vmull.u32 $D2,$H2#hi,$R0
++ vadd.i32 $H0#lo,$H0#lo,$D0#lo
++ vmull.u32 $D0,$H0#hi,$R0
++ vadd.i32 $H3#lo,$H3#lo,$D3#lo
++ vmull.u32 $D3,$H3#hi,$R0
++ vadd.i32 $H1#lo,$H1#lo,$D1#lo
++ vmull.u32 $D1,$H1#hi,$R0
++ vadd.i32 $H4#lo,$H4#lo,$D4#lo
++ vmull.u32 $D4,$H4#hi,$R0
++
++ vmlal.u32 $D0,$H4#hi,$S1
++ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
++ vmlal.u32 $D3,$H2#hi,$R1
++ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
++ vmlal.u32 $D1,$H0#hi,$R1
++ vmlal.u32 $D4,$H3#hi,$R1
++ vmlal.u32 $D2,$H1#hi,$R1
++
++ vmlal.u32 $D3,$H1#hi,$R2
++ vld1.32 ${S4}[1],[$tbl1,:32]
++ vmlal.u32 $D0,$H3#hi,$S2
++ vld1.32 ${S4}[0],[$tbl0,:32]
++ vmlal.u32 $D4,$H2#hi,$R2
++ vmlal.u32 $D1,$H4#hi,$S2
++ vmlal.u32 $D2,$H0#hi,$R2
++
++ vmlal.u32 $D3,$H0#hi,$R3
++ it ne
++ addne $tbl1,$ctx,#(48+2*9*4)
++ vmlal.u32 $D0,$H2#hi,$S3
++ it ne
++ addne $tbl0,$ctx,#(48+3*9*4)
++ vmlal.u32 $D4,$H1#hi,$R3
++ vmlal.u32 $D1,$H3#hi,$S3
++ vmlal.u32 $D2,$H4#hi,$S3
++
++ vmlal.u32 $D3,$H4#hi,$S4
++ vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
++ vmlal.u32 $D0,$H1#hi,$S4
++ vshr.u64 $MASK,$MASK,#38
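++	@ the two instructions above build the limb mask 2^26-1 in both
++	@ 64-bit lanes: all-ones shifted right by 38 leaves 26 set bits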
++ vmlal.u32 $D4,$H0#hi,$R4
++ vmlal.u32 $D1,$H2#hi,$S4
++ vmlal.u32 $D2,$H3#hi,$S4
++
++ beq .Lshort_tail
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ (hash+inp[0:1])*r^4:r^3 and accumulate
++
++ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
++ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
++
++ vmlal.u32 $D2,$H2#lo,$R0
++ vmlal.u32 $D0,$H0#lo,$R0
++ vmlal.u32 $D3,$H3#lo,$R0
++ vmlal.u32 $D1,$H1#lo,$R0
++ vmlal.u32 $D4,$H4#lo,$R0
++
++ vmlal.u32 $D0,$H4#lo,$S1
++ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
++ vmlal.u32 $D3,$H2#lo,$R1
++ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
++ vmlal.u32 $D1,$H0#lo,$R1
++ vmlal.u32 $D4,$H3#lo,$R1
++ vmlal.u32 $D2,$H1#lo,$R1
++
++ vmlal.u32 $D3,$H1#lo,$R2
++ vld1.32 ${S4}[1],[$tbl1,:32]
++ vmlal.u32 $D0,$H3#lo,$S2
++ vld1.32 ${S4}[0],[$tbl0,:32]
++ vmlal.u32 $D4,$H2#lo,$R2
++ vmlal.u32 $D1,$H4#lo,$S2
++ vmlal.u32 $D2,$H0#lo,$R2
++
++ vmlal.u32 $D3,$H0#lo,$R3
++ vmlal.u32 $D0,$H2#lo,$S3
++ vmlal.u32 $D4,$H1#lo,$R3
++ vmlal.u32 $D1,$H3#lo,$S3
++ vmlal.u32 $D2,$H4#lo,$S3
++
++ vmlal.u32 $D3,$H4#lo,$S4
++ vorn $MASK,$MASK,$MASK @ all-ones
++ vmlal.u32 $D0,$H1#lo,$S4
++ vshr.u64 $MASK,$MASK,#38
++ vmlal.u32 $D4,$H0#lo,$R4
++ vmlal.u32 $D1,$H2#lo,$S4
++ vmlal.u32 $D2,$H3#lo,$S4
++
++.Lshort_tail:
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ horizontal addition
++
++ vadd.i64 $D3#lo,$D3#lo,$D3#hi
++ vadd.i64 $D0#lo,$D0#lo,$D0#hi
++ vadd.i64 $D4#lo,$D4#lo,$D4#hi
++ vadd.i64 $D1#lo,$D1#lo,$D1#hi
++ vadd.i64 $D2#lo,$D2#lo,$D2#hi
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ lazy reduction, but without narrowing
++
++ vshr.u64 $T0,$D3,#26
++ vand.i64 $D3,$D3,$MASK
++ vshr.u64 $T1,$D0,#26
++ vand.i64 $D0,$D0,$MASK
++ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
++ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
++
++ vshr.u64 $T0,$D4,#26
++ vand.i64 $D4,$D4,$MASK
++ vshr.u64 $T1,$D1,#26
++ vand.i64 $D1,$D1,$MASK
++ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
++
++ vadd.i64 $D0,$D0,$T0
++ vshl.u64 $T0,$T0,#2
++ vshr.u64 $T1,$D2,#26
++ vand.i64 $D2,$D2,$MASK
++ vadd.i64 $D0,$D0,$T0 @ h4 -> h0
++ vadd.i64 $D3,$D3,$T1 @ h2 -> h3
++
++ vshr.u64 $T0,$D0,#26
++ vand.i64 $D0,$D0,$MASK
++ vshr.u64 $T1,$D3,#26
++ vand.i64 $D3,$D3,$MASK
++ vadd.i64 $D1,$D1,$T0 @ h0 -> h1
++ vadd.i64 $D4,$D4,$T1 @ h3 -> h4
++
++ cmp $len,#0
++ bne .Leven
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ store hash value
++
++ vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
++ vst1.32 {$D4#lo[0]},[$ctx]
++
++ vldmia sp!,{d8-d15} @ epilogue
++ ldmia sp!,{r4-r7}
++ ret @ bx lr
++.size poly1305_blocks_neon,.-poly1305_blocks_neon
++
++.align 5
++.Lzeros:
++.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
++#ifndef __KERNEL__
++.LOPENSSL_armcap:
++# ifdef _WIN32
++.word OPENSSL_armcap_P
++# else
++.word OPENSSL_armcap_P-.Lpoly1305_init
++# endif
++.comm OPENSSL_armcap_P,4,4
++.hidden OPENSSL_armcap_P
++#endif
++#endif
++___
++} }
++$code.=<<___;
++.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
++.align 2
++___
++
++foreach (split("\n",$code)) {
++ s/\`([^\`]*)\`/eval $1/geo;
++
++ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
++ s/\bret\b/bx lr/go or
++ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
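++	# (0xe12fff1e is the ARM encoding of "bx lr"; emitting the raw
++	# word lets the output assemble even where -march=armv4 makes the
++	# assembler reject the mnemonic)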
++
++ print $_,"\n";
++}
++close STDOUT; # enforce flush
+--- /dev/null
++++ b/arch/arm/crypto/poly1305-core.S_shipped
+@@ -0,0 +1,1158 @@
++#ifndef __KERNEL__
++# include "arm_arch.h"
++#else
++# define __ARM_ARCH__ __LINUX_ARM_ARCH__
++# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
++# define poly1305_init poly1305_init_arm
++# define poly1305_blocks poly1305_blocks_arm
++# define poly1305_emit poly1305_emit_arm
++.globl poly1305_blocks_neon
++#endif
++
++#if defined(__thumb2__)
++.syntax unified
++.thumb
++#else
++.code 32
++#endif
++
++.text
++
++.globl poly1305_emit
++.globl poly1305_blocks
++.globl poly1305_init
++.type poly1305_init,%function
++.align 5
++poly1305_init:
++.Lpoly1305_init:
++ stmdb sp!,{r4-r11}
++
++ eor r3,r3,r3
++ cmp r1,#0
++ str r3,[r0,#0] @ zero hash value
++ str r3,[r0,#4]
++ str r3,[r0,#8]
++ str r3,[r0,#12]
++ str r3,[r0,#16]
++ str r3,[r0,#36] @ clear is_base2_26
++ add r0,r0,#20
++
++#ifdef __thumb2__
++ it eq
++#endif
++ moveq r0,#0
++ beq .Lno_key
++
++#if __ARM_MAX_ARCH__>=7
++ mov r3,#-1
++ str r3,[r0,#28] @ impossible key power value
++# ifndef __KERNEL__
++ adr r11,.Lpoly1305_init
++ ldr r12,.LOPENSSL_armcap
++# endif
++#endif
++ ldrb r4,[r1,#0]
++ mov r10,#0x0fffffff
++ ldrb r5,[r1,#1]
++ and r3,r10,#-4 @ 0x0ffffffc
++ ldrb r6,[r1,#2]
++ ldrb r7,[r1,#3]
++ orr r4,r4,r5,lsl#8
++ ldrb r5,[r1,#4]
++ orr r4,r4,r6,lsl#16
++ ldrb r6,[r1,#5]
++ orr r4,r4,r7,lsl#24
++ ldrb r7,[r1,#6]
++ and r4,r4,r10
++
++#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++# if !defined(_WIN32)
++ ldr r12,[r11,r12] @ OPENSSL_armcap_P
++# endif
++# if defined(__APPLE__) || defined(_WIN32)
++ ldr r12,[r12]
++# endif
++#endif
++ ldrb r8,[r1,#7]
++ orr r5,r5,r6,lsl#8
++ ldrb r6,[r1,#8]
++ orr r5,r5,r7,lsl#16
++ ldrb r7,[r1,#9]
++ orr r5,r5,r8,lsl#24
++ ldrb r8,[r1,#10]
++ and r5,r5,r3
++
++#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++ tst r12,#ARMV7_NEON @ check for NEON
++# ifdef __thumb2__
++ adr r9,.Lpoly1305_blocks_neon
++ adr r11,.Lpoly1305_blocks
++ it ne
++ movne r11,r9
++ adr r12,.Lpoly1305_emit
++ orr r11,r11,#1 @ thumb-ify addresses
++ orr r12,r12,#1
++# else
++ add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
++ ite eq
++ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
++ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
++# endif
++#endif
++ ldrb r9,[r1,#11]
++ orr r6,r6,r7,lsl#8
++ ldrb r7,[r1,#12]
++ orr r6,r6,r8,lsl#16
++ ldrb r8,[r1,#13]
++ orr r6,r6,r9,lsl#24
++ ldrb r9,[r1,#14]
++ and r6,r6,r3
++
++ ldrb r10,[r1,#15]
++ orr r7,r7,r8,lsl#8
++ str r4,[r0,#0]
++ orr r7,r7,r9,lsl#16
++ str r5,[r0,#4]
++ orr r7,r7,r10,lsl#24
++ str r6,[r0,#8]
++ and r7,r7,r3
++ str r7,[r0,#12]
++#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++ stmia r2,{r11,r12} @ fill functions table
++ mov r0,#1
++#else
++ mov r0,#0
++#endif
++.Lno_key:
++ ldmia sp!,{r4-r11}
++#if __ARM_ARCH__>=5
++ bx lr @ bx lr
++#else
++ tst lr,#1
++ moveq pc,lr @ be binary compatible with V4, yet
++ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_init,.-poly1305_init
++.type poly1305_blocks,%function
++.align 5
++poly1305_blocks:
++.Lpoly1305_blocks:
++ stmdb sp!,{r3-r11,lr}
++
++ ands r2,r2,#-16
++ beq .Lno_data
++
++ add r2,r2,r1 @ end pointer
++ sub sp,sp,#32
++
++#if __ARM_ARCH__<7
++ ldmia r0,{r4-r12} @ load context
++ add r0,r0,#20
++ str r2,[sp,#16] @ offload stuff
++ str r0,[sp,#12]
++#else
++ ldr lr,[r0,#36] @ is_base2_26
++ ldmia r0!,{r4-r8} @ load hash value
++ str r2,[sp,#16] @ offload stuff
++ str r0,[sp,#12]
++
++ adds r9,r4,r5,lsl#26 @ base 2^26 -> base 2^32
++ mov r10,r5,lsr#6
++ adcs r10,r10,r6,lsl#20
++ mov r11,r6,lsr#12
++ adcs r11,r11,r7,lsl#14
++ mov r12,r7,lsr#18
++ adcs r12,r12,r8,lsl#8
++ mov r2,#0
++ teq lr,#0
++ str r2,[r0,#16] @ clear is_base2_26
++ adc r2,r2,r8,lsr#24
++
++ itttt ne
++ movne r4,r9 @ choose between radixes
++ movne r5,r10
++ movne r6,r11
++ movne r7,r12
++ ldmia r0,{r9-r12} @ load key
++ it ne
++ movne r8,r2
++#endif
++
++ mov lr,r1
++ cmp r3,#0
++ str r10,[sp,#20]
++ str r11,[sp,#24]
++ str r12,[sp,#28]
++ b .Loop
++
++.align 4
++.Loop:
++#if __ARM_ARCH__<7
++ ldrb r0,[lr],#16 @ load input
++# ifdef __thumb2__
++ it hi
++# endif
++ addhi r8,r8,#1 @ 1<<128
++ ldrb r1,[lr,#-15]
++ ldrb r2,[lr,#-14]
++ ldrb r3,[lr,#-13]
++ orr r1,r0,r1,lsl#8
++ ldrb r0,[lr,#-12]
++ orr r2,r1,r2,lsl#16
++ ldrb r1,[lr,#-11]
++ orr r3,r2,r3,lsl#24
++ ldrb r2,[lr,#-10]
++ adds r4,r4,r3 @ accumulate input
++
++ ldrb r3,[lr,#-9]
++ orr r1,r0,r1,lsl#8
++ ldrb r0,[lr,#-8]
++ orr r2,r1,r2,lsl#16
++ ldrb r1,[lr,#-7]
++ orr r3,r2,r3,lsl#24
++ ldrb r2,[lr,#-6]
++ adcs r5,r5,r3
++
++ ldrb r3,[lr,#-5]
++ orr r1,r0,r1,lsl#8
++ ldrb r0,[lr,#-4]
++ orr r2,r1,r2,lsl#16
++ ldrb r1,[lr,#-3]
++ orr r3,r2,r3,lsl#24
++ ldrb r2,[lr,#-2]
++ adcs r6,r6,r3
++
++ ldrb r3,[lr,#-1]
++ orr r1,r0,r1,lsl#8
++ str lr,[sp,#8] @ offload input pointer
++ orr r2,r1,r2,lsl#16
++ add r10,r10,r10,lsr#2
++ orr r3,r2,r3,lsl#24
++#else
++ ldr r0,[lr],#16 @ load input
++ it hi
++ addhi r8,r8,#1 @ padbit
++ ldr r1,[lr,#-12]
++ ldr r2,[lr,#-8]
++ ldr r3,[lr,#-4]
++# ifdef __ARMEB__
++ rev r0,r0
++ rev r1,r1
++ rev r2,r2
++ rev r3,r3
++# endif
++ adds r4,r4,r0 @ accumulate input
++ str lr,[sp,#8] @ offload input pointer
++ adcs r5,r5,r1
++ add r10,r10,r10,lsr#2
++ adcs r6,r6,r2
++#endif
++ add r11,r11,r11,lsr#2
++ adcs r7,r7,r3
++ add r12,r12,r12,lsr#2
++
++ umull r2,r3,r5,r9
++ adc r8,r8,#0
++ umull r0,r1,r4,r9
++ umlal r2,r3,r8,r10
++ umlal r0,r1,r7,r10
++ ldr r10,[sp,#20] @ reload r10
++ umlal r2,r3,r6,r12
++ umlal r0,r1,r5,r12
++ umlal r2,r3,r7,r11
++ umlal r0,r1,r6,r11
++ umlal r2,r3,r4,r10
++ str r0,[sp,#0] @ future r4
++ mul r0,r11,r8
++ ldr r11,[sp,#24] @ reload r11
++ adds r2,r2,r1 @ d1+=d0>>32
++ eor r1,r1,r1
++ adc lr,r3,#0 @ future r6
++ str r2,[sp,#4] @ future r5
++
++ mul r2,r12,r8
++ eor r3,r3,r3
++ umlal r0,r1,r7,r12
++ ldr r12,[sp,#28] @ reload r12
++ umlal r2,r3,r7,r9
++ umlal r0,r1,r6,r9
++ umlal r2,r3,r6,r10
++ umlal r0,r1,r5,r10
++ umlal r2,r3,r5,r11
++ umlal r0,r1,r4,r11
++ umlal r2,r3,r4,r12
++ ldr r4,[sp,#0]
++ mul r8,r9,r8
++ ldr r5,[sp,#4]
++
++ adds r6,lr,r0 @ d2+=d1>>32
++ ldr lr,[sp,#8] @ reload input pointer
++ adc r1,r1,#0
++ adds r7,r2,r1 @ d3+=d2>>32
++ ldr r0,[sp,#16] @ reload end pointer
++ adc r3,r3,#0
++ add r8,r8,r3 @ h4+=d3>>32
++
++ and r1,r8,#-4
++ and r8,r8,#3
++ add r1,r1,r1,lsr#2 @ *=5
++ adds r4,r4,r1
++ adcs r5,r5,#0
++ adcs r6,r6,#0
++ adcs r7,r7,#0
++ adc r8,r8,#0
++
++ cmp r0,lr @ done yet?
++ bhi .Loop
++
++ ldr r0,[sp,#12]
++ add sp,sp,#32
++ stmdb r0,{r4-r8} @ store the result
++
++.Lno_data:
++#if __ARM_ARCH__>=5
++ ldmia sp!,{r3-r11,pc}
++#else
++ ldmia sp!,{r3-r11,lr}
++ tst lr,#1
++ moveq pc,lr @ be binary compatible with V4, yet
++ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_blocks,.-poly1305_blocks
++.type poly1305_emit,%function
++.align 5
++poly1305_emit:
++.Lpoly1305_emit:
++ stmdb sp!,{r4-r11}
++
++ ldmia r0,{r3-r7}
++
++#if __ARM_ARCH__>=7
++ ldr ip,[r0,#36] @ is_base2_26
++
++ adds r8,r3,r4,lsl#26 @ base 2^26 -> base 2^32
++ mov r9,r4,lsr#6
++ adcs r9,r9,r5,lsl#20
++ mov r10,r5,lsr#12
++ adcs r10,r10,r6,lsl#14
++ mov r11,r6,lsr#18
++ adcs r11,r11,r7,lsl#8
++ mov r0,#0
++ adc r0,r0,r7,lsr#24
++
++ tst ip,ip
++ itttt ne
++ movne r3,r8
++ movne r4,r9
++ movne r5,r10
++ movne r6,r11
++ it ne
++ movne r7,r0
++#endif
++
++ adds r8,r3,#5 @ compare to modulus
++ adcs r9,r4,#0
++ adcs r10,r5,#0
++ adcs r11,r6,#0
++ adc r0,r7,#0
++ tst r0,#4 @ did it carry/borrow?
++
++#ifdef __thumb2__
++ it ne
++#endif
++ movne r3,r8
++ ldr r8,[r2,#0]
++#ifdef __thumb2__
++ it ne
++#endif
++ movne r4,r9
++ ldr r9,[r2,#4]
++#ifdef __thumb2__
++ it ne
++#endif
++ movne r5,r10
++ ldr r10,[r2,#8]
++#ifdef __thumb2__
++ it ne
++#endif
++ movne r6,r11
++ ldr r11,[r2,#12]
++
++ adds r3,r3,r8
++ adcs r4,r4,r9
++ adcs r5,r5,r10
++ adc r6,r6,r11
++
++#if __ARM_ARCH__>=7
++# ifdef __ARMEB__
++ rev r3,r3
++ rev r4,r4
++ rev r5,r5
++ rev r6,r6
++# endif
++ str r3,[r1,#0]
++ str r4,[r1,#4]
++ str r5,[r1,#8]
++ str r6,[r1,#12]
++#else
++ strb r3,[r1,#0]
++ mov r3,r3,lsr#8
++ strb r4,[r1,#4]
++ mov r4,r4,lsr#8
++ strb r5,[r1,#8]
++ mov r5,r5,lsr#8
++ strb r6,[r1,#12]
++ mov r6,r6,lsr#8
++
++ strb r3,[r1,#1]
++ mov r3,r3,lsr#8
++ strb r4,[r1,#5]
++ mov r4,r4,lsr#8
++ strb r5,[r1,#9]
++ mov r5,r5,lsr#8
++ strb r6,[r1,#13]
++ mov r6,r6,lsr#8
++
++ strb r3,[r1,#2]
++ mov r3,r3,lsr#8
++ strb r4,[r1,#6]
++ mov r4,r4,lsr#8
++ strb r5,[r1,#10]
++ mov r5,r5,lsr#8
++ strb r6,[r1,#14]
++ mov r6,r6,lsr#8
++
++ strb r3,[r1,#3]
++ strb r4,[r1,#7]
++ strb r5,[r1,#11]
++ strb r6,[r1,#15]
++#endif
++ ldmia sp!,{r4-r11}
++#if __ARM_ARCH__>=5
++ bx lr @ bx lr
++#else
++ tst lr,#1
++ moveq pc,lr @ be binary compatible with V4, yet
++ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_emit,.-poly1305_emit
++#if __ARM_MAX_ARCH__>=7
++.fpu neon
++
++.type poly1305_init_neon,%function
++.align 5
++poly1305_init_neon:
++.Lpoly1305_init_neon:
++ ldr r3,[r0,#48] @ first table element
++ cmp r3,#-1 @ is value impossible?
++ bne .Lno_init_neon
++
++ ldr r4,[r0,#20] @ load key base 2^32
++ ldr r5,[r0,#24]
++ ldr r6,[r0,#28]
++ ldr r7,[r0,#32]
++
++ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
++ mov r3,r4,lsr#26
++ mov r4,r5,lsr#20
++ orr r3,r3,r5,lsl#6
++ mov r5,r6,lsr#14
++ orr r4,r4,r6,lsl#12
++ mov r6,r7,lsr#8
++ orr r5,r5,r7,lsl#18
++ and r3,r3,#0x03ffffff
++ and r4,r4,#0x03ffffff
++ and r5,r5,#0x03ffffff
++
++ vdup.32 d0,r2 @ r^1 in both lanes
++ add r2,r3,r3,lsl#2 @ *5
++ vdup.32 d1,r3
++ add r3,r4,r4,lsl#2
++ vdup.32 d2,r2
++ vdup.32 d3,r4
++ add r4,r5,r5,lsl#2
++ vdup.32 d4,r3
++ vdup.32 d5,r5
++ add r5,r6,r6,lsl#2
++ vdup.32 d6,r4
++ vdup.32 d7,r6
++ vdup.32 d8,r5
++
++ mov r5,#2 @ counter
++
++.Lsquare_neon:
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
++ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
++ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
++ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
++
++ vmull.u32 q5,d0,d0[1]
++ vmull.u32 q6,d1,d0[1]
++ vmull.u32 q7,d3,d0[1]
++ vmull.u32 q8,d5,d0[1]
++ vmull.u32 q9,d7,d0[1]
++
++ vmlal.u32 q5,d7,d2[1]
++ vmlal.u32 q6,d0,d1[1]
++ vmlal.u32 q7,d1,d1[1]
++ vmlal.u32 q8,d3,d1[1]
++ vmlal.u32 q9,d5,d1[1]
++
++ vmlal.u32 q5,d5,d4[1]
++ vmlal.u32 q6,d7,d4[1]
++ vmlal.u32 q8,d1,d3[1]
++ vmlal.u32 q7,d0,d3[1]
++ vmlal.u32 q9,d3,d3[1]
++
++ vmlal.u32 q5,d3,d6[1]
++ vmlal.u32 q8,d0,d5[1]
++ vmlal.u32 q6,d5,d6[1]
++ vmlal.u32 q7,d7,d6[1]
++ vmlal.u32 q9,d1,d5[1]
++
++ vmlal.u32 q8,d7,d8[1]
++ vmlal.u32 q5,d1,d8[1]
++ vmlal.u32 q6,d3,d8[1]
++ vmlal.u32 q7,d5,d8[1]
++ vmlal.u32 q9,d0,d7[1]
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
++ @ and P. Schwabe
++ @
++ @ H0>>+H1>>+H2>>+H3>>+H4
++ @ H3>>+H4>>*5+H0>>+H1
++ @
++ @ Trivia.
++ @
++ @ Result of multiplication of n-bit number by m-bit number is
++	@ n+m bits wide. However! Even though 2^n is an n+1-bit number,
++	@ an m-bit number multiplied by 2^n is still n+m bits wide.
++ @
++ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
++ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
++ @ one is n+1 bits wide.
++ @
++ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
++ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
++ @ can be 27. However! In cases when their width exceeds 26 bits
++ @ they are limited by 2^26+2^6. This in turn means that *sum*
++ @ of the products with these values can still be viewed as sum
++ @ of 52-bit numbers as long as the amount of addends is not a
++ @ power of 2. For example,
++ @
++ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
++ @
++ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
++ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
++	@ 8 * (2^52) or 2^55. However, the value is then multiplied
++	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
++	@ which is less than 32 * (2^52) or 2^57. And when processing
++	@ data we are looking at three times as many addends...
++ @
++ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
++ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
++ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
++ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
++ @ instruction accepts 2x32-bit input and writes 2x64-bit result.
++	@ This means that the result of reduction has to be compressed upon
++ @ loop wrap-around. This can be done in the process of reduction
++ @ to minimize amount of instructions [as well as amount of
++ @ 128-bit instructions, which benefits low-end processors], but
++ @ one has to watch for H2 (which is narrower than H0) and 5*H4
++ @ not being wider than 58 bits, so that result of right shift
++ @ by 26 bits fits in 32 bits. This is also useful on x86,
++	@ because it allows using paddd in place of paddq, which
++ @ benefits Atom, where paddq is ridiculously slow.
++
++ vshr.u64 q15,q8,#26
++ vmovn.i64 d16,q8
++ vshr.u64 q4,q5,#26
++ vmovn.i64 d10,q5
++ vadd.i64 q9,q9,q15 @ h3 -> h4
++ vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
++ vadd.i64 q6,q6,q4 @ h0 -> h1
++ vbic.i32 d10,#0xfc000000
++
++ vshrn.u64 d30,q9,#26
++ vmovn.i64 d18,q9
++ vshr.u64 q4,q6,#26
++ vmovn.i64 d12,q6
++ vadd.i64 q7,q7,q4 @ h1 -> h2
++ vbic.i32 d18,#0xfc000000
++ vbic.i32 d12,#0xfc000000
++
++ vadd.i32 d10,d10,d30
++ vshl.u32 d30,d30,#2
++ vshrn.u64 d8,q7,#26
++ vmovn.i64 d14,q7
++ vadd.i32 d10,d10,d30 @ h4 -> h0
++ vadd.i32 d16,d16,d8 @ h2 -> h3
++ vbic.i32 d14,#0xfc000000
++
++ vshr.u32 d30,d10,#26
++ vbic.i32 d10,#0xfc000000
++ vshr.u32 d8,d16,#26
++ vbic.i32 d16,#0xfc000000
++ vadd.i32 d12,d12,d30 @ h0 -> h1
++ vadd.i32 d18,d18,d8 @ h3 -> h4
++
++ subs r5,r5,#1
++ beq .Lsquare_break_neon
++
++ add r6,r0,#(48+0*9*4)
++ add r7,r0,#(48+1*9*4)
++
++ vtrn.32 d0,d10 @ r^2:r^1
++ vtrn.32 d3,d14
++ vtrn.32 d5,d16
++ vtrn.32 d1,d12
++ vtrn.32 d7,d18
++
++ vshl.u32 d4,d3,#2 @ *5
++ vshl.u32 d6,d5,#2
++ vshl.u32 d2,d1,#2
++ vshl.u32 d8,d7,#2
++ vadd.i32 d4,d4,d3
++ vadd.i32 d2,d2,d1
++ vadd.i32 d6,d6,d5
++ vadd.i32 d8,d8,d7
++
++ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
++ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
++ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
++ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
++ vst1.32 {d8[0]},[r6,:32]
++ vst1.32 {d8[1]},[r7,:32]
++
++ b .Lsquare_neon
++
++.align 4
++.Lsquare_break_neon:
++ add r6,r0,#(48+2*4*9)
++ add r7,r0,#(48+3*4*9)
++
++ vmov d0,d10 @ r^4:r^3
++ vshl.u32 d2,d12,#2 @ *5
++ vmov d1,d12
++ vshl.u32 d4,d14,#2
++ vmov d3,d14
++ vshl.u32 d6,d16,#2
++ vmov d5,d16
++ vshl.u32 d8,d18,#2
++ vmov d7,d18
++ vadd.i32 d2,d2,d12
++ vadd.i32 d4,d4,d14
++ vadd.i32 d6,d6,d16
++ vadd.i32 d8,d8,d18
++
++ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
++ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
++ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
++ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
++ vst1.32 {d8[0]},[r6]
++ vst1.32 {d8[1]},[r7]
++
++.Lno_init_neon:
++ bx lr @ bx lr
++.size poly1305_init_neon,.-poly1305_init_neon
++
++.type poly1305_blocks_neon,%function
++.align 5
++poly1305_blocks_neon:
++.Lpoly1305_blocks_neon:
++ ldr ip,[r0,#36] @ is_base2_26
++
++ cmp r2,#64
++ blo .Lpoly1305_blocks
++
++ stmdb sp!,{r4-r7}
++ vstmdb sp!,{d8-d15} @ ABI specification says so
++
++ tst ip,ip @ is_base2_26?
++ bne .Lbase2_26_neon
++
++ stmdb sp!,{r1-r3,lr}
++ bl .Lpoly1305_init_neon
++
++ ldr r4,[r0,#0] @ load hash value base 2^32
++ ldr r5,[r0,#4]
++ ldr r6,[r0,#8]
++ ldr r7,[r0,#12]
++ ldr ip,[r0,#16]
++
++ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
++ mov r3,r4,lsr#26
++ veor d10,d10,d10
++ mov r4,r5,lsr#20
++ orr r3,r3,r5,lsl#6
++ veor d12,d12,d12
++ mov r5,r6,lsr#14
++ orr r4,r4,r6,lsl#12
++ veor d14,d14,d14
++ mov r6,r7,lsr#8
++ orr r5,r5,r7,lsl#18
++ veor d16,d16,d16
++ and r3,r3,#0x03ffffff
++ orr r6,r6,ip,lsl#24
++ veor d18,d18,d18
++ and r4,r4,#0x03ffffff
++ mov r1,#1
++ and r5,r5,#0x03ffffff
++ str r1,[r0,#36] @ set is_base2_26
++
++ vmov.32 d10[0],r2
++ vmov.32 d12[0],r3
++ vmov.32 d14[0],r4
++ vmov.32 d16[0],r5
++ vmov.32 d18[0],r6
++ adr r5,.Lzeros
++
++ ldmia sp!,{r1-r3,lr}
++ b .Lhash_loaded
++
++.align 4
++.Lbase2_26_neon:
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ load hash value
++
++ veor d10,d10,d10
++ veor d12,d12,d12
++ veor d14,d14,d14
++ veor d16,d16,d16
++ veor d18,d18,d18
++ vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
++ adr r5,.Lzeros
++ vld1.32 {d18[0]},[r0]
++ sub r0,r0,#16 @ rewind
++
++.Lhash_loaded:
++ add r4,r1,#32
++ mov r3,r3,lsl#24
++ tst r2,#31
++ beq .Leven
++
++ vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
++ vmov.32 d28[0],r3
++ sub r2,r2,#16
++ add r4,r1,#32
++
++# ifdef __ARMEB__
++ vrev32.8 q10,q10
++ vrev32.8 q13,q13
++ vrev32.8 q11,q11
++ vrev32.8 q12,q12
++# endif
++ vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
++ vshl.u32 d26,d26,#18
++
++ vsri.u32 d26,d24,#14
++ vshl.u32 d24,d24,#12
++ vadd.i32 d29,d28,d18 @ add hash value and move to #hi
++
++ vbic.i32 d26,#0xfc000000
++ vsri.u32 d24,d22,#20
++ vshl.u32 d22,d22,#6
++
++ vbic.i32 d24,#0xfc000000
++ vsri.u32 d22,d20,#26
++ vadd.i32 d27,d26,d16
++
++ vbic.i32 d20,#0xfc000000
++ vbic.i32 d22,#0xfc000000
++ vadd.i32 d25,d24,d14
++
++ vadd.i32 d21,d20,d10
++ vadd.i32 d23,d22,d12
++
++ mov r7,r5
++ add r6,r0,#48
++
++ cmp r2,r2
++ b .Long_tail
++
++.align 4
++.Leven:
++ subs r2,r2,#64
++ it lo
++ movlo r4,r5
++
++ vmov.i32 q14,#1<<24 @ padbit, yes, always
++ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
++ add r1,r1,#64
++ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
++ add r4,r4,#64
++ itt hi
++ addhi r7,r0,#(48+1*9*4)
++ addhi r6,r0,#(48+3*9*4)
++
++# ifdef __ARMEB__
++ vrev32.8 q10,q10
++ vrev32.8 q13,q13
++ vrev32.8 q11,q11
++ vrev32.8 q12,q12
++# endif
++ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
++ vshl.u32 q13,q13,#18
++
++ vsri.u32 q13,q12,#14
++ vshl.u32 q12,q12,#12
++
++ vbic.i32 q13,#0xfc000000
++ vsri.u32 q12,q11,#20
++ vshl.u32 q11,q11,#6
++
++ vbic.i32 q12,#0xfc000000
++ vsri.u32 q11,q10,#26
++
++ vbic.i32 q10,#0xfc000000
++ vbic.i32 q11,#0xfc000000
++
++ bls .Lskip_loop
++
++ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
++ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
++ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
++ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
++ b .Loop_neon
++
++.align 5
++.Loop_neon:
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
++ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
++	@ \___________________/
++ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
++ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
++	@ \___________________/ \____________________/
++ @
++ @ Note that we start with inp[2:3]*r^2. This is because it
++ @ doesn't depend on reduction in previous iteration.
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
++ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
++ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
++ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
++ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ inp[2:3]*r^2
++
++ vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
++ vmull.u32 q7,d25,d0[1]
++ vadd.i32 d20,d20,d10
++ vmull.u32 q5,d21,d0[1]
++ vadd.i32 d26,d26,d16
++ vmull.u32 q8,d27,d0[1]
++ vmlal.u32 q7,d23,d1[1]
++ vadd.i32 d22,d22,d12
++ vmull.u32 q6,d23,d0[1]
++
++ vadd.i32 d28,d28,d18
++ vmull.u32 q9,d29,d0[1]
++ subs r2,r2,#64
++ vmlal.u32 q5,d29,d2[1]
++ it lo
++ movlo r4,r5
++ vmlal.u32 q8,d25,d1[1]
++ vld1.32 d8[1],[r7,:32]
++ vmlal.u32 q6,d21,d1[1]
++ vmlal.u32 q9,d27,d1[1]
++
++ vmlal.u32 q5,d27,d4[1]
++ vmlal.u32 q8,d23,d3[1]
++ vmlal.u32 q9,d25,d3[1]
++ vmlal.u32 q6,d29,d4[1]
++ vmlal.u32 q7,d21,d3[1]
++
++ vmlal.u32 q8,d21,d5[1]
++ vmlal.u32 q5,d25,d6[1]
++ vmlal.u32 q9,d23,d5[1]
++ vmlal.u32 q6,d27,d6[1]
++ vmlal.u32 q7,d29,d6[1]
++
++ vmlal.u32 q8,d29,d8[1]
++ vmlal.u32 q5,d23,d8[1]
++ vmlal.u32 q9,d21,d7[1]
++ vmlal.u32 q6,d25,d8[1]
++ vmlal.u32 q7,d27,d8[1]
++
++ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
++ add r4,r4,#64
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ (hash+inp[0:1])*r^4 and accumulate
++
++ vmlal.u32 q8,d26,d0[0]
++ vmlal.u32 q5,d20,d0[0]
++ vmlal.u32 q9,d28,d0[0]
++ vmlal.u32 q6,d22,d0[0]
++ vmlal.u32 q7,d24,d0[0]
++ vld1.32 d8[0],[r6,:32]
++
++ vmlal.u32 q8,d24,d1[0]
++ vmlal.u32 q5,d28,d2[0]
++ vmlal.u32 q9,d26,d1[0]
++ vmlal.u32 q6,d20,d1[0]
++ vmlal.u32 q7,d22,d1[0]
++
++ vmlal.u32 q8,d22,d3[0]
++ vmlal.u32 q5,d26,d4[0]
++ vmlal.u32 q9,d24,d3[0]
++ vmlal.u32 q6,d28,d4[0]
++ vmlal.u32 q7,d20,d3[0]
++
++ vmlal.u32 q8,d20,d5[0]
++ vmlal.u32 q5,d24,d6[0]
++ vmlal.u32 q9,d22,d5[0]
++ vmlal.u32 q6,d26,d6[0]
++ vmlal.u32 q8,d28,d8[0]
++
++ vmlal.u32 q7,d28,d6[0]
++ vmlal.u32 q5,d22,d8[0]
++ vmlal.u32 q9,d20,d7[0]
++ vmov.i32 q14,#1<<24 @ padbit, yes, always
++ vmlal.u32 q6,d24,d8[0]
++ vmlal.u32 q7,d26,d8[0]
++
++ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
++ add r1,r1,#64
++# ifdef __ARMEB__
++ vrev32.8 q10,q10
++ vrev32.8 q11,q11
++ vrev32.8 q12,q12
++ vrev32.8 q13,q13
++# endif
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
++ @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
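++ @ "lazy" = carries are propagated once per loop iteration instead
++ @ of after every multiply-accumulate; the h4 -> h0 wrap multiplies
++ @ the carry by 5 as c + (c<<2) (the vshl #2 / vadd pair on d30)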
++
++ vshr.u64 q15,q8,#26
++ vmovn.i64 d16,q8
++ vshr.u64 q4,q5,#26
++ vmovn.i64 d10,q5
++ vadd.i64 q9,q9,q15 @ h3 -> h4
++ vbic.i32 d16,#0xfc000000
++ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
++ vadd.i64 q6,q6,q4 @ h0 -> h1
++ vshl.u32 q13,q13,#18
++ vbic.i32 d10,#0xfc000000
++
++ vshrn.u64 d30,q9,#26
++ vmovn.i64 d18,q9
++ vshr.u64 q4,q6,#26
++ vmovn.i64 d12,q6
++ vadd.i64 q7,q7,q4 @ h1 -> h2
++ vsri.u32 q13,q12,#14
++ vbic.i32 d18,#0xfc000000
++ vshl.u32 q12,q12,#12
++ vbic.i32 d12,#0xfc000000
++
++ vadd.i32 d10,d10,d30
++ vshl.u32 d30,d30,#2
++ vbic.i32 q13,#0xfc000000
++ vshrn.u64 d8,q7,#26
++ vmovn.i64 d14,q7
++ vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
++ vsri.u32 q12,q11,#20
++ vadd.i32 d16,d16,d8 @ h2 -> h3
++ vshl.u32 q11,q11,#6
++ vbic.i32 d14,#0xfc000000
++ vbic.i32 q12,#0xfc000000
++
++ vshrn.u64 d30,q5,#26 @ re-narrow
++ vmovn.i64 d10,q5
++ vsri.u32 q11,q10,#26
++ vbic.i32 q10,#0xfc000000
++ vshr.u32 d8,d16,#26
++ vbic.i32 d16,#0xfc000000
++ vbic.i32 d10,#0xfc000000
++ vadd.i32 d12,d12,d30 @ h0 -> h1
++ vadd.i32 d18,d18,d8 @ h3 -> h4
++ vbic.i32 q11,#0xfc000000
++
++ bhi .Loop_neon
++
++.Lskip_loop:
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
++
++ add r7,r0,#(48+0*9*4)
++ add r6,r0,#(48+1*9*4)
++ adds r2,r2,#32
++ it ne
++ movne r2,#0
++ bne .Long_tail
++
++ vadd.i32 d25,d24,d14 @ add hash value and move to the hi (odd-d) lanes
++ vadd.i32 d21,d20,d10
++ vadd.i32 d27,d26,d16
++ vadd.i32 d23,d22,d12
++ vadd.i32 d29,d28,d18
++
++.Long_tail:
++ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
++ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
++
++ vadd.i32 d24,d24,d14 @ can be redundant
++ vmull.u32 q7,d25,d0
++ vadd.i32 d20,d20,d10
++ vmull.u32 q5,d21,d0
++ vadd.i32 d26,d26,d16
++ vmull.u32 q8,d27,d0
++ vadd.i32 d22,d22,d12
++ vmull.u32 q6,d23,d0
++ vadd.i32 d28,d28,d18
++ vmull.u32 q9,d29,d0
++
++ vmlal.u32 q5,d29,d2
++ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
++ vmlal.u32 q8,d25,d1
++ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
++ vmlal.u32 q6,d21,d1
++ vmlal.u32 q9,d27,d1
++ vmlal.u32 q7,d23,d1
++
++ vmlal.u32 q8,d23,d3
++ vld1.32 d8[1],[r7,:32]
++ vmlal.u32 q5,d27,d4
++ vld1.32 d8[0],[r6,:32]
++ vmlal.u32 q9,d25,d3
++ vmlal.u32 q6,d29,d4
++ vmlal.u32 q7,d21,d3
++
++ vmlal.u32 q8,d21,d5
++ it ne
++ addne r7,r0,#(48+2*9*4)
++ vmlal.u32 q5,d25,d6
++ it ne
++ addne r6,r0,#(48+3*9*4)
++ vmlal.u32 q9,d23,d5
++ vmlal.u32 q6,d27,d6
++ vmlal.u32 q7,d29,d6
++
++ vmlal.u32 q8,d29,d8
++ vorn q0,q0,q0 @ all-ones, can be redundant
++ vmlal.u32 q5,d23,d8
++ vshr.u64 q0,q0,#38
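++ @ q0 now holds 2^26-1 in each 64-bit lane (all-ones >> 38): the
++ @ limb mask used by the non-narrowing reduction at .Lshort_tail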
++ vmlal.u32 q9,d21,d7
++ vmlal.u32 q6,d25,d8
++ vmlal.u32 q7,d27,d8
++
++ beq .Lshort_tail
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ (hash+inp[0:1])*r^4:r^3 and accumulate
++
++ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
++ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
++
++ vmlal.u32 q7,d24,d0
++ vmlal.u32 q5,d20,d0
++ vmlal.u32 q8,d26,d0
++ vmlal.u32 q6,d22,d0
++ vmlal.u32 q9,d28,d0
++
++ vmlal.u32 q5,d28,d2
++ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
++ vmlal.u32 q8,d24,d1
++ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
++ vmlal.u32 q6,d20,d1
++ vmlal.u32 q9,d26,d1
++ vmlal.u32 q7,d22,d1
++
++ vmlal.u32 q8,d22,d3
++ vld1.32 d8[1],[r7,:32]
++ vmlal.u32 q5,d26,d4
++ vld1.32 d8[0],[r6,:32]
++ vmlal.u32 q9,d24,d3
++ vmlal.u32 q6,d28,d4
++ vmlal.u32 q7,d20,d3
++
++ vmlal.u32 q8,d20,d5
++ vmlal.u32 q5,d24,d6
++ vmlal.u32 q9,d22,d5
++ vmlal.u32 q6,d26,d6
++ vmlal.u32 q7,d28,d6
++
++ vmlal.u32 q8,d28,d8
++ vorn q0,q0,q0 @ all-ones
++ vmlal.u32 q5,d22,d8
++ vshr.u64 q0,q0,#38
++ vmlal.u32 q9,d20,d7
++ vmlal.u32 q6,d24,d8
++ vmlal.u32 q7,d26,d8
++
++.Lshort_tail:
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ horizontal addition
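++ @ each q accumulator held two interleaved block streams, one per
++ @ 64-bit lane; adding the high d half into the low one folds them
++ @ into a single sum per limb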
++
++ vadd.i64 d16,d16,d17
++ vadd.i64 d10,d10,d11
++ vadd.i64 d18,d18,d19
++ vadd.i64 d12,d12,d13
++ vadd.i64 d14,d14,d15
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ lazy reduction, but without narrowing
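++ @ same carry chain as in .Loop_neon, but kept in full 64-bit lanes;
++ @ only the low 32 bits of each limb are stored below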
++
++ vshr.u64 q15,q8,#26
++ vand.i64 q8,q8,q0
++ vshr.u64 q4,q5,#26
++ vand.i64 q5,q5,q0
++ vadd.i64 q9,q9,q15 @ h3 -> h4
++ vadd.i64 q6,q6,q4 @ h0 -> h1
++
++ vshr.u64 q15,q9,#26
++ vand.i64 q9,q9,q0
++ vshr.u64 q4,q6,#26
++ vand.i64 q6,q6,q0
++ vadd.i64 q7,q7,q4 @ h1 -> h2
++
++ vadd.i64 q5,q5,q15
++ vshl.u64 q15,q15,#2
++ vshr.u64 q4,q7,#26
++ vand.i64 q7,q7,q0
++ vadd.i64 q5,q5,q15 @ h4 -> h0
++ vadd.i64 q8,q8,q4 @ h2 -> h3
++
++ vshr.u64 q15,q5,#26
++ vand.i64 q5,q5,q0
++ vshr.u64 q4,q8,#26
++ vand.i64 q8,q8,q0
++ vadd.i64 q6,q6,q15 @ h0 -> h1
++ vadd.i64 q9,q9,q4 @ h3 -> h4
++
++ cmp r2,#0
++ bne .Leven
++
++ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++ @ store hash value
++
++ vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
++ vst1.32 {d18[0]},[r0]
++
++ vldmia sp!,{d8-d15} @ epilogue
++ ldmia sp!,{r4-r7}
++ bx lr @ bx lr
++.size poly1305_blocks_neon,.-poly1305_blocks_neon
++
++.align 5
++.Lzeros:
++.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
++#ifndef __KERNEL__
++.LOPENSSL_armcap:
++# ifdef _WIN32
++.word OPENSSL_armcap_P
++# else
++.word OPENSSL_armcap_P-.Lpoly1305_init
++# endif
++.comm OPENSSL_armcap_P,4,4
++.hidden OPENSSL_armcap_P
++#endif
++#endif
++.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by @dot-asm"
++.align 2
+--- /dev/null
++++ b/arch/arm/crypto/poly1305-glue.c
+@@ -0,0 +1,276 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
++ *
++ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
++ */
++
++#include <asm/hwcap.h>
++#include <asm/neon.h>
++#include <asm/simd.h>
++#include <asm/unaligned.h>
++#include <crypto/algapi.h>
++#include <crypto/internal/hash.h>
++#include <crypto/internal/poly1305.h>
++#include <crypto/internal/simd.h>
++#include <linux/cpufeature.h>
++#include <linux/crypto.h>
++#include <linux/jump_label.h>
++#include <linux/module.h>
++
++void poly1305_init_arm(void *state, const u8 *key);
++void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
++void poly1305_emit_arm(void *state, __le32 *digest, const u32 *nonce);
++
++void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
++{
++}
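++
++/*
++ * The __weak no-op above keeps builds whose poly1305-core.S carries
++ * no NEON code linkable; the have_neon static key ensures it is
++ * never actually called in that configuration.
++ */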
++
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
++
++void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
++{
++ poly1305_init_arm(&dctx->h, key);
++ dctx->s[0] = get_unaligned_le32(key + 16);
++ dctx->s[1] = get_unaligned_le32(key + 20);
++ dctx->s[2] = get_unaligned_le32(key + 24);
++ dctx->s[3] = get_unaligned_le32(key + 28);
++ dctx->buflen = 0;
++}
++EXPORT_SYMBOL(poly1305_init_arch);
++
++static int arm_poly1305_init(struct shash_desc *desc)
++{
++ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++ dctx->buflen = 0;
++ dctx->rset = 0;
++ dctx->sset = false;
++
++ return 0;
++}
++
++static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
++ u32 len, u32 hibit, bool do_neon)
++{
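++ /*
++ * The shash interface has no setkey: the 32-byte key arrives
++ * in-band as the first two blocks, r (clamped inside
++ * poly1305_init_arm) first, then s.
++ */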
++ if (unlikely(!dctx->sset)) {
++ if (!dctx->rset) {
++ poly1305_init_arm(&dctx->h, src);
++ src += POLY1305_BLOCK_SIZE;
++ len -= POLY1305_BLOCK_SIZE;
++ dctx->rset = 1;
++ }
++ if (len >= POLY1305_BLOCK_SIZE) {
++ dctx->s[0] = get_unaligned_le32(src + 0);
++ dctx->s[1] = get_unaligned_le32(src + 4);
++ dctx->s[2] = get_unaligned_le32(src + 8);
++ dctx->s[3] = get_unaligned_le32(src + 12);
++ src += POLY1305_BLOCK_SIZE;
++ len -= POLY1305_BLOCK_SIZE;
++ dctx->sset = true;
++ }
++ if (len < POLY1305_BLOCK_SIZE)
++ return;
++ }
++
++ len &= ~(POLY1305_BLOCK_SIZE - 1);
++
++ if (static_branch_likely(&have_neon) && likely(do_neon))
++ poly1305_blocks_neon(&dctx->h, src, len, hibit);
++ else
++ poly1305_blocks_arm(&dctx->h, src, len, hibit);
++}
++
++static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
++ const u8 *src, u32 len, bool do_neon)
++{
++ if (unlikely(dctx->buflen)) {
++ u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++ memcpy(dctx->buf + dctx->buflen, src, bytes);
++ src += bytes;
++ len -= bytes;
++ dctx->buflen += bytes;
++
++ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
++ arm_poly1305_blocks(dctx, dctx->buf,
++ POLY1305_BLOCK_SIZE, 1, false);
++ dctx->buflen = 0;
++ }
++ }
++
++ if (likely(len >= POLY1305_BLOCK_SIZE)) {
++ arm_poly1305_blocks(dctx, src, len, 1, do_neon);
++ src += round_down(len, POLY1305_BLOCK_SIZE);
++ len %= POLY1305_BLOCK_SIZE;
++ }
++
++ if (unlikely(len)) {
++ dctx->buflen = len;
++ memcpy(dctx->buf, src, len);
++ }
++}
++
++static int arm_poly1305_update(struct shash_desc *desc,
++ const u8 *src, unsigned int srclen)
++{
++ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++ arm_poly1305_do_update(dctx, src, srclen, false);
++ return 0;
++}
++
++static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
++ const u8 *src,
++ unsigned int srclen)
++{
++ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++ bool do_neon = crypto_simd_usable() && srclen > 128;
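++ /*
++ * kernel_neon_begin()/end() carry real overhead; below the
++ * 128-byte cutoff the scalar code is the better deal.
++ */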
++
++ if (static_branch_likely(&have_neon) && do_neon)
++ kernel_neon_begin();
++ arm_poly1305_do_update(dctx, src, srclen, do_neon);
++ if (static_branch_likely(&have_neon) && do_neon)
++ kernel_neon_end();
++ return 0;
++}
++
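++/*
++ * Library interface (CRYPTO_ARCH_HAVE_LIB_POLY1305): called directly
++ * by in-kernel users such as WireGuard, independently of the shash
++ * wrappers above.
++ */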
++void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
++ unsigned int nbytes)
++{
++ bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
++ crypto_simd_usable();
++
++ if (unlikely(dctx->buflen)) {
++ u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++ memcpy(dctx->buf + dctx->buflen, src, bytes);
++ src += bytes;
++ nbytes -= bytes;
++ dctx->buflen += bytes;
++
++ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
++ poly1305_blocks_arm(&dctx->h, dctx->buf,
++ POLY1305_BLOCK_SIZE, 1);
++ dctx->buflen = 0;
++ }
++ }
++
++ if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
++ unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
++
++ if (static_branch_likely(&have_neon) && do_neon) {
++ kernel_neon_begin();
++ poly1305_blocks_neon(&dctx->h, src, len, 1);
++ kernel_neon_end();
++ } else {
++ poly1305_blocks_arm(&dctx->h, src, len, 1);
++ }
++ src += len;
++ nbytes %= POLY1305_BLOCK_SIZE;
++ }
++
++ if (unlikely(nbytes)) {
++ dctx->buflen = nbytes;
++ memcpy(dctx->buf, src, nbytes);
++ }
++}
++EXPORT_SYMBOL(poly1305_update_arch);
++
++void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
++{
++ __le32 digest[4];
++ u64 f = 0;
++
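++ /*
++ * Standard Poly1305 padding for a trailing partial block: append
++ * a 0x01 byte, zero-fill to 16 bytes, and process with hibit == 0
++ * so no extra 2^128 term is added on top of the pad byte.
++ */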
++ if (unlikely(dctx->buflen)) {
++ dctx->buf[dctx->buflen++] = 1;
++ memset(dctx->buf + dctx->buflen, 0,
++ POLY1305_BLOCK_SIZE - dctx->buflen);
++ poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
++ }
++
++ poly1305_emit_arm(&dctx->h, digest, dctx->s);
++
++ /* mac = (h + s) % (2^128) */
++ f = (f >> 32) + le32_to_cpu(digest[0]);
++ put_unaligned_le32(f, dst);
++ f = (f >> 32) + le32_to_cpu(digest[1]);
++ put_unaligned_le32(f, dst + 4);
++ f = (f >> 32) + le32_to_cpu(digest[2]);
++ put_unaligned_le32(f, dst + 8);
++ f = (f >> 32) + le32_to_cpu(digest[3]);
++ put_unaligned_le32(f, dst + 12);
++
++ *dctx = (struct poly1305_desc_ctx){};
++}
++EXPORT_SYMBOL(poly1305_final_arch);
++
++static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
++{
++ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++ if (unlikely(!dctx->sset))
++ return -ENOKEY;
++
++ poly1305_final_arch(dctx, dst);
++ return 0;
++}
++
++static struct shash_alg arm_poly1305_algs[] = {{
++ .init = arm_poly1305_init,
++ .update = arm_poly1305_update,
++ .final = arm_poly1305_final,
++ .digestsize = POLY1305_DIGEST_SIZE,
++ .descsize = sizeof(struct poly1305_desc_ctx),
++
++ .base.cra_name = "poly1305",
++ .base.cra_driver_name = "poly1305-arm",
++ .base.cra_priority = 150,
++ .base.cra_blocksize = POLY1305_BLOCK_SIZE,
++ .base.cra_module = THIS_MODULE,
++#ifdef CONFIG_KERNEL_MODE_NEON
++}, {
++ .init = arm_poly1305_init,
++ .update = arm_poly1305_update_neon,
++ .final = arm_poly1305_final,
++ .digestsize = POLY1305_DIGEST_SIZE,
++ .descsize = sizeof(struct poly1305_desc_ctx),
++
++ .base.cra_name = "poly1305",
++ .base.cra_driver_name = "poly1305-neon",
++ .base.cra_priority = 200,
++ .base.cra_blocksize = POLY1305_BLOCK_SIZE,
++ .base.cra_module = THIS_MODULE,
++#endif
++}};
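++
++/*
++ * Both entries implement "poly1305"; the higher cra_priority of the
++ * NEON variant (200 vs 150) makes the crypto API prefer it whenever
++ * both are registered.
++ */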
++
++static int __init arm_poly1305_mod_init(void)
++{
++ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
++ (elf_hwcap & HWCAP_NEON))
++ static_branch_enable(&have_neon);
++ else
++ /* register only the first entry */
++ return crypto_register_shash(&arm_poly1305_algs[0]);
++
++ return crypto_register_shashes(arm_poly1305_algs,
++ ARRAY_SIZE(arm_poly1305_algs));
++}
++
++static void __exit arm_poly1305_mod_exit(void)
++{
++ if (!static_branch_likely(&have_neon)) {
++ crypto_unregister_shash(&arm_poly1305_algs[0]);
++ return;
++ }
++ crypto_unregister_shashes(arm_poly1305_algs,
++ ARRAY_SIZE(arm_poly1305_algs));
++}
++
++module_init(arm_poly1305_mod_init);
++module_exit(arm_poly1305_mod_exit);
++
++MODULE_LICENSE("GPL v2");
++MODULE_ALIAS_CRYPTO("poly1305");
++MODULE_ALIAS_CRYPTO("poly1305-arm");
++MODULE_ALIAS_CRYPTO("poly1305-neon");
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -40,7 +40,7 @@ config CRYPTO_LIB_DES
+ config CRYPTO_LIB_POLY1305_RSIZE
+ int
+ default 4 if X86_64
+- default 9 if ARM64
++ default 9 if ARM || ARM64
+ default 1
+
+ config CRYPTO_ARCH_HAVE_LIB_POLY1305