aboutsummaryrefslogtreecommitdiffstats
path: root/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
diff options
context:
space:
mode:
Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch')
-rw-r--r--target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch1563
1 files changed, 0 insertions, 1563 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch b/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
deleted file mode 100644
index 272e1797da..0000000000
--- a/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
+++ /dev/null
@@ -1,1563 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Ard Biesheuvel <ardb@kernel.org>
-Date: Fri, 8 Nov 2019 13:22:26 +0100
-Subject: [PATCH] crypto: mips/poly1305 - incorporate OpenSSL/CRYPTOGAMS
- optimized implementation
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-commit a11d055e7a64ac34a5e99b6fe731299449cbcd58 upstream.
-
-This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation for
-MIPS authored by Andy Polyakov, a prior 64-bit only version of which has been
-contributed by him to the OpenSSL project. The file 'poly1305-mips.pl' is taken
-straight from this upstream GitHub repository [0] at commit
-d22ade312a7af958ec955620b0d241cf42c37feb, and already contains all the changes
-required to build it as part of a Linux kernel module.
-
-[0] https://github.com/dot-asm/cryptogams
-
-Co-developed-by: Andy Polyakov <appro@cryptogams.org>
-Signed-off-by: Andy Polyakov <appro@cryptogams.org>
-Co-developed-by: René van Dorst <opensource@vdorst.com>
-Signed-off-by: René van Dorst <opensource@vdorst.com>
-Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
-Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
----
- arch/mips/crypto/Makefile | 14 +
- arch/mips/crypto/poly1305-glue.c | 203 +++++
- arch/mips/crypto/poly1305-mips.pl | 1273 +++++++++++++++++++++++++++++
- crypto/Kconfig | 5 +
- lib/crypto/Kconfig | 1 +
- 5 files changed, 1496 insertions(+)
- create mode 100644 arch/mips/crypto/poly1305-glue.c
- create mode 100644 arch/mips/crypto/poly1305-mips.pl
-
---- a/arch/mips/crypto/Makefile
-+++ b/arch/mips/crypto/Makefile
-@@ -8,3 +8,17 @@ obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32
- obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
- chacha-mips-y := chacha-core.o chacha-glue.o
- AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
-+
-+obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
-+poly1305-mips-y := poly1305-core.o poly1305-glue.o
-+# Flavour argument handed to poly1305-mips.pl, chosen by the configured CPU type
-+perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
-+perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
-+# Run the Perl script with the flavour and the target file name as arguments
-+quiet_cmd_perlasm = PERLASM $@
-+ cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
-+
-+$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
-+ $(call if_changed,perlasm)
-+
-+targets += poly1305-core.S
---- /dev/null
-+++ b/arch/mips/crypto/poly1305-glue.c
-@@ -0,0 +1,203 @@
-+// SPDX-License-Identifier: GPL-2.0
-+/*
-+ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
-+ *
-+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
-+ */
-+
-+#include <asm/unaligned.h>
-+#include <crypto/algapi.h>
-+#include <crypto/internal/hash.h>
-+#include <crypto/internal/poly1305.h>
-+#include <linux/cpufeature.h>
-+#include <linux/crypto.h>
-+#include <linux/module.h>
-+
-+asmlinkage void poly1305_init_mips(void *state, const u8 *key);
-+asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
-+asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce);
-+/* Set up dctx from key: the asm core takes key[0..15], s is read from key[16..31]. */
-+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
-+{
-+ poly1305_init_mips(&dctx->h, key);
-+ dctx->s[0] = get_unaligned_le32(key + 16);
-+ dctx->s[1] = get_unaligned_le32(key + 20);
-+ dctx->s[2] = get_unaligned_le32(key + 24);
-+ dctx->s[3] = get_unaligned_le32(key + 28);
-+ dctx->buflen = 0; /* no partial block buffered yet */
-+}
-+EXPORT_SYMBOL(poly1305_init_arch);
-+/* shash .init: start with an empty buffer and neither key part (r, s) set. */
-+static int mips_poly1305_init(struct shash_desc *desc)
-+{
-+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-+
-+ dctx->buflen = 0;
-+ dctx->rset = 0;
-+ dctx->sset = false;
-+
-+ return 0;
-+}
-+/* Hash whole blocks from src; while the key is unset, leading blocks supply r, then s. */
-+static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
-+ u32 len, u32 hibit)
-+{
-+ if (unlikely(!dctx->sset)) {
-+ if (!dctx->rset) {
-+ poly1305_init_mips(&dctx->h, src); /* first block is the r key part */
-+ src += POLY1305_BLOCK_SIZE;
-+ len -= POLY1305_BLOCK_SIZE;
-+ dctx->rset = 1;
-+ }
-+ if (len >= POLY1305_BLOCK_SIZE) { /* next block is the s key part */
-+ dctx->s[0] = get_unaligned_le32(src + 0);
-+ dctx->s[1] = get_unaligned_le32(src + 4);
-+ dctx->s[2] = get_unaligned_le32(src + 8);
-+ dctx->s[3] = get_unaligned_le32(src + 12);
-+ src += POLY1305_BLOCK_SIZE;
-+ len -= POLY1305_BLOCK_SIZE;
-+ dctx->sset = true;
-+ }
-+ if (len < POLY1305_BLOCK_SIZE)
-+ return; /* no full block left after taking the key */
-+ }
-+
-+ len &= ~(POLY1305_BLOCK_SIZE - 1); /* drop any trailing partial block */
-+
-+ poly1305_blocks_mips(&dctx->h, src, len, hibit);
-+}
-+/* shash .update: complete a buffered block, hash full blocks, buffer the tail. */
-+static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
-+ unsigned int len)
-+{
-+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-+
-+ if (unlikely(dctx->buflen)) {
-+ u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
-+
-+ memcpy(dctx->buf + dctx->buflen, src, bytes);
-+ src += bytes;
-+ len -= bytes;
-+ dctx->buflen += bytes;
-+
-+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-+ mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
-+ dctx->buflen = 0;
-+ }
-+ }
-+
-+ if (likely(len >= POLY1305_BLOCK_SIZE)) {
-+ mips_poly1305_blocks(dctx, src, len, 1); /* callee rounds len down itself */
-+ src += round_down(len, POLY1305_BLOCK_SIZE);
-+ len %= POLY1305_BLOCK_SIZE;
-+ }
-+
-+ if (unlikely(len)) {
-+ dctx->buflen = len; /* keep the tail for the next update or final */
-+ memcpy(dctx->buf, src, len);
-+ }
-+ return 0;
-+}
-+/* Exported update: same buffering scheme, but feeds the asm core directly (no key parsing). */
-+void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
-+ unsigned int nbytes)
-+{
-+ if (unlikely(dctx->buflen)) {
-+ u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
-+
-+ memcpy(dctx->buf + dctx->buflen, src, bytes);
-+ src += bytes;
-+ nbytes -= bytes;
-+ dctx->buflen += bytes;
-+
-+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-+ poly1305_blocks_mips(&dctx->h, dctx->buf,
-+ POLY1305_BLOCK_SIZE, 1);
-+ dctx->buflen = 0;
-+ }
-+ }
-+
-+ if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
-+ unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
-+
-+ poly1305_blocks_mips(&dctx->h, src, len, 1);
-+ src += len;
-+ nbytes %= POLY1305_BLOCK_SIZE;
-+ }
-+
-+ if (unlikely(nbytes)) {
-+ dctx->buflen = nbytes; /* keep the tail for the next update or final */
-+ memcpy(dctx->buf, src, nbytes);
-+ }
-+}
-+EXPORT_SYMBOL(poly1305_update_arch);
-+/* Pad and hash any buffered tail, write the 16-byte tag to dst, then wipe *dctx. */
-+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
-+{
-+ __le32 digest[4];
-+ u64 f = 0;
-+
-+ if (unlikely(dctx->buflen)) {
-+ dctx->buf[dctx->buflen++] = 1; /* pad byte goes right after the data */
-+ memset(dctx->buf + dctx->buflen, 0,
-+ POLY1305_BLOCK_SIZE - dctx->buflen);
-+ poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); /* hibit 0: pad is in the data */
-+ }
-+
-+ poly1305_emit_mips(&dctx->h, digest, dctx->s);
-+
-+ /* mac = (h + s) % (2^128) */
-+ f = (f >> 32) + le32_to_cpu(digest[0]); /* NOTE(review): f never exceeds 32 bits, so this chain only copies; the asm emit appears to add s already - confirm */
-+ put_unaligned_le32(f, dst);
-+ f = (f >> 32) + le32_to_cpu(digest[1]);
-+ put_unaligned_le32(f, dst + 4);
-+ f = (f >> 32) + le32_to_cpu(digest[2]);
-+ put_unaligned_le32(f, dst + 8);
-+ f = (f >> 32) + le32_to_cpu(digest[3]);
-+ put_unaligned_le32(f, dst + 12);
-+
-+ *dctx = (struct poly1305_desc_ctx){}; /* clear key and hash state */
-+}
-+EXPORT_SYMBOL(poly1305_final_arch);
-+/* shash .final: return -ENOKEY unless the s key part was supplied, else emit the tag. */
-+static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
-+{
-+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-+
-+ if (unlikely(!dctx->sset))
-+ return -ENOKEY;
-+
-+ poly1305_final_arch(dctx, dst);
-+ return 0;
-+}
-+/* shash descriptor: algorithm "poly1305" provided by driver "poly1305-mips". */
-+static struct shash_alg mips_poly1305_alg = {
-+ .init = mips_poly1305_init,
-+ .update = mips_poly1305_update,
-+ .final = mips_poly1305_final,
-+ .digestsize = POLY1305_DIGEST_SIZE,
-+ .descsize = sizeof(struct poly1305_desc_ctx), /* per-request state lives in the shash desc */
-+
-+ .base.cra_name = "poly1305",
-+ .base.cra_driver_name = "poly1305-mips",
-+ .base.cra_priority = 200,
-+ .base.cra_blocksize = POLY1305_BLOCK_SIZE,
-+ .base.cra_module = THIS_MODULE,
-+};
-+/* Module init: register the shash algorithm and pass its status back. */
-+static int __init mips_poly1305_mod_init(void)
-+{
-+ return crypto_register_shash(&mips_poly1305_alg);
-+}
-+/* Module exit: unregister the shash algorithm again. */
-+static void __exit mips_poly1305_mod_exit(void)
-+{
-+ crypto_unregister_shash(&mips_poly1305_alg);
-+}
-+
-+module_init(mips_poly1305_mod_init);
-+module_exit(mips_poly1305_mod_exit);
-+
-+MODULE_LICENSE("GPL v2");
-+MODULE_ALIAS_CRYPTO("poly1305");
-+MODULE_ALIAS_CRYPTO("poly1305-mips");
---- /dev/null
-+++ b/arch/mips/crypto/poly1305-mips.pl
-@@ -0,0 +1,1273 @@
-+#!/usr/bin/env perl
-+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-+#
-+# ====================================================================
-+# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
-+# project.
-+# ====================================================================
-+
-+# Poly1305 hash for MIPS.
-+#
-+# May 2016
-+#
-+# Numbers are cycles per processed byte with poly1305_blocks alone.
-+#
-+# IALU/gcc
-+# R1x000 ~5.5/+130% (big-endian)
-+# Octeon II 2.50/+70% (little-endian)
-+#
-+# March 2019
-+#
-+# Add 32-bit code path.
-+#
-+# October 2019
-+#
-+# Modulo-scheduled reduction lets the dependency chain at the end of
-+# the inner loop be omitted, improving performance. Also optimize the
-+# MIPS32R2 code path for the MIPS 1004K core. Per René van Dorst.
-+#
-+# IALU/gcc
-+# R1x000 ~9.8/? (big-endian)
-+# Octeon II 3.65/+140% (little-endian)
-+# MT7621/1004K 4.75/? (little-endian)
-+#
-+######################################################################
-+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
-+# widely used. Then there is a new contender: NUBI. It appears that if
-+# one picks the latter, it's possible to arrange code in an ABI-neutral
-+# manner. Therefore let's stick to the NUBI register layout:
-+#
-+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
-+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
-+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
-+#
-+# The return value is placed in $a0. The following coding rules
-+# facilitate interoperability:
-+#
-+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
-+# excluded from the rule, because it's specified volatile];
-+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
-+# old code];
-+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
-+#
-+# For reference here is register layout for N32/64 MIPS ABIs:
-+#
-+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
-+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
-+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
-+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
-+#
-+# <appro@openssl.org>
-+#
-+######################################################################
-+
-+$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
-+
-+$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; # return-value register: $a0 for nubi flavours, $t0 otherwise
-+
-+if ($flavour =~ /64|n32/i) {{{
-+######################################################################
-+# 64-bit code path
-+#
-+
-+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
-+my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
-+
-+$code.=<<___;
-+#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
-+ defined(_MIPS_ARCH_MIPS64R6)) \\
-+ && !defined(_MIPS_ARCH_MIPS64R2)
-+# define _MIPS_ARCH_MIPS64R2
-+#endif
-+
-+#if defined(_MIPS_ARCH_MIPS64R6)
-+# define dmultu(rs,rt)
-+# define mflo(rd,rs,rt) dmulu rd,rs,rt
-+# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
-+#else
-+# define dmultu(rs,rt) dmultu rs,rt
-+# define mflo(rd,rs,rt) mflo rd
-+# define mfhi(rd,rs,rt) mfhi rd
-+#endif
-+
-+#ifdef __KERNEL__
-+# define poly1305_init poly1305_init_mips
-+# define poly1305_blocks poly1305_blocks_mips
-+# define poly1305_emit poly1305_emit_mips
-+#endif
-+
-+#if defined(__MIPSEB__) && !defined(MIPSEB)
-+# define MIPSEB
-+#endif
-+
-+#ifdef MIPSEB
-+# define MSB 0
-+# define LSB 7
-+#else
-+# define MSB 7
-+# define LSB 0
-+#endif
-+
-+.text
-+.set noat
-+.set noreorder
-+
-+.align 5
-+.globl poly1305_init
-+.ent poly1305_init
-+poly1305_init: # args: ctx, key - zero the hash value and store the masked key
-+ .frame $sp,0,$ra
-+ .set reorder
-+
-+ sd $zero,0($ctx) # clear hash value
-+ sd $zero,8($ctx)
-+ sd $zero,16($ctx)
-+
-+ beqz $inp,.Lno_key # null key pointer: only clear the hash
-+
-+#if defined(_MIPS_ARCH_MIPS64R6)
-+ andi $tmp0,$inp,7 # $inp % 8
-+ dsubu $inp,$inp,$tmp0 # align $inp
-+ sll $tmp0,$tmp0,3 # byte to bit offset
-+ ld $in0,0($inp)
-+ ld $in1,8($inp)
-+ beqz $tmp0,.Laligned_key
-+ ld $tmp2,16($inp)
-+
-+ subu $tmp1,$zero,$tmp0
-+# ifdef MIPSEB
-+ dsllv $in0,$in0,$tmp0
-+ dsrlv $tmp3,$in1,$tmp1
-+ dsllv $in1,$in1,$tmp0
-+ dsrlv $tmp2,$tmp2,$tmp1
-+# else
-+ dsrlv $in0,$in0,$tmp0
-+ dsllv $tmp3,$in1,$tmp1
-+ dsrlv $in1,$in1,$tmp0
-+ dsllv $tmp2,$tmp2,$tmp1
-+# endif
-+ or $in0,$in0,$tmp3
-+ or $in1,$in1,$tmp2
-+.Laligned_key:
-+#else
-+ ldl $in0,0+MSB($inp) # left/right load pairs read the 16 key bytes at any alignment
-+ ldl $in1,8+MSB($inp)
-+ ldr $in0,0+LSB($inp)
-+ ldr $in1,8+LSB($inp)
-+#endif
-+#ifdef MIPSEB
-+# if defined(_MIPS_ARCH_MIPS64R2)
-+ dsbh $in0,$in0 # byte swap
-+ dsbh $in1,$in1
-+ dshd $in0,$in0
-+ dshd $in1,$in1
-+# else
-+ ori $tmp0,$zero,0xFF
-+ dsll $tmp2,$tmp0,32
-+ or $tmp0,$tmp2 # 0x000000FF000000FF
-+
-+ and $tmp1,$in0,$tmp0 # byte swap
-+ and $tmp3,$in1,$tmp0
-+ dsrl $tmp2,$in0,24
-+ dsrl $tmp4,$in1,24
-+ dsll $tmp1,24
-+ dsll $tmp3,24
-+ and $tmp2,$tmp0
-+ and $tmp4,$tmp0
-+ dsll $tmp0,8 # 0x0000FF000000FF00
-+ or $tmp1,$tmp2
-+ or $tmp3,$tmp4
-+ and $tmp2,$in0,$tmp0
-+ and $tmp4,$in1,$tmp0
-+ dsrl $in0,8
-+ dsrl $in1,8
-+ dsll $tmp2,8
-+ dsll $tmp4,8
-+ and $in0,$tmp0
-+ and $in1,$tmp0
-+ or $tmp1,$tmp2
-+ or $tmp3,$tmp4
-+ or $in0,$tmp1
-+ or $in1,$tmp3
-+ dsrl $tmp1,$in0,32
-+ dsrl $tmp3,$in1,32
-+ dsll $in0,32
-+ dsll $in1,32
-+ or $in0,$tmp1
-+ or $in1,$tmp3
-+# endif
-+#endif
-+ li $tmp0,1
-+ dsll $tmp0,32 # 0x0000000100000000
-+ daddiu $tmp0,-63 # 0x00000000ffffffc1
-+ dsll $tmp0,28 # 0x0ffffffc10000000
-+ daddiu $tmp0,-1 # 0x0ffffffc0fffffff
-+
-+ and $in0,$tmp0 # mask (clamp) r0
-+ daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
-+ and $in1,$tmp0 # mask (clamp) r1
-+
-+ sd $in0,24($ctx) # store r0
-+ dsrl $tmp0,$in1,2
-+ sd $in1,32($ctx) # store r1
-+ daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
-+ sd $tmp0,40($ctx) # store s1
-+
-+.Lno_key:
-+ li $v0,0 # return 0
-+ jr $ra
-+.end poly1305_init
-+___
-+{
-+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
-+
-+my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
-+ ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
-+my ($shr,$shl) = ($s6,$s7); # used on R6
-+
-+$code.=<<___;
-+.align 5
-+.globl poly1305_blocks
-+.ent poly1305_blocks
-+poly1305_blocks: # args: ctx, inp, len, padbit - thin entry that skips the frame setup for short input
-+ .set noreorder
-+ dsrl $len,4 # number of complete blocks
-+ bnez $len,poly1305_blocks_internal # at least one block: do the real work
-+ nop # branch delay slot
-+ jr $ra # no complete block: return at once
-+ nop # jump delay slot
-+.end poly1305_blocks
-+
-+.align 5
-+.ent poly1305_blocks_internal
-+poly1305_blocks_internal: # entered with len already converted to a non-zero block count
-+ .set noreorder
-+#if defined(_MIPS_ARCH_MIPS64R6)
-+ .frame $sp,8*8,$ra
-+ .mask $SAVED_REGS_MASK|0x000c0000,-8
-+ dsubu $sp,8*8
-+ sd $s7,56($sp)
-+ sd $s6,48($sp)
-+#else
-+ .frame $sp,6*8,$ra
-+ .mask $SAVED_REGS_MASK,-8
-+ dsubu $sp,6*8
-+#endif
-+ sd $s5,40($sp) # prologue: save the registers used below
-+ sd $s4,32($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
-+ sd $s3,24($sp)
-+ sd $s2,16($sp)
-+ sd $s1,8($sp)
-+ sd $s0,0($sp)
-+___
-+$code.=<<___;
-+ .set reorder
-+
-+#if defined(_MIPS_ARCH_MIPS64R6)
-+ andi $shr,$inp,7
-+ dsubu $inp,$inp,$shr # align $inp
-+ sll $shr,$shr,3 # byte to bit offset
-+ subu $shl,$zero,$shr
-+#endif
-+
-+ ld $h0,0($ctx) # load hash value
-+ ld $h1,8($ctx)
-+ ld $h2,16($ctx)
-+
-+ ld $r0,24($ctx) # load key
-+ ld $r1,32($ctx)
-+ ld $rs1,40($ctx)
-+
-+ dsll $len,4 # block count back to a byte length
-+ daddu $len,$inp # end of buffer
-+ b .Loop
-+
-+.align 4
-+.Loop:
-+#if defined(_MIPS_ARCH_MIPS64R6)
-+ ld $in0,0($inp) # load input
-+ ld $in1,8($inp)
-+ beqz $shr,.Laligned_inp
-+
-+ ld $tmp2,16($inp)
-+# ifdef MIPSEB
-+ dsllv $in0,$in0,$shr
-+ dsrlv $tmp3,$in1,$shl
-+ dsllv $in1,$in1,$shr
-+ dsrlv $tmp2,$tmp2,$shl
-+# else
-+ dsrlv $in0,$in0,$shr
-+ dsllv $tmp3,$in1,$shl
-+ dsrlv $in1,$in1,$shr
-+ dsllv $tmp2,$tmp2,$shl
-+# endif
-+ or $in0,$in0,$tmp3
-+ or $in1,$in1,$tmp2
-+.Laligned_inp:
-+#else
-+ ldl $in0,0+MSB($inp) # load input
-+ ldl $in1,8+MSB($inp)
-+ ldr $in0,0+LSB($inp)
-+ ldr $in1,8+LSB($inp)
-+#endif
-+ daddiu $inp,16 # advance input by one block
-+#ifdef MIPSEB
-+# if defined(_MIPS_ARCH_MIPS64R2)
-+ dsbh $in0,$in0 # byte swap
-+ dsbh $in1,$in1
-+ dshd $in0,$in0
-+ dshd $in1,$in1
-+# else
-+ ori $tmp0,$zero,0xFF
-+ dsll $tmp2,$tmp0,32
-+ or $tmp0,$tmp2 # 0x000000FF000000FF
-+
-+ and $tmp1,$in0,$tmp0 # byte swap
-+ and $tmp3,$in1,$tmp0
-+ dsrl $tmp2,$in0,24
-+ dsrl $tmp4,$in1,24
-+ dsll $tmp1,24
-+ dsll $tmp3,24
-+ and $tmp2,$tmp0
-+ and $tmp4,$tmp0
-+ dsll $tmp0,8 # 0x0000FF000000FF00
-+ or $tmp1,$tmp2
-+ or $tmp3,$tmp4
-+ and $tmp2,$in0,$tmp0
-+ and $tmp4,$in1,$tmp0
-+ dsrl $in0,8
-+ dsrl $in1,8
-+ dsll $tmp2,8
-+ dsll $tmp4,8
-+ and $in0,$tmp0
-+ and $in1,$tmp0
-+ or $tmp1,$tmp2
-+ or $tmp3,$tmp4
-+ or $in0,$tmp1
-+ or $in1,$tmp3
-+ dsrl $tmp1,$in0,32
-+ dsrl $tmp3,$in1,32
-+ dsll $in0,32
-+ dsll $in1,32
-+ or $in0,$tmp1
-+ or $in1,$tmp3
-+# endif
-+#endif
-+ dsrl $tmp1,$h2,2 # modulo-scheduled reduction
-+ andi $h2,$h2,3
-+ dsll $tmp0,$tmp1,2
-+
-+ daddu $d0,$h0,$in0 # accumulate input
-+ daddu $tmp1,$tmp0
-+ sltu $tmp0,$d0,$h0
-+ daddu $d0,$d0,$tmp1 # ... and residue
-+ sltu $tmp1,$d0,$tmp1
-+ daddu $d1,$h1,$in1
-+ daddu $tmp0,$tmp1
-+ sltu $tmp1,$d1,$h1
-+ daddu $d1,$tmp0
-+
-+ dmultu ($r0,$d0) # h0*r0
-+ daddu $d2,$h2,$padbit # top limb plus pad bit
-+ sltu $tmp0,$d1,$tmp0
-+ mflo ($h0,$r0,$d0)
-+ mfhi ($h1,$r0,$d0)
-+
-+ dmultu ($rs1,$d1) # h1*5*r1
-+ daddu $d2,$tmp1
-+ daddu $d2,$tmp0
-+ mflo ($tmp0,$rs1,$d1)
-+ mfhi ($tmp1,$rs1,$d1)
-+
-+ dmultu ($r1,$d0) # h0*r1
-+ mflo ($tmp2,$r1,$d0)
-+ mfhi ($h2,$r1,$d0)
-+ daddu $h0,$tmp0
-+ daddu $h1,$tmp1
-+ sltu $tmp0,$h0,$tmp0
-+
-+ dmultu ($r0,$d1) # h1*r0
-+ daddu $h1,$tmp0
-+ daddu $h1,$tmp2
-+ mflo ($tmp0,$r0,$d1)
-+ mfhi ($tmp1,$r0,$d1)
-+
-+ dmultu ($rs1,$d2) # h2*5*r1
-+ sltu $tmp2,$h1,$tmp2
-+ daddu $h2,$tmp2
-+ mflo ($tmp2,$rs1,$d2)
-+
-+ dmultu ($r0,$d2) # h2*r0
-+ daddu $h1,$tmp0
-+ daddu $h2,$tmp1
-+ mflo ($tmp3,$r0,$d2)
-+ sltu $tmp0,$h1,$tmp0
-+ daddu $h2,$tmp0
-+
-+ daddu $h1,$tmp2
-+ sltu $tmp2,$h1,$tmp2
-+ daddu $h2,$tmp2
-+ daddu $h2,$tmp3
-+
-+ bne $inp,$len,.Loop # repeat until the end of the buffer
-+
-+ sd $h0,0($ctx) # store hash value
-+ sd $h1,8($ctx)
-+ sd $h2,16($ctx)
-+
-+ .set noreorder
-+#if defined(_MIPS_ARCH_MIPS64R6)
-+ ld $s7,56($sp)
-+ ld $s6,48($sp)
-+#endif
-+ ld $s5,40($sp) # epilogue
-+ ld $s4,32($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
-+ ld $s3,24($sp)
-+ ld $s2,16($sp)
-+ ld $s1,8($sp)
-+ ld $s0,0($sp)
-+___
-+$code.=<<___;
-+ jr $ra
-+#if defined(_MIPS_ARCH_MIPS64R6)
-+ daddu $sp,8*8 # pop the frame in the jr delay slot
-+#else
-+ daddu $sp,6*8 # pop the frame in the jr delay slot
-+#endif
-+.end poly1305_blocks_internal
-+___
-+}
-+{
-+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
-+
-+$code.=<<___;
-+.align 5
-+.globl poly1305_emit
-+.ent poly1305_emit
-+poly1305_emit: # args: ctx, mac, nonce - reduce the hash, add the nonce, store 16 bytes
-+ .frame $sp,0,$ra
-+ .set reorder
-+
-+ ld $tmp2,16($ctx) # load hash value (top limb first)
-+ ld $tmp0,0($ctx)
-+ ld $tmp1,8($ctx)
-+
-+ li $in0,-4 # final reduction
-+ dsrl $in1,$tmp2,2
-+ and $in0,$tmp2
-+ andi $tmp2,$tmp2,3
-+ daddu $in0,$in1
-+
-+ daddu $tmp0,$tmp0,$in0
-+ sltu $in1,$tmp0,$in0
-+ daddiu $in0,$tmp0,5 # compare to modulus
-+ daddu $tmp1,$tmp1,$in1
-+ sltiu $tmp3,$in0,5
-+ sltu $tmp4,$tmp1,$in1
-+ daddu $in1,$tmp1,$tmp3
-+ daddu $tmp2,$tmp2,$tmp4
-+ sltu $tmp3,$in1,$tmp3
-+ daddu $tmp2,$tmp2,$tmp3
-+
-+ dsrl $tmp2,2 # see if it carried/borrowed
-+ dsubu $tmp2,$zero,$tmp2 # turn the carry into a zero or all-ones mask
-+
-+ xor $in0,$tmp0 # branch-free select between h and h+5 using the mask
-+ xor $in1,$tmp1
-+ and $in0,$tmp2
-+ and $in1,$tmp2
-+ xor $in0,$tmp0
-+ xor $in1,$tmp1
-+
-+ lwu $tmp0,0($nonce) # load nonce
-+ lwu $tmp1,4($nonce)
-+ lwu $tmp2,8($nonce)
-+ lwu $tmp3,12($nonce)
-+ dsll $tmp1,32
-+ dsll $tmp3,32
-+ or $tmp0,$tmp1
-+ or $tmp2,$tmp3
-+
-+ daddu $in0,$tmp0 # accumulate nonce
-+ daddu $in1,$tmp2
-+ sltu $tmp0,$in0,$tmp0
-+ daddu $in1,$tmp0
-+
-+ dsrl $tmp0,$in0,8 # write mac value
-+ dsrl $tmp1,$in0,16
-+ dsrl $tmp2,$in0,24
-+ sb $in0,0($mac)
-+ dsrl $tmp3,$in0,32
-+ sb $tmp0,1($mac)
-+ dsrl $tmp0,$in0,40
-+ sb $tmp1,2($mac)
-+ dsrl $tmp1,$in0,48
-+ sb $tmp2,3($mac)
-+ dsrl $tmp2,$in0,56
-+ sb $tmp3,4($mac)
-+ dsrl $tmp3,$in1,8
-+ sb $tmp0,5($mac)
-+ dsrl $tmp0,$in1,16
-+ sb $tmp1,6($mac)
-+ dsrl $tmp1,$in1,24
-+ sb $tmp2,7($mac)
-+
-+ sb $in1,8($mac)
-+ dsrl $tmp2,$in1,32
-+ sb $tmp3,9($mac)
-+ dsrl $tmp3,$in1,40
-+ sb $tmp0,10($mac)
-+ dsrl $tmp0,$in1,48
-+ sb $tmp1,11($mac)
-+ dsrl $tmp1,$in1,56
-+ sb $tmp2,12($mac)
-+ sb $tmp3,13($mac)
-+ sb $tmp0,14($mac)
-+ sb $tmp1,15($mac)
-+
-+ jr $ra
-+.end poly1305_emit
-+.rdata
-+.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
-+.align 2
-+___
-+}
-+}}} else {{{
-+######################################################################
-+# 32-bit code path
-+#
-+
-+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
-+my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
-+ ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
-+
-+$code.=<<___;
-+#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
-+ defined(_MIPS_ARCH_MIPS32R6)) \\
-+ && !defined(_MIPS_ARCH_MIPS32R2)
-+# define _MIPS_ARCH_MIPS32R2
-+#endif
-+
-+#if defined(_MIPS_ARCH_MIPS32R6)
-+# define multu(rs,rt)
-+# define mflo(rd,rs,rt) mulu rd,rs,rt
-+# define mfhi(rd,rs,rt) muhu rd,rs,rt
-+#else
-+# define multu(rs,rt) multu rs,rt
-+# define mflo(rd,rs,rt) mflo rd
-+# define mfhi(rd,rs,rt) mfhi rd
-+#endif
-+
-+#ifdef __KERNEL__
-+# define poly1305_init poly1305_init_mips
-+# define poly1305_blocks poly1305_blocks_mips
-+# define poly1305_emit poly1305_emit_mips
-+#endif
-+
-+#if defined(__MIPSEB__) && !defined(MIPSEB)
-+# define MIPSEB
-+#endif
-+
-+#ifdef MIPSEB
-+# define MSB 0
-+# define LSB 3
-+#else
-+# define MSB 3
-+# define LSB 0
-+#endif
-+
-+.text
-+.set noat
-+.set noreorder
-+
-+.align 5
-+.globl poly1305_init
-+.ent poly1305_init
-+poly1305_init: # args: ctx, key - zero the hash value and store the masked key
-+ .frame $sp,0,$ra
-+ .set reorder
-+
-+ sw $zero,0($ctx) # clear hash value
-+ sw $zero,4($ctx)
-+ sw $zero,8($ctx)
-+ sw $zero,12($ctx)
-+ sw $zero,16($ctx)
-+
-+ beqz $inp,.Lno_key # null key pointer: only clear the hash
-+
-+#if defined(_MIPS_ARCH_MIPS32R6)
-+ andi $tmp0,$inp,3 # $inp % 4
-+ subu $inp,$inp,$tmp0 # align $inp
-+ sll $tmp0,$tmp0,3 # byte to bit offset
-+ lw $in0,0($inp)
-+ lw $in1,4($inp)
-+ lw $in2,8($inp)
-+ lw $in3,12($inp)
-+ beqz $tmp0,.Laligned_key
-+
-+ lw $tmp2,16($inp)
-+ subu $tmp1,$zero,$tmp0
-+# ifdef MIPSEB
-+ sllv $in0,$in0,$tmp0
-+ srlv $tmp3,$in1,$tmp1
-+ sllv $in1,$in1,$tmp0
-+ or $in0,$in0,$tmp3
-+ srlv $tmp3,$in2,$tmp1
-+ sllv $in2,$in2,$tmp0
-+ or $in1,$in1,$tmp3
-+ srlv $tmp3,$in3,$tmp1
-+ sllv $in3,$in3,$tmp0
-+ or $in2,$in2,$tmp3
-+ srlv $tmp2,$tmp2,$tmp1
-+ or $in3,$in3,$tmp2
-+# else
-+ srlv $in0,$in0,$tmp0
-+ sllv $tmp3,$in1,$tmp1
-+ srlv $in1,$in1,$tmp0
-+ or $in0,$in0,$tmp3
-+ sllv $tmp3,$in2,$tmp1
-+ srlv $in2,$in2,$tmp0
-+ or $in1,$in1,$tmp3
-+ sllv $tmp3,$in3,$tmp1
-+ srlv $in3,$in3,$tmp0
-+ or $in2,$in2,$tmp3
-+ sllv $tmp2,$tmp2,$tmp1
-+ or $in3,$in3,$tmp2
-+# endif
-+.Laligned_key:
-+#else
-+ lwl $in0,0+MSB($inp) # left/right load pairs read the 16 key bytes at any alignment
-+ lwl $in1,4+MSB($inp)
-+ lwl $in2,8+MSB($inp)
-+ lwl $in3,12+MSB($inp)
-+ lwr $in0,0+LSB($inp)
-+ lwr $in1,4+LSB($inp)
-+ lwr $in2,8+LSB($inp)
-+ lwr $in3,12+LSB($inp)
-+#endif
-+#ifdef MIPSEB
-+# if defined(_MIPS_ARCH_MIPS32R2)
-+ wsbh $in0,$in0 # byte swap
-+ wsbh $in1,$in1
-+ wsbh $in2,$in2
-+ wsbh $in3,$in3
-+ rotr $in0,$in0,16
-+ rotr $in1,$in1,16
-+ rotr $in2,$in2,16
-+ rotr $in3,$in3,16
-+# else
-+ srl $tmp0,$in0,24 # byte swap
-+ srl $tmp1,$in0,8
-+ andi $tmp2,$in0,0xFF00
-+ sll $in0,$in0,24
-+ andi $tmp1,0xFF00
-+ sll $tmp2,$tmp2,8
-+ or $in0,$tmp0
-+ srl $tmp0,$in1,24
-+ or $tmp1,$tmp2
-+ srl $tmp2,$in1,8
-+ or $in0,$tmp1
-+ andi $tmp1,$in1,0xFF00
-+ sll $in1,$in1,24
-+ andi $tmp2,0xFF00
-+ sll $tmp1,$tmp1,8
-+ or $in1,$tmp0
-+ srl $tmp0,$in2,24
-+ or $tmp2,$tmp1
-+ srl $tmp1,$in2,8
-+ or $in1,$tmp2
-+ andi $tmp2,$in2,0xFF00
-+ sll $in2,$in2,24
-+ andi $tmp1,0xFF00
-+ sll $tmp2,$tmp2,8
-+ or $in2,$tmp0
-+ srl $tmp0,$in3,24
-+ or $tmp1,$tmp2
-+ srl $tmp2,$in3,8
-+ or $in2,$tmp1
-+ andi $tmp1,$in3,0xFF00
-+ sll $in3,$in3,24
-+ andi $tmp2,0xFF00
-+ sll $tmp1,$tmp1,8
-+ or $in3,$tmp0
-+ or $tmp2,$tmp1
-+ or $in3,$tmp2
-+# endif
-+#endif
-+ lui $tmp0,0x0fff
-+ ori $tmp0,0xffff # 0x0fffffff
-+ and $in0,$in0,$tmp0 # mask (clamp) r0
-+ subu $tmp0,3 # 0x0ffffffc
-+ and $in1,$in1,$tmp0 # mask (clamp) r1..r3
-+ and $in2,$in2,$tmp0
-+ and $in3,$in3,$tmp0
-+
-+ sw $in0,20($ctx) # store r0..r3
-+ sw $in1,24($ctx)
-+ sw $in2,28($ctx)
-+ sw $in3,32($ctx)
-+
-+ srl $tmp1,$in1,2
-+ srl $tmp2,$in2,2
-+ srl $tmp3,$in3,2
-+ addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
-+ addu $in2,$in2,$tmp2
-+ addu $in3,$in3,$tmp3
-+ sw $in1,36($ctx) # store s1..s3
-+ sw $in2,40($ctx)
-+ sw $in3,44($ctx)
-+.Lno_key:
-+ li $v0,0
-+ jr $ra
-+.end poly1305_init
-+___
-+{
-+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
-+
-+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
-+ ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
-+my ($d0,$d1,$d2,$d3) =
-+ ($a4,$a5,$a6,$a7);
-+my $shr = $t2; # used on R6
-+my $one = $t2; # used on R2
-+
-+$code.=<<___;
-+.globl poly1305_blocks
-+.align 5
-+.ent poly1305_blocks
-+poly1305_blocks: # args: ctx, inp, len, padbit - hash every complete 16-byte block of the input
-+ .frame $sp,16*4,$ra # NOTE(review): declares 16 words, yet the frame allocated below is 4*12 bytes - confirm
-+ .mask $SAVED_REGS_MASK,-4
-+ .set noreorder
-+ subu $sp, $sp,4*12 # prologue: allocate the frame and save the registers used below
-+ sw $s11,4*11($sp)
-+ sw $s10,4*10($sp)
-+ sw $s9, 4*9($sp)
-+ sw $s8, 4*8($sp)
-+ sw $s7, 4*7($sp)
-+ sw $s6, 4*6($sp)
-+ sw $s5, 4*5($sp)
-+ sw $s4, 4*4($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
-+ sw $s3, 4*3($sp)
-+ sw $s2, 4*2($sp)
-+ sw $s1, 4*1($sp)
-+ sw $s0, 4*0($sp)
-+___
-+$code.=<<___;
-+ .set reorder
-+
-+ srl $len,4 # number of complete blocks
-+ li $one,1 # constant 1, consumed by the hi*1 steps of the R2 path
-+ beqz $len,.Labort # no complete block: restore registers and return
-+
-+#if defined(_MIPS_ARCH_MIPS32R6)
-+ andi $shr,$inp,3
-+ subu $inp,$inp,$shr # align $inp
-+ sll $shr,$shr,3 # byte to bit offset
-+#endif
-+
-+ lw $h0,0($ctx) # load hash value
-+ lw $h1,4($ctx)
-+ lw $h2,8($ctx)
-+ lw $h3,12($ctx)
-+ lw $h4,16($ctx)
-+
-+ lw $r0,20($ctx) # load key
-+ lw $r1,24($ctx)
-+ lw $r2,28($ctx)
-+ lw $r3,32($ctx)
-+ lw $rs1,36($ctx)
-+ lw $rs2,40($ctx)
-+ lw $rs3,44($ctx)
-+
-+ sll $len,4 # block count back to a byte length
-+ addu $len,$len,$inp # end of buffer
-+ b .Loop
-+
-+.align 4
-+.Loop:
-+#if defined(_MIPS_ARCH_MIPS32R6)
-+ lw $d0,0($inp) # load input
-+ lw $d1,4($inp)
-+ lw $d2,8($inp)
-+ lw $d3,12($inp)
-+ beqz $shr,.Laligned_inp
-+
-+ lw $t0,16($inp)
-+ subu $t1,$zero,$shr
-+# ifdef MIPSEB
-+ sllv $d0,$d0,$shr
-+ srlv $at,$d1,$t1
-+ sllv $d1,$d1,$shr
-+ or $d0,$d0,$at
-+ srlv $at,$d2,$t1
-+ sllv $d2,$d2,$shr
-+ or $d1,$d1,$at
-+ srlv $at,$d3,$t1
-+ sllv $d3,$d3,$shr
-+ or $d2,$d2,$at
-+ srlv $t0,$t0,$t1
-+ or $d3,$d3,$t0
-+# else
-+ srlv $d0,$d0,$shr
-+ sllv $at,$d1,$t1
-+ srlv $d1,$d1,$shr
-+ or $d0,$d0,$at
-+ sllv $at,$d2,$t1
-+ srlv $d2,$d2,$shr
-+ or $d1,$d1,$at
-+ sllv $at,$d3,$t1
-+ srlv $d3,$d3,$shr
-+ or $d2,$d2,$at
-+ sllv $t0,$t0,$t1
-+ or $d3,$d3,$t0
-+# endif
-+.Laligned_inp:
-+#else
-+ lwl $d0,0+MSB($inp) # load input
-+ lwl $d1,4+MSB($inp)
-+ lwl $d2,8+MSB($inp)
-+ lwl $d3,12+MSB($inp)
-+ lwr $d0,0+LSB($inp)
-+ lwr $d1,4+LSB($inp)
-+ lwr $d2,8+LSB($inp)
-+ lwr $d3,12+LSB($inp)
-+#endif
-+#ifdef MIPSEB
-+# if defined(_MIPS_ARCH_MIPS32R2)
-+ wsbh $d0,$d0 # byte swap
-+ wsbh $d1,$d1
-+ wsbh $d2,$d2
-+ wsbh $d3,$d3
-+ rotr $d0,$d0,16
-+ rotr $d1,$d1,16
-+ rotr $d2,$d2,16
-+ rotr $d3,$d3,16
-+# else
-+ srl $at,$d0,24 # byte swap
-+ srl $t0,$d0,8
-+ andi $t1,$d0,0xFF00
-+ sll $d0,$d0,24
-+ andi $t0,0xFF00
-+ sll $t1,$t1,8
-+ or $d0,$at
-+ srl $at,$d1,24
-+ or $t0,$t1
-+ srl $t1,$d1,8
-+ or $d0,$t0
-+ andi $t0,$d1,0xFF00
-+ sll $d1,$d1,24
-+ andi $t1,0xFF00
-+ sll $t0,$t0,8
-+ or $d1,$at
-+ srl $at,$d2,24
-+ or $t1,$t0
-+ srl $t0,$d2,8
-+ or $d1,$t1
-+ andi $t1,$d2,0xFF00
-+ sll $d2,$d2,24
-+ andi $t0,0xFF00
-+ sll $t1,$t1,8
-+ or $d2,$at
-+ srl $at,$d3,24
-+ or $t0,$t1
-+ srl $t1,$d3,8
-+ or $d2,$t0
-+ andi $t0,$d3,0xFF00
-+ sll $d3,$d3,24
-+ andi $t1,0xFF00
-+ sll $t0,$t0,8
-+ or $d3,$at
-+ or $t1,$t0
-+ or $d3,$t1
-+# endif
-+#endif
-+ srl $t0,$h4,2 # modulo-scheduled reduction
-+ andi $h4,$h4,3
-+ sll $at,$t0,2
-+
-+ addu $d0,$d0,$h0 # accumulate input
-+ addu $t0,$t0,$at
-+ sltu $h0,$d0,$h0
-+ addu $d0,$d0,$t0 # ... and residue
-+ sltu $at,$d0,$t0
-+
-+ addu $d1,$d1,$h1
-+ addu $h0,$h0,$at # carry
-+ sltu $h1,$d1,$h1
-+ addu $d1,$d1,$h0
-+ sltu $h0,$d1,$h0
-+
-+ addu $d2,$d2,$h2
-+ addu $h1,$h1,$h0 # carry
-+ sltu $h2,$d2,$h2
-+ addu $d2,$d2,$h1
-+ sltu $h1,$d2,$h1
-+
-+ addu $d3,$d3,$h3
-+ addu $h2,$h2,$h1 # carry
-+ sltu $h3,$d3,$h3
-+ addu $d3,$d3,$h2
-+
-+#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
-+ multu $r0,$d0 # d0*r0
-+ sltu $h2,$d3,$h2
-+ maddu $rs3,$d1 # d1*s3
-+ addu $h3,$h3,$h2 # carry
-+ maddu $rs2,$d2 # d2*s2
-+ addu $h4,$h4,$padbit
-+ maddu $rs1,$d3 # d3*s1
-+ addu $h4,$h4,$h3
-+ mfhi $at
-+ mflo $h0
-+
-+ multu $r1,$d0 # d0*r1
-+ maddu $r0,$d1 # d1*r0
-+ maddu $rs3,$d2 # d2*s3
-+ maddu $rs2,$d3 # d3*s2
-+ maddu $rs1,$h4 # h4*s1
-+ maddu $at,$one # hi*1
-+ mfhi $at
-+ mflo $h1
-+
-+ multu $r2,$d0 # d0*r2
-+ maddu $r1,$d1 # d1*r1
-+ maddu $r0,$d2 # d2*r0
-+ maddu $rs3,$d3 # d3*s3
-+ maddu $rs2,$h4 # h4*s2
-+ maddu $at,$one # hi*1
-+ mfhi $at
-+ mflo $h2
-+
-+ mul $t0,$r0,$h4 # h4*r0
-+
-+ multu $r3,$d0 # d0*r3
-+ maddu $r2,$d1 # d1*r2
-+ maddu $r1,$d2 # d2*r1
-+ maddu $r0,$d3 # d3*r0
-+ maddu $rs3,$h4 # h4*s3
-+ maddu $at,$one # hi*1
-+ mfhi $at
-+ mflo $h3
-+
-+ addiu $inp,$inp,16
-+
-+ addu $h4,$t0,$at
-+#else
-+ multu ($r0,$d0) # d0*r0
-+ mflo ($h0,$r0,$d0)
-+ mfhi ($h1,$r0,$d0)
-+
-+ sltu $h2,$d3,$h2
-+ addu $h3,$h3,$h2 # carry
-+
-+ multu ($rs3,$d1) # d1*s3
-+ mflo ($at,$rs3,$d1)
-+ mfhi ($t0,$rs3,$d1)
-+
-+ addu $h4,$h4,$padbit
-+ addiu $inp,$inp,16
-+ addu $h4,$h4,$h3
-+
-+ multu ($rs2,$d2) # d2*s2
-+ mflo ($a3,$rs2,$d2) # the padbit register is reused as scratch from here on
-+ mfhi ($t1,$rs2,$d2)
-+ addu $h0,$h0,$at
-+ addu $h1,$h1,$t0
-+ multu ($rs1,$d3) # d3*s1
-+ sltu $at,$h0,$at
-+ addu $h1,$h1,$at
-+
-+ mflo ($at,$rs1,$d3)
-+ mfhi ($t0,$rs1,$d3)
-+ addu $h0,$h0,$a3
-+ addu $h1,$h1,$t1
-+ multu ($r1,$d0) # d0*r1
-+ sltu $a3,$h0,$a3
-+ addu $h1,$h1,$a3
-+
-+
-+ mflo ($a3,$r1,$d0)
-+ mfhi ($h2,$r1,$d0)
-+ addu $h0,$h0,$at
-+ addu $h1,$h1,$t0
-+ multu ($r0,$d1) # d1*r0
-+ sltu $at,$h0,$at
-+ addu $h1,$h1,$at
-+
-+ mflo ($at,$r0,$d1)
-+ mfhi ($t0,$r0,$d1)
-+ addu $h1,$h1,$a3
-+ sltu $a3,$h1,$a3
-+ multu ($rs3,$d2) # d2*s3
-+ addu $h2,$h2,$a3
-+
-+ mflo ($a3,$rs3,$d2)
-+ mfhi ($t1,$rs3,$d2)
-+ addu $h1,$h1,$at
-+ addu $h2,$h2,$t0
-+ multu ($rs2,$d3) # d3*s2
-+ sltu $at,$h1,$at
-+ addu $h2,$h2,$at
-+
-+ mflo ($at,$rs2,$d3)
-+ mfhi ($t0,$rs2,$d3)
-+ addu $h1,$h1,$a3
-+ addu $h2,$h2,$t1
-+ multu ($rs1,$h4) # h4*s1
-+ sltu $a3,$h1,$a3
-+ addu $h2,$h2,$a3
-+
-+ mflo ($a3,$rs1,$h4)
-+ addu $h1,$h1,$at
-+ addu $h2,$h2,$t0
-+ multu ($r2,$d0) # d0*r2
-+ sltu $at,$h1,$at
-+ addu $h2,$h2,$at
-+
-+
-+ mflo ($at,$r2,$d0)
-+ mfhi ($h3,$r2,$d0)
-+ addu $h1,$h1,$a3
-+ sltu $a3,$h1,$a3
-+ multu ($r1,$d1) # d1*r1
-+ addu $h2,$h2,$a3
-+
-+ mflo ($a3,$r1,$d1)
-+ mfhi ($t1,$r1,$d1)
-+ addu $h2,$h2,$at
-+ sltu $at,$h2,$at
-+ multu ($r0,$d2) # d2*r0
-+ addu $h3,$h3,$at
-+
-+ mflo ($at,$r0,$d2)
-+ mfhi ($t0,$r0,$d2)
-+ addu $h2,$h2,$a3
-+ addu $h3,$h3,$t1
-+ multu ($rs3,$d3) # d3*s3
-+ sltu $a3,$h2,$a3
-+ addu $h3,$h3,$a3
-+
-+ mflo ($a3,$rs3,$d3)
-+ mfhi ($t1,$rs3,$d3)
-+ addu $h2,$h2,$at
-+ addu $h3,$h3,$t0
-+ multu ($rs2,$h4) # h4*s2
-+ sltu $at,$h2,$at
-+ addu $h3,$h3,$at
-+
-+ mflo ($at,$rs2,$h4)
-+ addu $h2,$h2,$a3
-+ addu $h3,$h3,$t1
-+ multu ($r3,$d0) # d0*r3
-+ sltu $a3,$h2,$a3
-+ addu $h3,$h3,$a3
-+
-+
-+ mflo ($a3,$r3,$d0)
-+ mfhi ($t1,$r3,$d0)
-+ addu $h2,$h2,$at
-+ sltu $at,$h2,$at
-+ multu ($r2,$d1) # d1*r2
-+ addu $h3,$h3,$at
-+
-+ mflo ($at,$r2,$d1)
-+ mfhi ($t0,$r2,$d1)
-+ addu $h3,$h3,$a3
-+ sltu $a3,$h3,$a3
-+ multu ($r0,$d3) # d3*r0
-+ addu $t1,$t1,$a3
-+
-+ mflo ($a3,$r0,$d3)
-+ mfhi ($d3,$r0,$d3)
-+ addu $h3,$h3,$at
-+ addu $t1,$t1,$t0
-+ multu ($r1,$d2) # d2*r1
-+ sltu $at,$h3,$at
-+ addu $t1,$t1,$at
-+
-+ mflo ($at,$r1,$d2)
-+ mfhi ($t0,$r1,$d2)
-+ addu $h3,$h3,$a3
-+ addu $t1,$t1,$d3
-+ multu ($rs3,$h4) # h4*s3
-+ sltu $a3,$h3,$a3
-+ addu $t1,$t1,$a3
-+
-+ mflo ($a3,$rs3,$h4)
-+ addu $h3,$h3,$at
-+ addu $t1,$t1,$t0
-+ multu ($r0,$h4) # h4*r0
-+ sltu $at,$h3,$at
-+ addu $t1,$t1,$at
-+
-+
-+ mflo ($h4,$r0,$h4)
-+ addu $h3,$h3,$a3
-+ sltu $a3,$h3,$a3
-+ addu $t1,$t1,$a3
-+ addu $h4,$h4,$t1
-+
-+ li $padbit,1 # if we loop, padbit is 1
-+#endif
-+ bne $inp,$len,.Loop # repeat until the end of the buffer
-+
-+ sw $h0,0($ctx) # store hash value
-+ sw $h1,4($ctx)
-+ sw $h2,8($ctx)
-+ sw $h3,12($ctx)
-+ sw $h4,16($ctx)
-+
-+ .set noreorder
-+.Labort:
-+ lw $s11,4*11($sp) # epilogue: restore saved registers
-+ lw $s10,4*10($sp)
-+ lw $s9, 4*9($sp)
-+ lw $s8, 4*8($sp)
-+ lw $s7, 4*7($sp)
-+ lw $s6, 4*6($sp)
-+ lw $s5, 4*5($sp)
-+ lw $s4, 4*4($sp)
-+___
-+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
-+ lw $s3, 4*3($sp)
-+ lw $s2, 4*2($sp)
-+ lw $s1, 4*1($sp)
-+ lw $s0, 4*0($sp)
-+___
-+$code.=<<___;
-+ jr $ra
-+ addu $sp,$sp,4*12 # pop the frame in the jr delay slot
-+.end poly1305_blocks
-+___
-+}
-+{
-+my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); # ctx, mac and nonce alias a0-a2; tmp4 reuses a3 as a fifth scratch word
-+
-+$code.=<<___;
-+.align 5
-+.globl poly1305_emit
-+.ent poly1305_emit
-+poly1305_emit:
-+ .frame $sp,0,$ra
-+ .set reorder
-+
-+ lw $tmp4,16($ctx) # h4: fifth hash word, stored at offset 16 by poly1305_blocks
-+ lw $tmp0,0($ctx) # h0..h3: the four low hash words
-+ lw $tmp1,4($ctx)
-+ lw $tmp2,8($ctx)
-+ lw $tmp3,12($ctx)
-+
-+ li $in0,-4 # final reduction
-+ srl $ctx,$tmp4,2 # all loads from ctx are done; reuse the register for h4 >> 2
-+ and $in0,$in0,$tmp4 # h4 & ~3, i.e. 4 * (h4 >> 2)
-+ andi $tmp4,$tmp4,3 # keep only the low two bits in h4
-+ addu $ctx,$ctx,$in0 # 5 * (h4 >> 2): the part at or above bit 130, folded into h0 next
-+
-+ addu $tmp0,$tmp0,$ctx # h0 += folded part
-+ sltu $ctx,$tmp0,$ctx # carry out of h0
-+ addiu $in0,$tmp0,5 # compare to modulus
-+ addu $tmp1,$tmp1,$ctx # h1 += carry
-+ sltiu $in1,$in0,5 # carry out of h0 + 5
-+ sltu $ctx,$tmp1,$ctx # carry out of h1
-+ addu $in1,$in1,$tmp1 # g1 = h1 + carry, where g = h + 5
-+ addu $tmp2,$tmp2,$ctx # h2 += carry
-+ sltu $in2,$in1,$tmp1 # carry out of g1
-+ sltu $ctx,$tmp2,$ctx # carry out of h2
-+ addu $in2,$in2,$tmp2 # g2 = h2 + carry
-+ addu $tmp3,$tmp3,$ctx # h3 += carry
-+ sltu $in3,$in2,$tmp2 # carry out of g2
-+ sltu $ctx,$tmp3,$ctx # carry out of h3
-+ addu $in3,$in3,$tmp3 # g3 = h3 + carry
-+ addu $tmp4,$tmp4,$ctx # h4 += carry
-+ sltu $ctx,$in3,$tmp3 # carry out of g3
-+ addu $ctx,$tmp4 # g4 = h4 + carry, at most 5
-+
-+ srl $ctx,2 # see if it carried/borrowed
-+ subu $ctx,$zero,$ctx # 0 or 1 becomes 0 or all ones: select mask for the words below
-+
-+ xor $in0,$tmp0 # branch-free select, step 1: g ^ h
-+ xor $in1,$tmp1
-+ xor $in2,$tmp2
-+ xor $in3,$tmp3
-+ and $in0,$ctx # step 2: keep the difference only when the mask is all ones
-+ and $in1,$ctx
-+ and $in2,$ctx
-+ and $in3,$ctx
-+ xor $in0,$tmp0 # step 3: result is g when the mask is set, h otherwise
-+ xor $in1,$tmp1
-+ xor $in2,$tmp2
-+ xor $in3,$tmp3
-+
-+ lw $tmp0,0($nonce) # load nonce
-+ lw $tmp1,4($nonce)
-+ lw $tmp2,8($nonce)
-+ lw $tmp3,12($nonce)
-+
-+ addu $in0,$tmp0 # accumulate nonce
-+ sltu $ctx,$in0,$tmp0 # carry out of word 0
-+
-+ addu $in1,$tmp1 # word 1 += nonce word 1
-+ sltu $tmp1,$in1,$tmp1 # carry from adding the nonce word
-+ addu $in1,$ctx # add the carry from word 0
-+ sltu $ctx,$in1,$ctx # carry from adding that incoming carry
-+ addu $ctx,$tmp1 # combined carry into word 2
-+
-+ addu $in2,$tmp2 # word 2: same two-step add with carry
-+ sltu $tmp2,$in2,$tmp2
-+ addu $in2,$ctx
-+ sltu $ctx,$in2,$ctx
-+ addu $ctx,$tmp2 # combined carry into word 3
-+
-+ addu $in3,$tmp3 # word 3 += nonce word 3
-+ addu $in3,$ctx # no carry is computed past word 3: the sum is taken modulo 2^128
-+
-+ srl $tmp0,$in0,8 # write mac value
-+ srl $tmp1,$in0,16
-+ srl $tmp2,$in0,24
-+ sb $in0, 0($mac) # byte stores put each word out least significant byte first
-+ sb $tmp0,1($mac)
-+ srl $tmp0,$in1,8
-+ sb $tmp1,2($mac)
-+ srl $tmp1,$in1,16
-+ sb $tmp2,3($mac)
-+ srl $tmp2,$in1,24
-+ sb $in1, 4($mac)
-+ sb $tmp0,5($mac)
-+ srl $tmp0,$in2,8
-+ sb $tmp1,6($mac)
-+ srl $tmp1,$in2,16
-+ sb $tmp2,7($mac)
-+ srl $tmp2,$in2,24
-+ sb $in2, 8($mac)
-+ sb $tmp0,9($mac)
-+ srl $tmp0,$in3,8
-+ sb $tmp1,10($mac)
-+ srl $tmp1,$in3,16
-+ sb $tmp2,11($mac)
-+ srl $tmp2,$in3,24
-+ sb $in3, 12($mac)
-+ sb $tmp0,13($mac)
-+ sb $tmp1,14($mac)
-+ sb $tmp2,15($mac)
-+
-+ jr $ra # return to caller
-+.end poly1305_emit
-+.rdata
-+.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
-+.align 2
-+___
-+}
-+}}}
-+
-+$output=pop and (open STDOUT,">$output" or die "cannot open $output: $!"); # last argument, if any, names the output file; with no argument stdout is left as is
-+print $code; # emit the accumulated assembly text
-+close STDOUT or die "error closing STDOUT: $!"; # buffered write errors surface at close; fail instead of leaving a truncated file
---- a/crypto/Kconfig
-+++ b/crypto/Kconfig
-@@ -707,6 +707,11 @@ config CRYPTO_POLY1305_X86_64
- in IETF protocols. This is the x86_64 assembler implementation using SIMD
- instructions.
-
-+config CRYPTO_POLY1305_MIPS
-+ tristate "Poly1305 authenticator algorithm (MIPS optimized)"
-+ depends on CPU_MIPS32 || (CPU_MIPS64 && 64BIT)
-+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
-+
- config CRYPTO_MD4
- tristate "MD4 digest algorithm"
- select CRYPTO_HASH
---- a/lib/crypto/Kconfig
-+++ b/lib/crypto/Kconfig
-@@ -39,6 +39,7 @@ config CRYPTO_LIB_DES
-
- config CRYPTO_LIB_POLY1305_RSIZE
- int
-+ default 2 if MIPS
- default 4 if X86_64
- default 9 if ARM || ARM64
- default 1