1 files changed, 1563 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch b/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
new file mode 100644
index 0000000000..68cac9cc57
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
@@ -0,0 +1,1563 @@
+From a338793df36990e97ab0b824fad6fbf6ef171f94 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:26 +0100
+Subject: [PATCH 020/124] crypto: mips/poly1305 - incorporate
+ OpenSSL/CRYPTOGAMS optimized implementation
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit a11d055e7a64ac34a5e99b6fe731299449cbcd58 upstream.
+
+This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation for
+MIPS authored by Andy Polyakov, a prior 64-bit only version of which has been
+contributed by him to the OpenSSL project. The file 'poly1305-mips.pl' is taken
+straight from this upstream GitHub repository [0] at commit
+d22ade312a7af958ec955620b0d241cf42c37feb, and already contains all the changes
+required to build it as part of a Linux kernel module.
+
+[0] https://github.com/dot-asm/cryptogams
+
+Co-developed-by: Andy Polyakov <appro@cryptogams.org>
+Signed-off-by: Andy Polyakov <appro@cryptogams.org>
+Co-developed-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/mips/crypto/Makefile         |   14 +
+ arch/mips/crypto/poly1305-glue.c  |  203 +++++
+ arch/mips/crypto/poly1305-mips.pl | 1273 +++++++++++++++++++++++++++++
+ crypto/Kconfig                    |    5 +
+ lib/crypto/Kconfig                |    1 +
+ 5 files changed, 1496 insertions(+)
+ create mode 100644 arch/mips/crypto/poly1305-glue.c
+ create mode 100644 arch/mips/crypto/poly1305-mips.pl
+
+--- a/arch/mips/crypto/Makefile
++++ b/arch/mips/crypto/Makefile
+@@ -8,3 +8,17 @@ obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32
+ obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
+ chacha-mips-y := chacha-core.o chacha-glue.o
+ AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
++
++obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
++poly1305-mips-y := poly1305-core.o poly1305-glue.o
++
++perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
++perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
++
++quiet_cmd_perlasm = PERLASM $@
++      cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
++
++$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
++	$(call if_changed,perlasm)
++
++targets += poly1305-core.S
+--- /dev/null
++++ b/arch/mips/crypto/poly1305-glue.c
+@@ -0,0 +1,203 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
++ *
++ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
++ */
++
++#include <asm/unaligned.h>
++#include <crypto/algapi.h>
++#include <crypto/internal/hash.h>
++#include <crypto/internal/poly1305.h>
++#include <linux/cpufeature.h>
++#include <linux/crypto.h>
++#include <linux/module.h>
++
++asmlinkage void poly1305_init_mips(void *state, const u8 *key);
++asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
++asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce);
++
++void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
++{
++	poly1305_init_mips(&dctx->h, key);
++	dctx->s[0] = get_unaligned_le32(key + 16);
++	dctx->s[1] = get_unaligned_le32(key + 20);
++	dctx->s[2] = get_unaligned_le32(key + 24);
++	dctx->s[3] = get_unaligned_le32(key + 28);
++	dctx->buflen = 0;
++}
++EXPORT_SYMBOL(poly1305_init_arch);
++
++static int mips_poly1305_init(struct shash_desc *desc)
++{
++	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++	dctx->buflen = 0;
++	dctx->rset = 0;
++	dctx->sset = false;
++
++	return 0;
++}
++
++static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
++				 u32 len, u32 hibit)
++{
++	if (unlikely(!dctx->sset)) {
++		if (!dctx->rset) {
++			poly1305_init_mips(&dctx->h, src);
++			src += POLY1305_BLOCK_SIZE;
++			len -= POLY1305_BLOCK_SIZE;
++			dctx->rset = 1;
++		}
++		if (len >= POLY1305_BLOCK_SIZE) {
++			dctx->s[0] = get_unaligned_le32(src +  0);
++			dctx->s[1] = get_unaligned_le32(src +  4);
++			dctx->s[2] = get_unaligned_le32(src +  8);
++			dctx->s[3] = get_unaligned_le32(src + 12);
++			src += POLY1305_BLOCK_SIZE;
++			len -= POLY1305_BLOCK_SIZE;
++			dctx->sset = true;
++		}
++		if (len < POLY1305_BLOCK_SIZE)
++			return;
++	}
++
++	len &= ~(POLY1305_BLOCK_SIZE - 1);
++
++	poly1305_blocks_mips(&dctx->h, src, len, hibit);
++}
++
++static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
++				unsigned int len)
++{
++	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++	if (unlikely(dctx->buflen)) {
++		u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++		memcpy(dctx->buf + dctx->buflen, src, bytes);
++		src += bytes;
++		len -= bytes;
++		dctx->buflen += bytes;
++
++		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
++			mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
++			dctx->buflen = 0;
++		}
++	}
++
++	if (likely(len >= POLY1305_BLOCK_SIZE)) {
++		mips_poly1305_blocks(dctx, src, len, 1);
++		src += round_down(len, POLY1305_BLOCK_SIZE);
++		len %= POLY1305_BLOCK_SIZE;
++	}
++
++	if (unlikely(len)) {
++		dctx->buflen = len;
++		memcpy(dctx->buf, src, len);
++	}
++	return 0;
++}
++
++void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
++			  unsigned int nbytes)
++{
++	if (unlikely(dctx->buflen)) {
++		u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++		memcpy(dctx->buf + dctx->buflen, src, bytes);
++		src += bytes;
++		nbytes -= bytes;
++		dctx->buflen += bytes;
++
++		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
++			poly1305_blocks_mips(&dctx->h, dctx->buf,
++					     POLY1305_BLOCK_SIZE, 1);
++			dctx->buflen = 0;
++		}
++	}
++
++	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
++		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
++
++		poly1305_blocks_mips(&dctx->h, src, len, 1);
++		src += len;
++		nbytes %= POLY1305_BLOCK_SIZE;
++	}
++
++	if (unlikely(nbytes)) {
++		dctx->buflen = nbytes;
++		memcpy(dctx->buf, src, nbytes);
++	}
++}
++EXPORT_SYMBOL(poly1305_update_arch);
++
++void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
++{
++	__le32 digest[4];
++	u64 f = 0;
++
++	if (unlikely(dctx->buflen)) {
++		dctx->buf[dctx->buflen++] = 1;
++		memset(dctx->buf + dctx->buflen, 0,
++		       POLY1305_BLOCK_SIZE - dctx->buflen);
++		poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
++	}
++
++	poly1305_emit_mips(&dctx->h, digest, dctx->s);
++
++	/* mac = (h + s) % (2^128) */
++	f = (f >> 32) + le32_to_cpu(digest[0]);
++	put_unaligned_le32(f, dst);
++	f = (f >> 32) + le32_to_cpu(digest[1]);
++	put_unaligned_le32(f, dst + 4);
++	f = (f >> 32) + le32_to_cpu(digest[2]);
++	put_unaligned_le32(f, dst + 8);
++	f = (f >> 32) + le32_to_cpu(digest[3]);
++	put_unaligned_le32(f, dst + 12);
++
++	*dctx = (struct poly1305_desc_ctx){};
++}
++EXPORT_SYMBOL(poly1305_final_arch);
++
++static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
++{
++	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++	if (unlikely(!dctx->sset))
++		return -ENOKEY;
++
++	poly1305_final_arch(dctx, dst);
++	return 0;
++}
++
++static struct shash_alg mips_poly1305_alg = {
++	.init			= mips_poly1305_init,
++	.update			= mips_poly1305_update,
++	.final			= mips_poly1305_final,
++	.digestsize		= POLY1305_DIGEST_SIZE,
++	.descsize		= sizeof(struct poly1305_desc_ctx),
++
++	.base.cra_name		= "poly1305",
++	.base.cra_driver_name	= "poly1305-mips",
++	.base.cra_priority	= 200,
++	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
++	.base.cra_module	= THIS_MODULE,
++};
++
++static int __init mips_poly1305_mod_init(void)
++{
++	return crypto_register_shash(&mips_poly1305_alg);
++}
++
++static void __exit mips_poly1305_mod_exit(void)
++{
++	crypto_unregister_shash(&mips_poly1305_alg);
++}
++
++module_init(mips_poly1305_mod_init);
++module_exit(mips_poly1305_mod_exit);
++
++MODULE_LICENSE("GPL v2");
++MODULE_ALIAS_CRYPTO("poly1305");
++MODULE_ALIAS_CRYPTO("poly1305-mips");
+--- /dev/null
++++ b/arch/mips/crypto/poly1305-mips.pl
+@@ -0,0 +1,1273 @@
++#!/usr/bin/env perl
++# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
++#
++# ====================================================================
++# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
++# project.
++# ====================================================================
++
++# Poly1305 hash for MIPS.
++#
++# May 2016
++#
++# Numbers are cycles per processed byte with poly1305_blocks alone.
++#
++#		IALU/gcc
++# R1x000	~5.5/+130%	(big-endian)
++# Octeon II	2.50/+70%	(little-endian)
++#
++# March 2019
++#
++# Add 32-bit code path.
++#
++# October 2019
++#
++# Modulo-scheduling reduction allows to omit dependency chain at the
++# end of inner loop and improve performance. Also optimize MIPS32R2
++# code path for MIPS 1004K core. Per René von Dorst's suggestions.
++#
++#		IALU/gcc
++# R1x000	~9.8/?		(big-endian)
++# Octeon II	3.65/+140%	(little-endian)
++# MT7621/1004K	4.75/?		(little-endian)
++#
++######################################################################
++# There is a number of MIPS ABI in use, O32 and N32/64 are most
++# widely used. Then there is a new contender: NUBI. It appears that if
++# one picks the latter, it's possible to arrange code in ABI neutral
++# manner. Therefore let's stick to NUBI register layout:
++#
++($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
++($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
++($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
++#
++# The return value is placed in $a0. Following coding rules facilitate
++# interoperability:
++#
++# - never ever touch $tp, "thread pointer", former $gp [o32 can be
++#   excluded from the rule, because it's specified volatile];
++# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
++#   old code];
++# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
++#
++# For reference here is register layout for N32/64 MIPS ABIs:
++#
++# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
++# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
++# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
++# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
++#
++# <appro@openssl.org>
++#
++######################################################################
++
++$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
++
++$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
++
++if ($flavour =~ /64|n32/i) {{{
++######################################################################
++# 64-bit code path
++#
++
++my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
++my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
++
++$code.=<<___;
++#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
++     defined(_MIPS_ARCH_MIPS64R6)) \\
++     && !defined(_MIPS_ARCH_MIPS64R2)
++# define _MIPS_ARCH_MIPS64R2
++#endif
++
++#if defined(_MIPS_ARCH_MIPS64R6)
++# define dmultu(rs,rt)
++# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
++# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
++#else
++# define dmultu(rs,rt)		dmultu	rs,rt
++# define mflo(rd,rs,rt)	mflo	rd
++# define mfhi(rd,rs,rt)	mfhi	rd
++#endif
++
++#ifdef	__KERNEL__
++# define poly1305_init   poly1305_init_mips
++# define poly1305_blocks poly1305_blocks_mips
++# define poly1305_emit   poly1305_emit_mips
++#endif
++
++#if defined(__MIPSEB__) && !defined(MIPSEB)
++# define MIPSEB
++#endif
++
++#ifdef MIPSEB
++# define MSB 0
++# define LSB 7
++#else
++# define MSB 7
++# define LSB 0
++#endif
++
++.text
++.set	noat
++.set	noreorder
++
++.align	5
++.globl	poly1305_init
++.ent	poly1305_init
++poly1305_init:
++	.frame	$sp,0,$ra
++	.set	reorder
++
++	sd	$zero,0($ctx)
++	sd	$zero,8($ctx)
++	sd	$zero,16($ctx)
++
++	beqz	$inp,.Lno_key
++
++#if defined(_MIPS_ARCH_MIPS64R6)
++	andi	$tmp0,$inp,7		# $inp % 8
++	dsubu	$inp,$inp,$tmp0		# align $inp
++	sll	$tmp0,$tmp0,3		# byte to bit offset
++	ld	$in0,0($inp)
++	ld	$in1,8($inp)
++	beqz	$tmp0,.Laligned_key
++	ld	$tmp2,16($inp)
++
++	subu	$tmp1,$zero,$tmp0
++# ifdef	MIPSEB
++	dsllv	$in0,$in0,$tmp0
++	dsrlv	$tmp3,$in1,$tmp1
++	dsllv	$in1,$in1,$tmp0
++	dsrlv	$tmp2,$tmp2,$tmp1
++# else
++	dsrlv	$in0,$in0,$tmp0
++	dsllv	$tmp3,$in1,$tmp1
++	dsrlv	$in1,$in1,$tmp0
++	dsllv	$tmp2,$tmp2,$tmp1
++# endif
++	or	$in0,$in0,$tmp3
++	or	$in1,$in1,$tmp2
++.Laligned_key:
++#else
++	ldl	$in0,0+MSB($inp)
++	ldl	$in1,8+MSB($inp)
++	ldr	$in0,0+LSB($inp)
++	ldr	$in1,8+LSB($inp)
++#endif
++#ifdef	MIPSEB
++# if defined(_MIPS_ARCH_MIPS64R2)
++	dsbh	$in0,$in0		# byte swap
++	 dsbh	$in1,$in1
++	dshd	$in0,$in0
++	 dshd	$in1,$in1
++# else
++	ori	$tmp0,$zero,0xFF
++	dsll	$tmp2,$tmp0,32
++	or	$tmp0,$tmp2		# 0x000000FF000000FF
++
++	and	$tmp1,$in0,$tmp0	# byte swap
++	 and	$tmp3,$in1,$tmp0
++	dsrl	$tmp2,$in0,24
++	 dsrl	$tmp4,$in1,24
++	dsll	$tmp1,24
++	 dsll	$tmp3,24
++	and	$tmp2,$tmp0
++	 and	$tmp4,$tmp0
++	dsll	$tmp0,8			# 0x0000FF000000FF00
++	or	$tmp1,$tmp2
++	 or	$tmp3,$tmp4
++	and	$tmp2,$in0,$tmp0
++	 and	$tmp4,$in1,$tmp0
++	dsrl	$in0,8
++	 dsrl	$in1,8
++	dsll	$tmp2,8
++	 dsll	$tmp4,8
++	and	$in0,$tmp0
++	 and	$in1,$tmp0
++	or	$tmp1,$tmp2
++	 or	$tmp3,$tmp4
++	or	$in0,$tmp1
++	 or	$in1,$tmp3
++	dsrl	$tmp1,$in0,32
++	 dsrl	$tmp3,$in1,32
++	dsll	$in0,32
++	 dsll	$in1,32
++	or	$in0,$tmp1
++	 or	$in1,$tmp3
++# endif
++#endif
++	li	$tmp0,1
++	dsll	$tmp0,32		# 0x0000000100000000
++	daddiu	$tmp0,-63		# 0x00000000ffffffc1
++	dsll	$tmp0,28		# 0x0ffffffc10000000
++	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff
++
++	and	$in0,$tmp0
++	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
++	and	$in1,$tmp0
++
++	sd	$in0,24($ctx)
++	dsrl	$tmp0,$in1,2
++	sd	$in1,32($ctx)
++	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
++	sd	$tmp0,40($ctx)
++
++.Lno_key:
++	li	$v0,0			# return 0
++	jr	$ra
++.end	poly1305_init
++___
++{
++my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
++
++my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
++   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
++my ($shr,$shl) = ($s6,$s7);		# used on R6
++
++$code.=<<___;
++.align	5
++.globl	poly1305_blocks
++.ent	poly1305_blocks
++poly1305_blocks:
++	.set	noreorder
++	dsrl	$len,4			# number of complete blocks
++	bnez	$len,poly1305_blocks_internal
++	nop
++	jr	$ra
++	nop
++.end	poly1305_blocks
++
++.align	5
++.ent	poly1305_blocks_internal
++poly1305_blocks_internal:
++	.set	noreorder
++#if defined(_MIPS_ARCH_MIPS64R6)
++	.frame	$sp,8*8,$ra
++	.mask	$SAVED_REGS_MASK|0x000c0000,-8
++	dsubu	$sp,8*8
++	sd	$s7,56($sp)
++	sd	$s6,48($sp)
++#else
++	.frame	$sp,6*8,$ra
++	.mask	$SAVED_REGS_MASK,-8
++	dsubu	$sp,6*8
++#endif
++	sd	$s5,40($sp)
++	sd	$s4,32($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
++	sd	$s3,24($sp)
++	sd	$s2,16($sp)
++	sd	$s1,8($sp)
++	sd	$s0,0($sp)
++___
++$code.=<<___;
++	.set	reorder
++
++#if defined(_MIPS_ARCH_MIPS64R6)
++	andi	$shr,$inp,7
++	dsubu	$inp,$inp,$shr		# align $inp
++	sll	$shr,$shr,3		# byte to bit offset
++	subu	$shl,$zero,$shr
++#endif
++
++	ld	$h0,0($ctx)		# load hash value
++	ld	$h1,8($ctx)
++	ld	$h2,16($ctx)
++
++	ld	$r0,24($ctx)		# load key
++	ld	$r1,32($ctx)
++	ld	$rs1,40($ctx)
++
++	dsll	$len,4
++	daddu	$len,$inp		# end of buffer
++	b	.Loop
++
++.align	4
++.Loop:
++#if defined(_MIPS_ARCH_MIPS64R6)
++	ld	$in0,0($inp)		# load input
++	ld	$in1,8($inp)
++	beqz	$shr,.Laligned_inp
++
++	ld	$tmp2,16($inp)
++# ifdef	MIPSEB
++	dsllv	$in0,$in0,$shr
++	dsrlv	$tmp3,$in1,$shl
++	dsllv	$in1,$in1,$shr
++	dsrlv	$tmp2,$tmp2,$shl
++# else
++	dsrlv	$in0,$in0,$shr
++	dsllv	$tmp3,$in1,$shl
++	dsrlv	$in1,$in1,$shr
++	dsllv	$tmp2,$tmp2,$shl
++# endif
++	or	$in0,$in0,$tmp3
++	or	$in1,$in1,$tmp2
++.Laligned_inp:
++#else
++	ldl	$in0,0+MSB($inp)	# load input
++	ldl	$in1,8+MSB($inp)
++	ldr	$in0,0+LSB($inp)
++	ldr	$in1,8+LSB($inp)
++#endif
++	daddiu	$inp,16
++#ifdef	MIPSEB
++# if defined(_MIPS_ARCH_MIPS64R2)
++	dsbh	$in0,$in0		# byte swap
++	 dsbh	$in1,$in1
++	dshd	$in0,$in0
++	 dshd	$in1,$in1
++# else
++	ori	$tmp0,$zero,0xFF
++	dsll	$tmp2,$tmp0,32
++	or	$tmp0,$tmp2		# 0x000000FF000000FF
++
++	and	$tmp1,$in0,$tmp0	# byte swap
++	 and	$tmp3,$in1,$tmp0
++	dsrl	$tmp2,$in0,24
++	 dsrl	$tmp4,$in1,24
++	dsll	$tmp1,24
++	 dsll	$tmp3,24
++	and	$tmp2,$tmp0
++	 and	$tmp4,$tmp0
++	dsll	$tmp0,8			# 0x0000FF000000FF00
++	or	$tmp1,$tmp2
++	 or	$tmp3,$tmp4
++	and	$tmp2,$in0,$tmp0
++	 and	$tmp4,$in1,$tmp0
++	dsrl	$in0,8
++	 dsrl	$in1,8
++	dsll	$tmp2,8
++	 dsll	$tmp4,8
++	and	$in0,$tmp0
++	 and	$in1,$tmp0
++	or	$tmp1,$tmp2
++	 or	$tmp3,$tmp4
++	or	$in0,$tmp1
++	 or	$in1,$tmp3
++	dsrl	$tmp1,$in0,32
++	 dsrl	$tmp3,$in1,32
++	dsll	$in0,32
++	 dsll	$in1,32
++	or	$in0,$tmp1
++	 or	$in1,$tmp3
++# endif
++#endif
++	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
++	andi	$h2,$h2,3
++	dsll	$tmp0,$tmp1,2
++
++	daddu	$d0,$h0,$in0		# accumulate input
++	 daddu	$tmp1,$tmp0
++	sltu	$tmp0,$d0,$h0
++	daddu	$d0,$d0,$tmp1		# ... and residue
++	sltu	$tmp1,$d0,$tmp1
++	daddu	$d1,$h1,$in1
++	daddu	$tmp0,$tmp1
++	sltu	$tmp1,$d1,$h1
++	daddu	$d1,$tmp0
++
++	dmultu	($r0,$d0)		# h0*r0
++	 daddu	$d2,$h2,$padbit
++	 sltu	$tmp0,$d1,$tmp0
++	mflo	($h0,$r0,$d0)
++	mfhi	($h1,$r0,$d0)
++
++	dmultu	($rs1,$d1)		# h1*5*r1
++	 daddu	$d2,$tmp1
++	 daddu	$d2,$tmp0
++	mflo	($tmp0,$rs1,$d1)
++	mfhi	($tmp1,$rs1,$d1)
++
++	dmultu	($r1,$d0)		# h0*r1
++	mflo	($tmp2,$r1,$d0)
++	mfhi	($h2,$r1,$d0)
++	 daddu	$h0,$tmp0
++	 daddu	$h1,$tmp1
++	 sltu	$tmp0,$h0,$tmp0
++
++	dmultu	($r0,$d1)		# h1*r0
++	 daddu	$h1,$tmp0
++	 daddu	$h1,$tmp2
++	mflo	($tmp0,$r0,$d1)
++	mfhi	($tmp1,$r0,$d1)
++
++	dmultu	($rs1,$d2)		# h2*5*r1
++	 sltu	$tmp2,$h1,$tmp2
++	 daddu	$h2,$tmp2
++	mflo	($tmp2,$rs1,$d2)
++
++	dmultu	($r0,$d2)		# h2*r0
++	 daddu	$h1,$tmp0
++	 daddu	$h2,$tmp1
++	mflo	($tmp3,$r0,$d2)
++	 sltu	$tmp0,$h1,$tmp0
++	 daddu	$h2,$tmp0
++
++	daddu	$h1,$tmp2
++	sltu	$tmp2,$h1,$tmp2
++	daddu	$h2,$tmp2
++	daddu	$h2,$tmp3
++
++	bne	$inp,$len,.Loop
++
++	sd	$h0,0($ctx)		# store hash value
++	sd	$h1,8($ctx)
++	sd	$h2,16($ctx)
++
++	.set	noreorder
++#if defined(_MIPS_ARCH_MIPS64R6)
++	ld	$s7,56($sp)
++	ld	$s6,48($sp)
++#endif
++	ld	$s5,40($sp)		# epilogue
++	ld	$s4,32($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
++	ld	$s3,24($sp)
++	ld	$s2,16($sp)
++	ld	$s1,8($sp)
++	ld	$s0,0($sp)
++___
++$code.=<<___;
++	jr	$ra
++#if defined(_MIPS_ARCH_MIPS64R6)
++	daddu	$sp,8*8
++#else
++	daddu	$sp,6*8
++#endif
++.end	poly1305_blocks_internal
++___
++}
++{
++my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
++
++$code.=<<___;
++.align	5
++.globl	poly1305_emit
++.ent	poly1305_emit
++poly1305_emit:
++	.frame	$sp,0,$ra
++	.set	reorder
++
++	ld	$tmp2,16($ctx)
++	ld	$tmp0,0($ctx)
++	ld	$tmp1,8($ctx)
++
++	li	$in0,-4			# final reduction
++	dsrl	$in1,$tmp2,2
++	and	$in0,$tmp2
++	andi	$tmp2,$tmp2,3
++	daddu	$in0,$in1
++
++	daddu	$tmp0,$tmp0,$in0
++	sltu	$in1,$tmp0,$in0
++	 daddiu	$in0,$tmp0,5		# compare to modulus
++	daddu	$tmp1,$tmp1,$in1
++	 sltiu	$tmp3,$in0,5
++	sltu	$tmp4,$tmp1,$in1
++	 daddu	$in1,$tmp1,$tmp3
++	daddu	$tmp2,$tmp2,$tmp4
++	 sltu	$tmp3,$in1,$tmp3
++	 daddu	$tmp2,$tmp2,$tmp3
++
++	dsrl	$tmp2,2			# see if it carried/borrowed
++	dsubu	$tmp2,$zero,$tmp2
++
++	xor	$in0,$tmp0
++	xor	$in1,$tmp1
++	and	$in0,$tmp2
++	and	$in1,$tmp2
++	xor	$in0,$tmp0
++	xor	$in1,$tmp1
++
++	lwu	$tmp0,0($nonce)		# load nonce
++	lwu	$tmp1,4($nonce)
++	lwu	$tmp2,8($nonce)
++	lwu	$tmp3,12($nonce)
++	dsll	$tmp1,32
++	dsll	$tmp3,32
++	or	$tmp0,$tmp1
++	or	$tmp2,$tmp3
++
++	daddu	$in0,$tmp0		# accumulate nonce
++	daddu	$in1,$tmp2
++	sltu	$tmp0,$in0,$tmp0
++	daddu	$in1,$tmp0
++
++	dsrl	$tmp0,$in0,8		# write mac value
++	dsrl	$tmp1,$in0,16
++	dsrl	$tmp2,$in0,24
++	sb	$in0,0($mac)
++	dsrl	$tmp3,$in0,32
++	sb	$tmp0,1($mac)
++	dsrl	$tmp0,$in0,40
++	sb	$tmp1,2($mac)
++	dsrl	$tmp1,$in0,48
++	sb	$tmp2,3($mac)
++	dsrl	$tmp2,$in0,56
++	sb	$tmp3,4($mac)
++	dsrl	$tmp3,$in1,8
++	sb	$tmp0,5($mac)
++	dsrl	$tmp0,$in1,16
++	sb	$tmp1,6($mac)
++	dsrl	$tmp1,$in1,24
++	sb	$tmp2,7($mac)
++
++	sb	$in1,8($mac)
++	dsrl	$tmp2,$in1,32
++	sb	$tmp3,9($mac)
++	dsrl	$tmp3,$in1,40
++	sb	$tmp0,10($mac)
++	dsrl	$tmp0,$in1,48
++	sb	$tmp1,11($mac)
++	dsrl	$tmp1,$in1,56
++	sb	$tmp2,12($mac)
++	sb	$tmp3,13($mac)
++	sb	$tmp0,14($mac)
++	sb	$tmp1,15($mac)
++
++	jr	$ra
++.end	poly1305_emit
++.rdata
++.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
++.align	2
++___
++}
++}}} else {{{
++######################################################################
++# 32-bit code path
++#
++
++my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
++my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
++   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
++
++$code.=<<___;
++#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
++     defined(_MIPS_ARCH_MIPS32R6)) \\
++     && !defined(_MIPS_ARCH_MIPS32R2)
++# define _MIPS_ARCH_MIPS32R2
++#endif
++
++#if defined(_MIPS_ARCH_MIPS32R6)
++# define multu(rs,rt)
++# define mflo(rd,rs,rt)	mulu	rd,rs,rt
++# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
++#else
++# define multu(rs,rt)	multu	rs,rt
++# define mflo(rd,rs,rt)	mflo	rd
++# define mfhi(rd,rs,rt)	mfhi	rd
++#endif
++
++#ifdef	__KERNEL__
++# define poly1305_init   poly1305_init_mips
++# define poly1305_blocks poly1305_blocks_mips
++# define poly1305_emit   poly1305_emit_mips
++#endif
++
++#if defined(__MIPSEB__) && !defined(MIPSEB)
++# define MIPSEB
++#endif
++
++#ifdef MIPSEB
++# define MSB 0
++# define LSB 3
++#else
++# define MSB 3
++# define LSB 0
++#endif
++
++.text
++.set	noat
++.set	noreorder
++
++.align	5
++.globl	poly1305_init
++.ent	poly1305_init
++poly1305_init:
++	.frame	$sp,0,$ra
++	.set	reorder
++
++	sw	$zero,0($ctx)
++	sw	$zero,4($ctx)
++	sw	$zero,8($ctx)
++	sw	$zero,12($ctx)
++	sw	$zero,16($ctx)
++
++	beqz	$inp,.Lno_key
++
++#if defined(_MIPS_ARCH_MIPS32R6)
++	andi	$tmp0,$inp,3		# $inp % 4
++	subu	$inp,$inp,$tmp0		# align $inp
++	sll	$tmp0,$tmp0,3		# byte to bit offset
++	lw	$in0,0($inp)
++	lw	$in1,4($inp)
++	lw	$in2,8($inp)
++	lw	$in3,12($inp)
++	beqz	$tmp0,.Laligned_key
++
++	lw	$tmp2,16($inp)
++	subu	$tmp1,$zero,$tmp0
++# ifdef	MIPSEB
++	sllv	$in0,$in0,$tmp0
++	srlv	$tmp3,$in1,$tmp1
++	sllv	$in1,$in1,$tmp0
++	or	$in0,$in0,$tmp3
++	srlv	$tmp3,$in2,$tmp1
++	sllv	$in2,$in2,$tmp0
++	or	$in1,$in1,$tmp3
++	srlv	$tmp3,$in3,$tmp1
++	sllv	$in3,$in3,$tmp0
++	or	$in2,$in2,$tmp3
++	srlv	$tmp2,$tmp2,$tmp1
++	or	$in3,$in3,$tmp2
++# else
++	srlv	$in0,$in0,$tmp0
++	sllv	$tmp3,$in1,$tmp1
++	srlv	$in1,$in1,$tmp0
++	or	$in0,$in0,$tmp3
++	sllv	$tmp3,$in2,$tmp1
++	srlv	$in2,$in2,$tmp0
++	or	$in1,$in1,$tmp3
++	sllv	$tmp3,$in3,$tmp1
++	srlv	$in3,$in3,$tmp0
++	or	$in2,$in2,$tmp3
++	sllv	$tmp2,$tmp2,$tmp1
++	or	$in3,$in3,$tmp2
++# endif
++.Laligned_key:
++#else
++	lwl	$in0,0+MSB($inp)
++	lwl	$in1,4+MSB($inp)
++	lwl	$in2,8+MSB($inp)
++	lwl	$in3,12+MSB($inp)
++	lwr	$in0,0+LSB($inp)
++	lwr	$in1,4+LSB($inp)
++	lwr	$in2,8+LSB($inp)
++	lwr	$in3,12+LSB($inp)
++#endif
++#ifdef	MIPSEB
++# if defined(_MIPS_ARCH_MIPS32R2)
++	wsbh	$in0,$in0		# byte swap
++	wsbh	$in1,$in1
++	wsbh	$in2,$in2
++	wsbh	$in3,$in3
++	rotr	$in0,$in0,16
++	rotr	$in1,$in1,16
++	rotr	$in2,$in2,16
++	rotr	$in3,$in3,16
++# else
++	srl	$tmp0,$in0,24		# byte swap
++	srl	$tmp1,$in0,8
++	andi	$tmp2,$in0,0xFF00
++	sll	$in0,$in0,24
++	andi	$tmp1,0xFF00
++	sll	$tmp2,$tmp2,8
++	or	$in0,$tmp0
++	 srl	$tmp0,$in1,24
++	or	$tmp1,$tmp2
++	 srl	$tmp2,$in1,8
++	or	$in0,$tmp1
++	 andi	$tmp1,$in1,0xFF00
++	 sll	$in1,$in1,24
++	 andi	$tmp2,0xFF00
++	 sll	$tmp1,$tmp1,8
++	 or	$in1,$tmp0
++	srl	$tmp0,$in2,24
++	 or	$tmp2,$tmp1
++	srl	$tmp1,$in2,8
++	 or	$in1,$tmp2
++	andi	$tmp2,$in2,0xFF00
++	sll	$in2,$in2,24
++	andi	$tmp1,0xFF00
++	sll	$tmp2,$tmp2,8
++	or	$in2,$tmp0
++	 srl	$tmp0,$in3,24
++	or	$tmp1,$tmp2
++	 srl	$tmp2,$in3,8
++	or	$in2,$tmp1
++	 andi	$tmp1,$in3,0xFF00
++	 sll	$in3,$in3,24
++	 andi	$tmp2,0xFF00
++	 sll	$tmp1,$tmp1,8
++	 or	$in3,$tmp0
++	 or	$tmp2,$tmp1
++	 or	$in3,$tmp2
++# endif
++#endif
++	lui	$tmp0,0x0fff
++	ori	$tmp0,0xffff		# 0x0fffffff
++	and	$in0,$in0,$tmp0
++	subu	$tmp0,3			# 0x0ffffffc
++	and	$in1,$in1,$tmp0
++	and	$in2,$in2,$tmp0
++	and	$in3,$in3,$tmp0
++
++	sw	$in0,20($ctx)
++	sw	$in1,24($ctx)
++	sw	$in2,28($ctx)
++	sw	$in3,32($ctx)
++
++	srl	$tmp1,$in1,2
++	srl	$tmp2,$in2,2
++	srl	$tmp3,$in3,2
++	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
++	addu	$in2,$in2,$tmp2
++	addu	$in3,$in3,$tmp3
++	sw	$in1,36($ctx)
++	sw	$in2,40($ctx)
++	sw	$in3,44($ctx)
++.Lno_key:
++	li	$v0,0
++	jr	$ra
++.end	poly1305_init
++___
++{
++my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
++
++my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
++   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
++my ($d0,$d1,$d2,$d3) =
++   ($a4,$a5,$a6,$a7);
++my $shr = $t2;		# used on R6
++my $one = $t2;		# used on R2
++
++$code.=<<___;
++.globl	poly1305_blocks
++.align	5
++.ent	poly1305_blocks
++poly1305_blocks:
++	.frame	$sp,16*4,$ra
++	.mask	$SAVED_REGS_MASK,-4
++	.set	noreorder
++	subu	$sp, $sp,4*12
++	sw	$s11,4*11($sp)
++	sw	$s10,4*10($sp)
++	sw	$s9, 4*9($sp)
++	sw	$s8, 4*8($sp)
++	sw	$s7, 4*7($sp)
++	sw	$s6, 4*6($sp)
++	sw	$s5, 4*5($sp)
++	sw	$s4, 4*4($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
++	sw	$s3, 4*3($sp)
++	sw	$s2, 4*2($sp)
++	sw	$s1, 4*1($sp)
++	sw	$s0, 4*0($sp)
++___
++$code.=<<___;
++	.set	reorder
++
++	srl	$len,4			# number of complete blocks
++	li	$one,1
++	beqz	$len,.Labort
++
++#if defined(_MIPS_ARCH_MIPS32R6)
++	andi	$shr,$inp,3
++	subu	$inp,$inp,$shr		# align $inp
++	sll	$shr,$shr,3		# byte to bit offset
++#endif
++
++	lw	$h0,0($ctx)		# load hash value
++	lw	$h1,4($ctx)
++	lw	$h2,8($ctx)
++	lw	$h3,12($ctx)
++	lw	$h4,16($ctx)
++
++	lw	$r0,20($ctx)		# load key
++	lw	$r1,24($ctx)
++	lw	$r2,28($ctx)
++	lw	$r3,32($ctx)
++	lw	$rs1,36($ctx)
++	lw	$rs2,40($ctx)
++	lw	$rs3,44($ctx)
++
++	sll	$len,4
++	addu	$len,$len,$inp		# end of buffer
++	b	.Loop
++
++.align	4
++.Loop:
++#if defined(_MIPS_ARCH_MIPS32R6)
++	lw	$d0,0($inp)		# load input
++	lw	$d1,4($inp)
++	lw	$d2,8($inp)
++	lw	$d3,12($inp)
++	beqz	$shr,.Laligned_inp
++
++	lw	$t0,16($inp)
++	subu	$t1,$zero,$shr
++# ifdef	MIPSEB
++	sllv	$d0,$d0,$shr
++	srlv	$at,$d1,$t1
++	sllv	$d1,$d1,$shr
++	or	$d0,$d0,$at
++	srlv	$at,$d2,$t1
++	sllv	$d2,$d2,$shr
++	or	$d1,$d1,$at
++	srlv	$at,$d3,$t1
++	sllv	$d3,$d3,$shr
++	or	$d2,$d2,$at
++	srlv	$t0,$t0,$t1
++	or	$d3,$d3,$t0
++# else
++	srlv	$d0,$d0,$shr
++	sllv	$at,$d1,$t1
++	srlv	$d1,$d1,$shr
++	or	$d0,$d0,$at
++	sllv	$at,$d2,$t1
++	srlv	$d2,$d2,$shr
++	or	$d1,$d1,$at
++	sllv	$at,$d3,$t1
++	srlv	$d3,$d3,$shr
++	or	$d2,$d2,$at
++	sllv	$t0,$t0,$t1
++	or	$d3,$d3,$t0
++# endif
++.Laligned_inp:
++#else
++	lwl	$d0,0+MSB($inp)		# load input
++	lwl	$d1,4+MSB($inp)
++	lwl	$d2,8+MSB($inp)
++	lwl	$d3,12+MSB($inp)
++	lwr	$d0,0+LSB($inp)
++	lwr	$d1,4+LSB($inp)
++	lwr	$d2,8+LSB($inp)
++	lwr	$d3,12+LSB($inp)
++#endif
++#ifdef	MIPSEB
++# if defined(_MIPS_ARCH_MIPS32R2)
++	wsbh	$d0,$d0			# byte swap
++	wsbh	$d1,$d1
++	wsbh	$d2,$d2
++	wsbh	$d3,$d3
++	rotr	$d0,$d0,16
++	rotr	$d1,$d1,16
++	rotr	$d2,$d2,16
++	rotr	$d3,$d3,16
++# else
++	srl	$at,$d0,24		# byte swap
++	srl	$t0,$d0,8
++	andi	$t1,$d0,0xFF00
++	sll	$d0,$d0,24
++	andi	$t0,0xFF00
++	sll	$t1,$t1,8
++	or	$d0,$at
++	 srl	$at,$d1,24
++	or	$t0,$t1
++	 srl	$t1,$d1,8
++	or	$d0,$t0
++	 andi	$t0,$d1,0xFF00
++	 sll	$d1,$d1,24
++	 andi	$t1,0xFF00
++	 sll	$t0,$t0,8
++	 or	$d1,$at
++	srl	$at,$d2,24
++	 or	$t1,$t0
++	srl	$t0,$d2,8
++	 or	$d1,$t1
++	andi	$t1,$d2,0xFF00
++	sll	$d2,$d2,24
++	andi	$t0,0xFF00
++	sll	$t1,$t1,8
++	or	$d2,$at
++	 srl	$at,$d3,24
++	or	$t0,$t1
++	 srl	$t1,$d3,8
++	or	$d2,$t0
++	 andi	$t0,$d3,0xFF00
++	 sll	$d3,$d3,24
++	 andi	$t1,0xFF00
++	 sll	$t0,$t0,8
++	 or	$d3,$at
++	 or	$t1,$t0
++	 or	$d3,$t1
++# endif
++#endif
++	srl	$t0,$h4,2		# modulo-scheduled reduction
++	andi	$h4,$h4,3
++	sll	$at,$t0,2
++
++	addu	$d0,$d0,$h0		# accumulate input
++	 addu	$t0,$t0,$at
++	sltu	$h0,$d0,$h0
++	addu	$d0,$d0,$t0		# ... and residue
++	sltu	$at,$d0,$t0
++
++	addu	$d1,$d1,$h1
++	 addu	$h0,$h0,$at		# carry
++	sltu	$h1,$d1,$h1
++	addu	$d1,$d1,$h0
++	sltu	$h0,$d1,$h0
++
++	addu	$d2,$d2,$h2
++	 addu	$h1,$h1,$h0		# carry
++	sltu	$h2,$d2,$h2
++	addu	$d2,$d2,$h1
++	sltu	$h1,$d2,$h1
++
++	addu	$d3,$d3,$h3
++	 addu	$h2,$h2,$h1		# carry
++	sltu	$h3,$d3,$h3
++	addu	$d3,$d3,$h2
++
++#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
++	multu	$r0,$d0			# d0*r0
++	 sltu	$h2,$d3,$h2
++	maddu	$rs3,$d1		# d1*s3
++	 addu	$h3,$h3,$h2		# carry
++	maddu	$rs2,$d2		# d2*s2
++	 addu	$h4,$h4,$padbit
++	maddu	$rs1,$d3		# d3*s1
++	 addu	$h4,$h4,$h3
++	mfhi	$at
++	mflo	$h0
++
++	multu	$r1,$d0			# d0*r1
++	maddu	$r0,$d1			# d1*r0
++	maddu	$rs3,$d2		# d2*s3
++	maddu	$rs2,$d3		# d3*s2
++	maddu	$rs1,$h4		# h4*s1
++	maddu	$at,$one		# hi*1
++	mfhi	$at
++	mflo	$h1
++
++	multu	$r2,$d0			# d0*r2
++	maddu	$r1,$d1			# d1*r1
++	maddu	$r0,$d2			# d2*r0
++	maddu	$rs3,$d3		# d3*s3
++	maddu	$rs2,$h4		# h4*s2
++	maddu	$at,$one		# hi*1
++	mfhi	$at
++	mflo	$h2
++
++	mul	$t0,$r0,$h4		# h4*r0
++
++	multu	$r3,$d0			# d0*r3
++	maddu	$r2,$d1			# d1*r2
++	maddu	$r1,$d2			# d2*r1
++	maddu	$r0,$d3			# d3*r0
++	maddu	$rs3,$h4		# h4*s3
++	maddu	$at,$one		# hi*1
++	mfhi	$at
++	mflo	$h3
++
++	 addiu	$inp,$inp,16
++
++	addu	$h4,$t0,$at
++#else
++	multu	($r0,$d0)		# d0*r0
++	mflo	($h0,$r0,$d0)
++	mfhi	($h1,$r0,$d0)
++
++	 sltu	$h2,$d3,$h2
++	 addu	$h3,$h3,$h2		# carry
++
++	multu	($rs3,$d1)		# d1*s3
++	mflo	($at,$rs3,$d1)
++	mfhi	($t0,$rs3,$d1)
++
++	 addu	$h4,$h4,$padbit
++	 addiu	$inp,$inp,16
++	 addu	$h4,$h4,$h3
++
++	multu	($rs2,$d2)		# d2*s2
++	mflo	($a3,$rs2,$d2)
++	mfhi	($t1,$rs2,$d2)
++	 addu	$h0,$h0,$at
++	 addu	$h1,$h1,$t0
++	multu	($rs1,$d3)		# d3*s1
++	 sltu	$at,$h0,$at
++	 addu	$h1,$h1,$at
++
++	mflo	($at,$rs1,$d3)
++	mfhi	($t0,$rs1,$d3)
++	 addu	$h0,$h0,$a3
++	 addu	$h1,$h1,$t1
++	multu	($r1,$d0)		# d0*r1
++	 sltu	$a3,$h0,$a3
++	 addu	$h1,$h1,$a3
++
++
++	mflo	($a3,$r1,$d0)
++	mfhi	($h2,$r1,$d0)
++	 addu	$h0,$h0,$at
++	 addu	$h1,$h1,$t0
++	multu	($r0,$d1)		# d1*r0
++	 sltu	$at,$h0,$at
++	 addu	$h1,$h1,$at
++
++	mflo	($at,$r0,$d1)
++	mfhi	($t0,$r0,$d1)
++	 addu	$h1,$h1,$a3
++	 sltu	$a3,$h1,$a3
++	multu	($rs3,$d2)		# d2*s3
++	 addu	$h2,$h2,$a3
++
++	mflo	($a3,$rs3,$d2)
++	mfhi	($t1,$rs3,$d2)
++	 addu	$h1,$h1,$at
++	 addu	$h2,$h2,$t0
++	multu	($rs2,$d3)		# d3*s2
++	 sltu	$at,$h1,$at
++	 addu	$h2,$h2,$at
++
++	mflo	($at,$rs2,$d3)
++	mfhi	($t0,$rs2,$d3)
++	 addu	$h1,$h1,$a3
++	 addu	$h2,$h2,$t1
++	multu	($rs1,$h4)		# h4*s1
++	 sltu	$a3,$h1,$a3
++	 addu	$h2,$h2,$a3
++
++	mflo	($a3,$rs1,$h4)
++	 addu	$h1,$h1,$at
++	 addu	$h2,$h2,$t0
++	multu	($r2,$d0)		# d0*r2
++	 sltu	$at,$h1,$at
++	 addu	$h2,$h2,$at
++
++
++	mflo	($at,$r2,$d0)
++	mfhi	($h3,$r2,$d0)
++	 addu	$h1,$h1,$a3
++	 sltu	$a3,$h1,$a3
++	multu	($r1,$d1)		# d1*r1
++	 addu	$h2,$h2,$a3
++
++	mflo	($a3,$r1,$d1)
++	mfhi	($t1,$r1,$d1)
++	 addu	$h2,$h2,$at
++	 sltu	$at,$h2,$at
++	multu	($r0,$d2)		# d2*r0
++	 addu	$h3,$h3,$at
++
++	mflo	($at,$r0,$d2)
++	mfhi	($t0,$r0,$d2)
++	 addu	$h2,$h2,$a3
++	 addu	$h3,$h3,$t1
++	multu	($rs3,$d3)		# d3*s3
++	 sltu	$a3,$h2,$a3
++	 addu	$h3,$h3,$a3
++
++	mflo	($a3,$rs3,$d3)
++	mfhi	($t1,$rs3,$d3)
++	 addu	$h2,$h2,$at
++	 addu	$h3,$h3,$t0
++	multu	($rs2,$h4)		# h4*s2
++	 sltu	$at,$h2,$at
++	 addu	$h3,$h3,$at
++
++	mflo	($at,$rs2,$h4)
++	 addu	$h2,$h2,$a3
++	 addu	$h3,$h3,$t1
++	multu	($r3,$d0)		# d0*r3
++	 sltu	$a3,$h2,$a3
++	 addu	$h3,$h3,$a3
++
++
++	mflo	($a3,$r3,$d0)
++	mfhi	($t1,$r3,$d0)
++	 addu	$h2,$h2,$at
++	 sltu	$at,$h2,$at
++	multu	($r2,$d1)		# d1*r2
++	 addu	$h3,$h3,$at
++
++	mflo	($at,$r2,$d1)
++	mfhi	($t0,$r2,$d1)
++	 addu	$h3,$h3,$a3
++	 sltu	$a3,$h3,$a3
++	multu	($r0,$d3)		# d3*r0
++	 addu	$t1,$t1,$a3
++
++	mflo	($a3,$r0,$d3)
++	mfhi	($d3,$r0,$d3)
++	 addu	$h3,$h3,$at
++	 addu	$t1,$t1,$t0
++	multu	($r1,$d2)		# d2*r1
++	 sltu	$at,$h3,$at
++	 addu	$t1,$t1,$at
++
++	mflo	($at,$r1,$d2)
++	mfhi	($t0,$r1,$d2)
++	 addu	$h3,$h3,$a3
++	 addu	$t1,$t1,$d3
++	multu	($rs3,$h4)		# h4*s3
++	 sltu	$a3,$h3,$a3
++	 addu	$t1,$t1,$a3
++
++	mflo	($a3,$rs3,$h4)
++	 addu	$h3,$h3,$at
++	 addu	$t1,$t1,$t0
++	multu	($r0,$h4)		# h4*r0
++	 sltu	$at,$h3,$at
++	 addu	$t1,$t1,$at
++
++
++	mflo	($h4,$r0,$h4)
++	 addu	$h3,$h3,$a3
++	 sltu	$a3,$h3,$a3
++	 addu	$t1,$t1,$a3
++	addu	$h4,$h4,$t1
++
++	li	$padbit,1		# if we loop, padbit is 1
++#endif
++	bne	$inp,$len,.Loop
++
++	sw	$h0,0($ctx)		# store hash value
++	sw	$h1,4($ctx)
++	sw	$h2,8($ctx)
++	sw	$h3,12($ctx)
++	sw	$h4,16($ctx)
++
++	.set	noreorder
++.Labort:
++	lw	$s11,4*11($sp)
++	lw	$s10,4*10($sp)
++	lw	$s9, 4*9($sp)
++	lw	$s8, 4*8($sp)
++	lw	$s7, 4*7($sp)
++	lw	$s6, 4*6($sp)
++	lw	$s5, 4*5($sp)
++	lw	$s4, 4*4($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
++	lw	$s3, 4*3($sp)
++	lw	$s2, 4*2($sp)
++	lw	$s1, 4*1($sp)
++	lw	$s0, 4*0($sp)
++___
++$code.=<<___;
++	jr	$ra
++	addu	$sp,$sp,4*12
++.end	poly1305_blocks
++___
++}
++{
++my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
++
++$code.=<<___;
++.align	5
++.globl	poly1305_emit
++.ent	poly1305_emit
++poly1305_emit:
++	.frame	$sp,0,$ra
++	.set	reorder
++
++	lw	$tmp4,16($ctx)
++	lw	$tmp0,0($ctx)
++	lw	$tmp1,4($ctx)
++	lw	$tmp2,8($ctx)
++	lw	$tmp3,12($ctx)
++
++	li	$in0,-4			# final reduction
++	srl	$ctx,$tmp4,2
++	and	$in0,$in0,$tmp4
++	andi	$tmp4,$tmp4,3
++	addu	$ctx,$ctx,$in0
++
++	addu	$tmp0,$tmp0,$ctx
++	sltu	$ctx,$tmp0,$ctx
++	 addiu	$in0,$tmp0,5		# compare to modulus
++	addu	$tmp1,$tmp1,$ctx
++	 sltiu	$in1,$in0,5
++	sltu	$ctx,$tmp1,$ctx
++	 addu	$in1,$in1,$tmp1
++	addu	$tmp2,$tmp2,$ctx
++	 sltu	$in2,$in1,$tmp1
++	sltu	$ctx,$tmp2,$ctx
++	 addu	$in2,$in2,$tmp2
++	addu	$tmp3,$tmp3,$ctx
++	 sltu	$in3,$in2,$tmp2
++	sltu	$ctx,$tmp3,$ctx
++	 addu	$in3,$in3,$tmp3
++	addu	$tmp4,$tmp4,$ctx
++	 sltu	$ctx,$in3,$tmp3
++	 addu	$ctx,$tmp4
++
++	srl	$ctx,2			# see if it carried/borrowed
++	subu	$ctx,$zero,$ctx
++
++	xor	$in0,$tmp0
++	xor	$in1,$tmp1
++	xor	$in2,$tmp2
++	xor	$in3,$tmp3
++	and	$in0,$ctx
++	and	$in1,$ctx
++	and	$in2,$ctx
++	and	$in3,$ctx
++	xor	$in0,$tmp0
++	xor	$in1,$tmp1
++	xor	$in2,$tmp2
++	xor	$in3,$tmp3
++
++	lw	$tmp0,0($nonce)		# load nonce
++	lw	$tmp1,4($nonce)
++	lw	$tmp2,8($nonce)
++	lw	$tmp3,12($nonce)
++
++	addu	$in0,$tmp0		# accumulate nonce
++	sltu	$ctx,$in0,$tmp0
++
++	addu	$in1,$tmp1
++	sltu	$tmp1,$in1,$tmp1
++	addu	$in1,$ctx
++	sltu	$ctx,$in1,$ctx
++	addu	$ctx,$tmp1
++
++	addu	$in2,$tmp2
++	sltu	$tmp2,$in2,$tmp2
++	addu	$in2,$ctx
++	sltu	$ctx,$in2,$ctx
++	addu	$ctx,$tmp2
++
++	addu	$in3,$tmp3
++	addu	$in3,$ctx
++
++	srl	$tmp0,$in0,8		# write mac value
++	srl	$tmp1,$in0,16
++	srl	$tmp2,$in0,24
++	sb	$in0, 0($mac)
++	sb	$tmp0,1($mac)
++	srl	$tmp0,$in1,8
++	sb	$tmp1,2($mac)
++	srl	$tmp1,$in1,16
++	sb	$tmp2,3($mac)
++	srl	$tmp2,$in1,24
++	sb	$in1, 4($mac)
++	sb	$tmp0,5($mac)
++	srl	$tmp0,$in2,8
++	sb	$tmp1,6($mac)
++	srl	$tmp1,$in2,16
++	sb	$tmp2,7($mac)
++	srl	$tmp2,$in2,24
++	sb	$in2, 8($mac)
++	sb	$tmp0,9($mac)
++	srl	$tmp0,$in3,8
++	sb	$tmp1,10($mac)
++	srl	$tmp1,$in3,16
++	sb	$tmp2,11($mac)
++	srl	$tmp2,$in3,24
++	sb	$in3, 12($mac)
++	sb	$tmp0,13($mac)
++	sb	$tmp1,14($mac)
++	sb	$tmp2,15($mac)
++
++	jr	$ra
++.end	poly1305_emit
++.rdata
++.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
++.align	2
++___
++}
++}}}
++
++$output=pop and open STDOUT,">$output";
++print $code;
++close STDOUT;
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -707,6 +707,11 @@ config CRYPTO_POLY1305_X86_64
+ 	  in IETF protocols. This is the x86_64 assembler implementation using SIMD
+ 	  instructions.
+ 
++config CRYPTO_POLY1305_MIPS
++	tristate "Poly1305 authenticator algorithm (MIPS optimized)"
++	depends on CPU_MIPS32 || (CPU_MIPS64 && 64BIT)
++	select CRYPTO_ARCH_HAVE_LIB_POLY1305
++
+ config CRYPTO_MD4
+ 	tristate "MD4 digest algorithm"
+ 	select CRYPTO_HASH
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -39,6 +39,7 @@ config CRYPTO_LIB_DES
+ 
+ config CRYPTO_LIB_POLY1305_RSIZE
+ 	int
++	default 2 if MIPS
+ 	default 4 if X86_64
+ 	default 9 if ARM || ARM64
+ 	default 1