diff options
Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch')
-rw-r--r-- | target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch | 1563 |
1 files changed, 0 insertions, 1563 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch b/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch deleted file mode 100644 index 272e1797da..0000000000 --- a/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch +++ /dev/null @@ -1,1563 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ard Biesheuvel <ardb@kernel.org> -Date: Fri, 8 Nov 2019 13:22:26 +0100 -Subject: [PATCH] crypto: mips/poly1305 - incorporate OpenSSL/CRYPTOGAMS - optimized implementation -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -commit a11d055e7a64ac34a5e99b6fe731299449cbcd58 upstream. - -This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation for -MIPS authored by Andy Polyakov, a prior 64-bit only version of which has been -contributed by him to the OpenSSL project. The file 'poly1305-mips.pl' is taken -straight from this upstream GitHub repository [0] at commit -d22ade312a7af958ec955620b0d241cf42c37feb, and already contains all the changes -required to build it as part of a Linux kernel module. - -[0] https://github.com/dot-asm/cryptogams - -Co-developed-by: Andy Polyakov <appro@cryptogams.org> -Signed-off-by: Andy Polyakov <appro@cryptogams.org> -Co-developed-by: René van Dorst <opensource@vdorst.com> -Signed-off-by: René van Dorst <opensource@vdorst.com> -Signed-off-by: Ard Biesheuvel <ardb@kernel.org> -Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> -Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> ---- - arch/mips/crypto/Makefile | 14 + - arch/mips/crypto/poly1305-glue.c | 203 +++++ - arch/mips/crypto/poly1305-mips.pl | 1273 +++++++++++++++++++++++++++++ - crypto/Kconfig | 5 + - lib/crypto/Kconfig | 1 + - 5 files changed, 1496 insertions(+) - create mode 100644 arch/mips/crypto/poly1305-glue.c - create mode 100644 arch/mips/crypto/poly1305-mips.pl - ---- a/arch/mips/crypto/Makefile -+++ b/arch/mips/crypto/Makefile -@@ -8,3 +8,17 @@ obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32 - obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o - chacha-mips-y := chacha-core.o chacha-glue.o - AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots -+ -+obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o -+poly1305-mips-y := poly1305-core.o poly1305-glue.o -+ -+perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32 -+perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64 -+ -+quiet_cmd_perlasm = PERLASM $@ -+ cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) -+ -+$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE -+ $(call if_changed,perlasm) -+ -+targets += poly1305-core.S ---- /dev/null -+++ b/arch/mips/crypto/poly1305-glue.c -@@ -0,0 +1,203 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS -+ * -+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org> -+ */ -+ -+#include <asm/unaligned.h> -+#include <crypto/algapi.h> -+#include <crypto/internal/hash.h> -+#include <crypto/internal/poly1305.h> -+#include <linux/cpufeature.h> -+#include <linux/crypto.h> -+#include <linux/module.h> -+ -+asmlinkage void poly1305_init_mips(void *state, const u8 *key); -+asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit); -+asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce); -+ -+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) -+{ -+ poly1305_init_mips(&dctx->h, key); -+ dctx->s[0] = get_unaligned_le32(key + 16); -+ dctx->s[1] = get_unaligned_le32(key + 20); -+ dctx->s[2] = get_unaligned_le32(key + 24); -+ dctx->s[3] = get_unaligned_le32(key + 28); -+ dctx->buflen = 0; -+} -+EXPORT_SYMBOL(poly1305_init_arch); -+ -+static int mips_poly1305_init(struct shash_desc *desc) -+{ -+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); -+ -+ dctx->buflen = 0; -+ dctx->rset = 0; -+ dctx->sset = false; -+ -+ return 0; -+} -+ -+static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, -+ u32 len, u32 hibit) -+{ -+ if (unlikely(!dctx->sset)) { -+ if (!dctx->rset) { -+ poly1305_init_mips(&dctx->h, src); -+ src += POLY1305_BLOCK_SIZE; -+ len -= POLY1305_BLOCK_SIZE; -+ dctx->rset = 1; -+ } -+ if (len >= POLY1305_BLOCK_SIZE) { -+ dctx->s[0] = get_unaligned_le32(src + 0); -+ dctx->s[1] = get_unaligned_le32(src + 4); -+ dctx->s[2] = get_unaligned_le32(src + 8); -+ dctx->s[3] = get_unaligned_le32(src + 12); -+ src += POLY1305_BLOCK_SIZE; -+ len -= POLY1305_BLOCK_SIZE; -+ dctx->sset = true; -+ } -+ if (len < POLY1305_BLOCK_SIZE) -+ return; -+ } -+ -+ len &= ~(POLY1305_BLOCK_SIZE - 1); -+ -+ poly1305_blocks_mips(&dctx->h, src, len, hibit); -+} -+ -+static int mips_poly1305_update(struct shash_desc *desc, const u8 *src, -+ unsigned int len) -+{ -+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); -+ -+ if (unlikely(dctx->buflen)) { -+ u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen); -+ -+ memcpy(dctx->buf + dctx->buflen, src, bytes); -+ src += bytes; -+ len -= bytes; -+ dctx->buflen += bytes; -+ -+ if (dctx->buflen == POLY1305_BLOCK_SIZE) { -+ mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1); -+ dctx->buflen = 0; -+ } -+ } -+ -+ if (likely(len >= POLY1305_BLOCK_SIZE)) { -+ mips_poly1305_blocks(dctx, src, len, 1); -+ src += round_down(len, POLY1305_BLOCK_SIZE); -+ len %= POLY1305_BLOCK_SIZE; -+ } -+ -+ if (unlikely(len)) { -+ dctx->buflen = len; -+ memcpy(dctx->buf, src, len); -+ } -+ return 0; -+} -+ -+void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, -+ unsigned int nbytes) -+{ -+ if (unlikely(dctx->buflen)) { -+ u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen); -+ -+ memcpy(dctx->buf + dctx->buflen, src, bytes); -+ src += bytes; -+ nbytes -= bytes; -+ dctx->buflen += bytes; -+ -+ if (dctx->buflen == POLY1305_BLOCK_SIZE) { -+ poly1305_blocks_mips(&dctx->h, dctx->buf, -+ POLY1305_BLOCK_SIZE, 1); -+ dctx->buflen = 0; -+ } -+ } -+ -+ if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { -+ unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); -+ -+ poly1305_blocks_mips(&dctx->h, src, len, 1); -+ src += len; -+ nbytes %= POLY1305_BLOCK_SIZE; -+ } -+ -+ if (unlikely(nbytes)) { -+ dctx->buflen = nbytes; -+ memcpy(dctx->buf, src, nbytes); -+ } -+} -+EXPORT_SYMBOL(poly1305_update_arch); -+ -+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) -+{ -+ __le32 digest[4]; -+ u64 f = 0; -+ -+ if (unlikely(dctx->buflen)) { -+ dctx->buf[dctx->buflen++] = 1; -+ memset(dctx->buf + dctx->buflen, 0, -+ POLY1305_BLOCK_SIZE - dctx->buflen); -+ poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); -+ } -+ -+ poly1305_emit_mips(&dctx->h, digest, dctx->s); -+ -+ /* mac = (h + s) % (2^128) */ -+ f = (f >> 32) + le32_to_cpu(digest[0]); -+ put_unaligned_le32(f, dst); -+ f = (f >> 32) + le32_to_cpu(digest[1]); -+ put_unaligned_le32(f, dst + 4); -+ f = (f >> 32) + le32_to_cpu(digest[2]); -+ put_unaligned_le32(f, dst + 8); -+ f = (f >> 32) + le32_to_cpu(digest[3]); -+ put_unaligned_le32(f, dst + 12); -+ -+ *dctx = (struct poly1305_desc_ctx){}; -+} -+EXPORT_SYMBOL(poly1305_final_arch); -+ -+static int mips_poly1305_final(struct shash_desc *desc, u8 *dst) -+{ -+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); -+ -+ if (unlikely(!dctx->sset)) -+ return -ENOKEY; -+ -+ poly1305_final_arch(dctx, dst); -+ return 0; -+} -+ -+static struct shash_alg mips_poly1305_alg = { -+ .init = mips_poly1305_init, -+ .update = mips_poly1305_update, -+ .final = mips_poly1305_final, -+ .digestsize = POLY1305_DIGEST_SIZE, -+ .descsize = sizeof(struct poly1305_desc_ctx), -+ -+ .base.cra_name = "poly1305", -+ .base.cra_driver_name = "poly1305-mips", -+ .base.cra_priority = 200, -+ .base.cra_blocksize = POLY1305_BLOCK_SIZE, -+ .base.cra_module = THIS_MODULE, -+}; -+ -+static int __init mips_poly1305_mod_init(void) -+{ -+ return crypto_register_shash(&mips_poly1305_alg); -+} -+ -+static void __exit mips_poly1305_mod_exit(void) -+{ -+ crypto_unregister_shash(&mips_poly1305_alg); -+} -+ -+module_init(mips_poly1305_mod_init); -+module_exit(mips_poly1305_mod_exit); -+ -+MODULE_LICENSE("GPL v2"); -+MODULE_ALIAS_CRYPTO("poly1305"); -+MODULE_ALIAS_CRYPTO("poly1305-mips"); ---- /dev/null -+++ b/arch/mips/crypto/poly1305-mips.pl -@@ -0,0 +1,1273 @@ -+#!/usr/bin/env perl -+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause -+# -+# ==================================================================== -+# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL -+# project. -+# ==================================================================== -+ -+# Poly1305 hash for MIPS. -+# -+# May 2016 -+# -+# Numbers are cycles per processed byte with poly1305_blocks alone. -+# -+# IALU/gcc -+# R1x000 ~5.5/+130% (big-endian) -+# Octeon II 2.50/+70% (little-endian) -+# -+# March 2019 -+# -+# Add 32-bit code path. -+# -+# October 2019 -+# -+# Modulo-scheduling reduction allows to omit dependency chain at the -+# end of inner loop and improve performance. Also optimize MIPS32R2 -+# code path for MIPS 1004K core. Per René von Dorst's suggestions. -+# -+# IALU/gcc -+# R1x000 ~9.8/? (big-endian) -+# Octeon II 3.65/+140% (little-endian) -+# MT7621/1004K 4.75/? (little-endian) -+# -+###################################################################### -+# There is a number of MIPS ABI in use, O32 and N32/64 are most -+# widely used. Then there is a new contender: NUBI. It appears that if -+# one picks the latter, it's possible to arrange code in ABI neutral -+# manner. Therefore let's stick to NUBI register layout: -+# -+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); -+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); -+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); -+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); -+# -+# The return value is placed in $a0. Following coding rules facilitate -+# interoperability: -+# -+# - never ever touch $tp, "thread pointer", former $gp [o32 can be -+# excluded from the rule, because it's specified volatile]; -+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting -+# old code]; -+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; -+# -+# For reference here is register layout for N32/64 MIPS ABIs: -+# -+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); -+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); -+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); -+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); -+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); -+# -+# <appro@openssl.org> -+# -+###################################################################### -+ -+$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 -+ -+$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; -+ -+if ($flavour =~ /64|n32/i) {{{ -+###################################################################### -+# 64-bit code path -+# -+ -+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); -+my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); -+ -+$code.=<<___; -+#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ -+ defined(_MIPS_ARCH_MIPS64R6)) \\ -+ && !defined(_MIPS_ARCH_MIPS64R2) -+# define _MIPS_ARCH_MIPS64R2 -+#endif -+ -+#if defined(_MIPS_ARCH_MIPS64R6) -+# define dmultu(rs,rt) -+# define mflo(rd,rs,rt) dmulu rd,rs,rt -+# define mfhi(rd,rs,rt) dmuhu rd,rs,rt -+#else -+# define dmultu(rs,rt) dmultu rs,rt -+# define mflo(rd,rs,rt) mflo rd -+# define mfhi(rd,rs,rt) mfhi rd -+#endif -+ -+#ifdef __KERNEL__ -+# define poly1305_init poly1305_init_mips -+# define poly1305_blocks poly1305_blocks_mips -+# define poly1305_emit poly1305_emit_mips -+#endif -+ -+#if defined(__MIPSEB__) && !defined(MIPSEB) -+# define MIPSEB -+#endif -+ -+#ifdef MIPSEB -+# define MSB 0 -+# define LSB 7 -+#else -+# define MSB 7 -+# define LSB 0 -+#endif -+ -+.text -+.set noat -+.set noreorder -+ -+.align 5 -+.globl poly1305_init -+.ent poly1305_init -+poly1305_init: -+ .frame $sp,0,$ra -+ .set reorder -+ -+ sd $zero,0($ctx) -+ sd $zero,8($ctx) -+ sd $zero,16($ctx) -+ -+ beqz $inp,.Lno_key -+ -+#if defined(_MIPS_ARCH_MIPS64R6) -+ andi $tmp0,$inp,7 # $inp % 8 -+ dsubu $inp,$inp,$tmp0 # align $inp -+ sll $tmp0,$tmp0,3 # byte to bit offset -+ ld $in0,0($inp) -+ ld $in1,8($inp) -+ beqz $tmp0,.Laligned_key -+ ld $tmp2,16($inp) -+ -+ subu $tmp1,$zero,$tmp0 -+# ifdef MIPSEB -+ dsllv $in0,$in0,$tmp0 -+ dsrlv $tmp3,$in1,$tmp1 -+ dsllv $in1,$in1,$tmp0 -+ dsrlv $tmp2,$tmp2,$tmp1 -+# else -+ dsrlv $in0,$in0,$tmp0 -+ dsllv $tmp3,$in1,$tmp1 -+ dsrlv $in1,$in1,$tmp0 -+ dsllv $tmp2,$tmp2,$tmp1 -+# endif -+ or $in0,$in0,$tmp3 -+ or $in1,$in1,$tmp2 -+.Laligned_key: -+#else -+ ldl $in0,0+MSB($inp) -+ ldl $in1,8+MSB($inp) -+ ldr $in0,0+LSB($inp) -+ ldr $in1,8+LSB($inp) -+#endif -+#ifdef MIPSEB -+# if defined(_MIPS_ARCH_MIPS64R2) -+ dsbh $in0,$in0 # byte swap -+ dsbh $in1,$in1 -+ dshd $in0,$in0 -+ dshd $in1,$in1 -+# else -+ ori $tmp0,$zero,0xFF -+ dsll $tmp2,$tmp0,32 -+ or $tmp0,$tmp2 # 0x000000FF000000FF -+ -+ and $tmp1,$in0,$tmp0 # byte swap -+ and $tmp3,$in1,$tmp0 -+ dsrl $tmp2,$in0,24 -+ dsrl $tmp4,$in1,24 -+ dsll $tmp1,24 -+ dsll $tmp3,24 -+ and $tmp2,$tmp0 -+ and $tmp4,$tmp0 -+ dsll $tmp0,8 # 0x0000FF000000FF00 -+ or $tmp1,$tmp2 -+ or $tmp3,$tmp4 -+ and $tmp2,$in0,$tmp0 -+ and $tmp4,$in1,$tmp0 -+ dsrl $in0,8 -+ dsrl $in1,8 -+ dsll $tmp2,8 -+ dsll $tmp4,8 -+ and $in0,$tmp0 -+ and $in1,$tmp0 -+ or $tmp1,$tmp2 -+ or $tmp3,$tmp4 -+ or $in0,$tmp1 -+ or $in1,$tmp3 -+ dsrl $tmp1,$in0,32 -+ dsrl $tmp3,$in1,32 -+ dsll $in0,32 -+ dsll $in1,32 -+ or $in0,$tmp1 -+ or $in1,$tmp3 -+# endif -+#endif -+ li $tmp0,1 -+ dsll $tmp0,32 # 0x0000000100000000 -+ daddiu $tmp0,-63 # 0x00000000ffffffc1 -+ dsll $tmp0,28 # 0x0ffffffc10000000 -+ daddiu $tmp0,-1 # 0x0ffffffc0fffffff -+ -+ and $in0,$tmp0 -+ daddiu $tmp0,-3 # 0x0ffffffc0ffffffc -+ and $in1,$tmp0 -+ -+ sd $in0,24($ctx) -+ dsrl $tmp0,$in1,2 -+ sd $in1,32($ctx) -+ daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) -+ sd $tmp0,40($ctx) -+ -+.Lno_key: -+ li $v0,0 # return 0 -+ jr $ra -+.end poly1305_init -+___ -+{ -+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; -+ -+my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = -+ ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); -+my ($shr,$shl) = ($s6,$s7); # used on R6 -+ -+$code.=<<___; -+.align 5 -+.globl poly1305_blocks -+.ent poly1305_blocks -+poly1305_blocks: -+ .set noreorder -+ dsrl $len,4 # number of complete blocks -+ bnez $len,poly1305_blocks_internal -+ nop -+ jr $ra -+ nop -+.end poly1305_blocks -+ -+.align 5 -+.ent poly1305_blocks_internal -+poly1305_blocks_internal: -+ .set noreorder -+#if defined(_MIPS_ARCH_MIPS64R6) -+ .frame $sp,8*8,$ra -+ .mask $SAVED_REGS_MASK|0x000c0000,-8 -+ dsubu $sp,8*8 -+ sd $s7,56($sp) -+ sd $s6,48($sp) -+#else -+ .frame $sp,6*8,$ra -+ .mask $SAVED_REGS_MASK,-8 -+ dsubu $sp,6*8 -+#endif -+ sd $s5,40($sp) -+ sd $s4,32($sp) -+___ -+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue -+ sd $s3,24($sp) -+ sd $s2,16($sp) -+ sd $s1,8($sp) -+ sd $s0,0($sp) -+___ -+$code.=<<___; -+ .set reorder -+ -+#if defined(_MIPS_ARCH_MIPS64R6) -+ andi $shr,$inp,7 -+ dsubu $inp,$inp,$shr # align $inp -+ sll $shr,$shr,3 # byte to bit offset -+ subu $shl,$zero,$shr -+#endif -+ -+ ld $h0,0($ctx) # load hash value -+ ld $h1,8($ctx) -+ ld $h2,16($ctx) -+ -+ ld $r0,24($ctx) # load key -+ ld $r1,32($ctx) -+ ld $rs1,40($ctx) -+ -+ dsll $len,4 -+ daddu $len,$inp # end of buffer -+ b .Loop -+ -+.align 4 -+.Loop: -+#if defined(_MIPS_ARCH_MIPS64R6) -+ ld $in0,0($inp) # load input -+ ld $in1,8($inp) -+ beqz $shr,.Laligned_inp -+ -+ ld $tmp2,16($inp) -+# ifdef MIPSEB -+ dsllv $in0,$in0,$shr -+ dsrlv $tmp3,$in1,$shl -+ dsllv $in1,$in1,$shr -+ dsrlv $tmp2,$tmp2,$shl -+# else -+ dsrlv $in0,$in0,$shr -+ dsllv $tmp3,$in1,$shl -+ dsrlv $in1,$in1,$shr -+ dsllv $tmp2,$tmp2,$shl -+# endif -+ or $in0,$in0,$tmp3 -+ or $in1,$in1,$tmp2 -+.Laligned_inp: -+#else -+ ldl $in0,0+MSB($inp) # load input -+ ldl $in1,8+MSB($inp) -+ ldr $in0,0+LSB($inp) -+ ldr $in1,8+LSB($inp) -+#endif -+ daddiu $inp,16 -+#ifdef MIPSEB -+# if defined(_MIPS_ARCH_MIPS64R2) -+ dsbh $in0,$in0 # byte swap -+ dsbh $in1,$in1 -+ dshd $in0,$in0 -+ dshd $in1,$in1 -+# else -+ ori $tmp0,$zero,0xFF -+ dsll $tmp2,$tmp0,32 -+ or $tmp0,$tmp2 # 0x000000FF000000FF -+ -+ and $tmp1,$in0,$tmp0 # byte swap -+ and $tmp3,$in1,$tmp0 -+ dsrl $tmp2,$in0,24 -+ dsrl $tmp4,$in1,24 -+ dsll $tmp1,24 -+ dsll $tmp3,24 -+ and $tmp2,$tmp0 -+ and $tmp4,$tmp0 -+ dsll $tmp0,8 # 0x0000FF000000FF00 -+ or $tmp1,$tmp2 -+ or $tmp3,$tmp4 -+ and $tmp2,$in0,$tmp0 -+ and $tmp4,$in1,$tmp0 -+ dsrl $in0,8 -+ dsrl $in1,8 -+ dsll $tmp2,8 -+ dsll $tmp4,8 -+ and $in0,$tmp0 -+ and $in1,$tmp0 -+ or $tmp1,$tmp2 -+ or $tmp3,$tmp4 -+ or $in0,$tmp1 -+ or $in1,$tmp3 -+ dsrl $tmp1,$in0,32 -+ dsrl $tmp3,$in1,32 -+ dsll $in0,32 -+ dsll $in1,32 -+ or $in0,$tmp1 -+ or $in1,$tmp3 -+# endif -+#endif -+ dsrl $tmp1,$h2,2 # modulo-scheduled reduction -+ andi $h2,$h2,3 -+ dsll $tmp0,$tmp1,2 -+ -+ daddu $d0,$h0,$in0 # accumulate input -+ daddu $tmp1,$tmp0 -+ sltu $tmp0,$d0,$h0 -+ daddu $d0,$d0,$tmp1 # ... and residue -+ sltu $tmp1,$d0,$tmp1 -+ daddu $d1,$h1,$in1 -+ daddu $tmp0,$tmp1 -+ sltu $tmp1,$d1,$h1 -+ daddu $d1,$tmp0 -+ -+ dmultu ($r0,$d0) # h0*r0 -+ daddu $d2,$h2,$padbit -+ sltu $tmp0,$d1,$tmp0 -+ mflo ($h0,$r0,$d0) -+ mfhi ($h1,$r0,$d0) -+ -+ dmultu ($rs1,$d1) # h1*5*r1 -+ daddu $d2,$tmp1 -+ daddu $d2,$tmp0 -+ mflo ($tmp0,$rs1,$d1) -+ mfhi ($tmp1,$rs1,$d1) -+ -+ dmultu ($r1,$d0) # h0*r1 -+ mflo ($tmp2,$r1,$d0) -+ mfhi ($h2,$r1,$d0) -+ daddu $h0,$tmp0 -+ daddu $h1,$tmp1 -+ sltu $tmp0,$h0,$tmp0 -+ -+ dmultu ($r0,$d1) # h1*r0 -+ daddu $h1,$tmp0 -+ daddu $h1,$tmp2 -+ mflo ($tmp0,$r0,$d1) -+ mfhi ($tmp1,$r0,$d1) -+ -+ dmultu ($rs1,$d2) # h2*5*r1 -+ sltu $tmp2,$h1,$tmp2 -+ daddu $h2,$tmp2 -+ mflo ($tmp2,$rs1,$d2) -+ -+ dmultu ($r0,$d2) # h2*r0 -+ daddu $h1,$tmp0 -+ daddu $h2,$tmp1 -+ mflo ($tmp3,$r0,$d2) -+ sltu $tmp0,$h1,$tmp0 -+ daddu $h2,$tmp0 -+ -+ daddu $h1,$tmp2 -+ sltu $tmp2,$h1,$tmp2 -+ daddu $h2,$tmp2 -+ daddu $h2,$tmp3 -+ -+ bne $inp,$len,.Loop -+ -+ sd $h0,0($ctx) # store hash value -+ sd $h1,8($ctx) -+ sd $h2,16($ctx) -+ -+ .set noreorder -+#if defined(_MIPS_ARCH_MIPS64R6) -+ ld $s7,56($sp) -+ ld $s6,48($sp) -+#endif -+ ld $s5,40($sp) # epilogue -+ ld $s4,32($sp) -+___ -+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue -+ ld $s3,24($sp) -+ ld $s2,16($sp) -+ ld $s1,8($sp) -+ ld $s0,0($sp) -+___ -+$code.=<<___; -+ jr $ra -+#if defined(_MIPS_ARCH_MIPS64R6) -+ daddu $sp,8*8 -+#else -+ daddu $sp,6*8 -+#endif -+.end poly1305_blocks_internal -+___ -+} -+{ -+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); -+ -+$code.=<<___; -+.align 5 -+.globl poly1305_emit -+.ent poly1305_emit -+poly1305_emit: -+ .frame $sp,0,$ra -+ .set reorder -+ -+ ld $tmp2,16($ctx) -+ ld $tmp0,0($ctx) -+ ld $tmp1,8($ctx) -+ -+ li $in0,-4 # final reduction -+ dsrl $in1,$tmp2,2 -+ and $in0,$tmp2 -+ andi $tmp2,$tmp2,3 -+ daddu $in0,$in1 -+ -+ daddu $tmp0,$tmp0,$in0 -+ sltu $in1,$tmp0,$in0 -+ daddiu $in0,$tmp0,5 # compare to modulus -+ daddu $tmp1,$tmp1,$in1 -+ sltiu $tmp3,$in0,5 -+ sltu $tmp4,$tmp1,$in1 -+ daddu $in1,$tmp1,$tmp3 -+ daddu $tmp2,$tmp2,$tmp4 -+ sltu $tmp3,$in1,$tmp3 -+ daddu $tmp2,$tmp2,$tmp3 -+ -+ dsrl $tmp2,2 # see if it carried/borrowed -+ dsubu $tmp2,$zero,$tmp2 -+ -+ xor $in0,$tmp0 -+ xor $in1,$tmp1 -+ and $in0,$tmp2 -+ and $in1,$tmp2 -+ xor $in0,$tmp0 -+ xor $in1,$tmp1 -+ -+ lwu $tmp0,0($nonce) # load nonce -+ lwu $tmp1,4($nonce) -+ lwu $tmp2,8($nonce) -+ lwu $tmp3,12($nonce) -+ dsll $tmp1,32 -+ dsll $tmp3,32 -+ or $tmp0,$tmp1 -+ or $tmp2,$tmp3 -+ -+ daddu $in0,$tmp0 # accumulate nonce -+ daddu $in1,$tmp2 -+ sltu $tmp0,$in0,$tmp0 -+ daddu $in1,$tmp0 -+ -+ dsrl $tmp0,$in0,8 # write mac value -+ dsrl $tmp1,$in0,16 -+ dsrl $tmp2,$in0,24 -+ sb $in0,0($mac) -+ dsrl $tmp3,$in0,32 -+ sb $tmp0,1($mac) -+ dsrl $tmp0,$in0,40 -+ sb $tmp1,2($mac) -+ dsrl $tmp1,$in0,48 -+ sb $tmp2,3($mac) -+ dsrl $tmp2,$in0,56 -+ sb $tmp3,4($mac) -+ dsrl $tmp3,$in1,8 -+ sb $tmp0,5($mac) -+ dsrl $tmp0,$in1,16 -+ sb $tmp1,6($mac) -+ dsrl $tmp1,$in1,24 -+ sb $tmp2,7($mac) -+ -+ sb $in1,8($mac) -+ dsrl $tmp2,$in1,32 -+ sb $tmp3,9($mac) -+ dsrl $tmp3,$in1,40 -+ sb $tmp0,10($mac) -+ dsrl $tmp0,$in1,48 -+ sb $tmp1,11($mac) -+ dsrl $tmp1,$in1,56 -+ sb $tmp2,12($mac) -+ sb $tmp3,13($mac) -+ sb $tmp0,14($mac) -+ sb $tmp1,15($mac) -+ -+ jr $ra -+.end poly1305_emit -+.rdata -+.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" -+.align 2 -+___ -+} -+}}} else {{{ -+###################################################################### -+# 32-bit code path -+# -+ -+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); -+my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = -+ ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); -+ -+$code.=<<___; -+#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ -+ defined(_MIPS_ARCH_MIPS32R6)) \\ -+ && !defined(_MIPS_ARCH_MIPS32R2) -+# define _MIPS_ARCH_MIPS32R2 -+#endif -+ -+#if defined(_MIPS_ARCH_MIPS32R6) -+# define multu(rs,rt) -+# define mflo(rd,rs,rt) mulu rd,rs,rt -+# define mfhi(rd,rs,rt) muhu rd,rs,rt -+#else -+# define multu(rs,rt) multu rs,rt -+# define mflo(rd,rs,rt) mflo rd -+# define mfhi(rd,rs,rt) mfhi rd -+#endif -+ -+#ifdef __KERNEL__ -+# define poly1305_init poly1305_init_mips -+# define poly1305_blocks poly1305_blocks_mips -+# define poly1305_emit poly1305_emit_mips -+#endif -+ -+#if defined(__MIPSEB__) && !defined(MIPSEB) -+# define MIPSEB -+#endif -+ -+#ifdef MIPSEB -+# define MSB 0 -+# define LSB 3 -+#else -+# define MSB 3 -+# define LSB 0 -+#endif -+ -+.text -+.set noat -+.set noreorder -+ -+.align 5 -+.globl poly1305_init -+.ent poly1305_init -+poly1305_init: -+ .frame $sp,0,$ra -+ .set reorder -+ -+ sw $zero,0($ctx) -+ sw $zero,4($ctx) -+ sw $zero,8($ctx) -+ sw $zero,12($ctx) -+ sw $zero,16($ctx) -+ -+ beqz $inp,.Lno_key -+ -+#if defined(_MIPS_ARCH_MIPS32R6) -+ andi $tmp0,$inp,3 # $inp % 4 -+ subu $inp,$inp,$tmp0 # align $inp -+ sll $tmp0,$tmp0,3 # byte to bit offset -+ lw $in0,0($inp) -+ lw $in1,4($inp) -+ lw $in2,8($inp) -+ lw $in3,12($inp) -+ beqz $tmp0,.Laligned_key -+ -+ lw $tmp2,16($inp) -+ subu $tmp1,$zero,$tmp0 -+# ifdef MIPSEB -+ sllv $in0,$in0,$tmp0 -+ srlv $tmp3,$in1,$tmp1 -+ sllv $in1,$in1,$tmp0 -+ or $in0,$in0,$tmp3 -+ srlv $tmp3,$in2,$tmp1 -+ sllv $in2,$in2,$tmp0 -+ or $in1,$in1,$tmp3 -+ srlv $tmp3,$in3,$tmp1 -+ sllv $in3,$in3,$tmp0 -+ or $in2,$in2,$tmp3 -+ srlv $tmp2,$tmp2,$tmp1 -+ or $in3,$in3,$tmp2 -+# else -+ srlv $in0,$in0,$tmp0 -+ sllv $tmp3,$in1,$tmp1 -+ srlv $in1,$in1,$tmp0 -+ or $in0,$in0,$tmp3 -+ sllv $tmp3,$in2,$tmp1 -+ srlv $in2,$in2,$tmp0 -+ or $in1,$in1,$tmp3 -+ sllv $tmp3,$in3,$tmp1 -+ srlv $in3,$in3,$tmp0 -+ or $in2,$in2,$tmp3 -+ sllv $tmp2,$tmp2,$tmp1 -+ or $in3,$in3,$tmp2 -+# endif -+.Laligned_key: -+#else -+ lwl $in0,0+MSB($inp) -+ lwl $in1,4+MSB($inp) -+ lwl $in2,8+MSB($inp) -+ lwl $in3,12+MSB($inp) -+ lwr $in0,0+LSB($inp) -+ lwr $in1,4+LSB($inp) -+ lwr $in2,8+LSB($inp) -+ lwr $in3,12+LSB($inp) -+#endif -+#ifdef MIPSEB -+# if defined(_MIPS_ARCH_MIPS32R2) -+ wsbh $in0,$in0 # byte swap -+ wsbh $in1,$in1 -+ wsbh $in2,$in2 -+ wsbh $in3,$in3 -+ rotr $in0,$in0,16 -+ rotr $in1,$in1,16 -+ rotr $in2,$in2,16 -+ rotr $in3,$in3,16 -+# else -+ srl $tmp0,$in0,24 # byte swap -+ srl $tmp1,$in0,8 -+ andi $tmp2,$in0,0xFF00 -+ sll $in0,$in0,24 -+ andi $tmp1,0xFF00 -+ sll $tmp2,$tmp2,8 -+ or $in0,$tmp0 -+ srl $tmp0,$in1,24 -+ or $tmp1,$tmp2 -+ srl $tmp2,$in1,8 -+ or $in0,$tmp1 -+ andi $tmp1,$in1,0xFF00 -+ sll $in1,$in1,24 -+ andi $tmp2,0xFF00 -+ sll $tmp1,$tmp1,8 -+ or $in1,$tmp0 -+ srl $tmp0,$in2,24 -+ or $tmp2,$tmp1 -+ srl $tmp1,$in2,8 -+ or $in1,$tmp2 -+ andi $tmp2,$in2,0xFF00 -+ sll $in2,$in2,24 -+ andi $tmp1,0xFF00 -+ sll $tmp2,$tmp2,8 -+ or $in2,$tmp0 -+ srl $tmp0,$in3,24 -+ or $tmp1,$tmp2 -+ srl $tmp2,$in3,8 -+ or $in2,$tmp1 -+ andi $tmp1,$in3,0xFF00 -+ sll $in3,$in3,24 -+ andi $tmp2,0xFF00 -+ sll $tmp1,$tmp1,8 -+ or $in3,$tmp0 -+ or $tmp2,$tmp1 -+ or $in3,$tmp2 -+# endif -+#endif -+ lui $tmp0,0x0fff -+ ori $tmp0,0xffff # 0x0fffffff -+ and $in0,$in0,$tmp0 -+ subu $tmp0,3 # 0x0ffffffc -+ and $in1,$in1,$tmp0 -+ and $in2,$in2,$tmp0 -+ and $in3,$in3,$tmp0 -+ -+ sw $in0,20($ctx) -+ sw $in1,24($ctx) -+ sw $in2,28($ctx) -+ sw $in3,32($ctx) -+ -+ srl $tmp1,$in1,2 -+ srl $tmp2,$in2,2 -+ srl $tmp3,$in3,2 -+ addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) -+ addu $in2,$in2,$tmp2 -+ addu $in3,$in3,$tmp3 -+ sw $in1,36($ctx) -+ sw $in2,40($ctx) -+ sw $in3,44($ctx) -+.Lno_key: -+ li $v0,0 -+ jr $ra -+.end poly1305_init -+___ -+{ -+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000"; -+ -+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = -+ ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); -+my ($d0,$d1,$d2,$d3) = -+ ($a4,$a5,$a6,$a7); -+my $shr = $t2; # used on R6 -+my $one = $t2; # used on R2 -+ -+$code.=<<___; -+.globl poly1305_blocks -+.align 5 -+.ent poly1305_blocks -+poly1305_blocks: -+ .frame $sp,16*4,$ra -+ .mask $SAVED_REGS_MASK,-4 -+ .set noreorder -+ subu $sp, $sp,4*12 -+ sw $s11,4*11($sp) -+ sw $s10,4*10($sp) -+ sw $s9, 4*9($sp) -+ sw $s8, 4*8($sp) -+ sw $s7, 4*7($sp) -+ sw $s6, 4*6($sp) -+ sw $s5, 4*5($sp) -+ sw $s4, 4*4($sp) -+___ -+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue -+ sw $s3, 4*3($sp) -+ sw $s2, 4*2($sp) -+ sw $s1, 4*1($sp) -+ sw $s0, 4*0($sp) -+___ -+$code.=<<___; -+ .set reorder -+ -+ srl $len,4 # number of complete blocks -+ li $one,1 -+ beqz $len,.Labort -+ -+#if defined(_MIPS_ARCH_MIPS32R6) -+ andi $shr,$inp,3 -+ subu $inp,$inp,$shr # align $inp -+ sll $shr,$shr,3 # byte to bit offset -+#endif -+ -+ lw $h0,0($ctx) # load hash value -+ lw $h1,4($ctx) -+ lw $h2,8($ctx) -+ lw $h3,12($ctx) -+ lw $h4,16($ctx) -+ -+ lw $r0,20($ctx) # load key -+ lw $r1,24($ctx) -+ lw $r2,28($ctx) -+ lw $r3,32($ctx) -+ lw $rs1,36($ctx) -+ lw $rs2,40($ctx) -+ lw $rs3,44($ctx) -+ -+ sll $len,4 -+ addu $len,$len,$inp # end of buffer -+ b .Loop -+ -+.align 4 -+.Loop: -+#if defined(_MIPS_ARCH_MIPS32R6) -+ lw $d0,0($inp) # load input -+ lw $d1,4($inp) -+ lw $d2,8($inp) -+ lw $d3,12($inp) -+ beqz $shr,.Laligned_inp -+ -+ lw $t0,16($inp) -+ subu $t1,$zero,$shr -+# ifdef MIPSEB -+ sllv $d0,$d0,$shr -+ srlv $at,$d1,$t1 -+ sllv $d1,$d1,$shr -+ or $d0,$d0,$at -+ srlv $at,$d2,$t1 -+ sllv $d2,$d2,$shr -+ or $d1,$d1,$at -+ srlv $at,$d3,$t1 -+ sllv $d3,$d3,$shr -+ or $d2,$d2,$at -+ srlv $t0,$t0,$t1 -+ or $d3,$d3,$t0 -+# else -+ srlv $d0,$d0,$shr -+ sllv $at,$d1,$t1 -+ srlv $d1,$d1,$shr -+ or $d0,$d0,$at -+ sllv $at,$d2,$t1 -+ srlv $d2,$d2,$shr -+ or $d1,$d1,$at -+ sllv $at,$d3,$t1 -+ srlv $d3,$d3,$shr -+ or $d2,$d2,$at -+ sllv $t0,$t0,$t1 -+ or $d3,$d3,$t0 -+# endif -+.Laligned_inp: -+#else -+ lwl $d0,0+MSB($inp) # load input -+ lwl $d1,4+MSB($inp) -+ lwl $d2,8+MSB($inp) -+ lwl $d3,12+MSB($inp) -+ lwr $d0,0+LSB($inp) -+ lwr $d1,4+LSB($inp) -+ lwr $d2,8+LSB($inp) -+ lwr $d3,12+LSB($inp) -+#endif -+#ifdef MIPSEB -+# if defined(_MIPS_ARCH_MIPS32R2) -+ wsbh $d0,$d0 # byte swap -+ wsbh $d1,$d1 -+ wsbh $d2,$d2 -+ wsbh $d3,$d3 -+ rotr $d0,$d0,16 -+ rotr $d1,$d1,16 -+ rotr $d2,$d2,16 -+ rotr $d3,$d3,16 -+# else -+ srl $at,$d0,24 # byte swap -+ srl $t0,$d0,8 -+ andi $t1,$d0,0xFF00 -+ sll $d0,$d0,24 -+ andi $t0,0xFF00 -+ sll $t1,$t1,8 -+ or $d0,$at -+ srl $at,$d1,24 -+ or $t0,$t1 -+ srl $t1,$d1,8 -+ or $d0,$t0 -+ andi $t0,$d1,0xFF00 -+ sll $d1,$d1,24 -+ andi $t1,0xFF00 -+ sll $t0,$t0,8 -+ or $d1,$at -+ srl $at,$d2,24 -+ or $t1,$t0 -+ srl $t0,$d2,8 -+ or $d1,$t1 -+ andi $t1,$d2,0xFF00 -+ sll $d2,$d2,24 -+ andi $t0,0xFF00 -+ sll $t1,$t1,8 -+ or $d2,$at -+ srl $at,$d3,24 -+ or $t0,$t1 -+ srl $t1,$d3,8 -+ or $d2,$t0 -+ andi $t0,$d3,0xFF00 -+ sll $d3,$d3,24 -+ andi $t1,0xFF00 -+ sll $t0,$t0,8 -+ or $d3,$at -+ or $t1,$t0 -+ or $d3,$t1 -+# endif -+#endif -+ srl $t0,$h4,2 # modulo-scheduled reduction -+ andi $h4,$h4,3 -+ sll $at,$t0,2 -+ -+ addu $d0,$d0,$h0 # accumulate input -+ addu $t0,$t0,$at -+ sltu $h0,$d0,$h0 -+ addu $d0,$d0,$t0 # ... and residue -+ sltu $at,$d0,$t0 -+ -+ addu $d1,$d1,$h1 -+ addu $h0,$h0,$at # carry -+ sltu $h1,$d1,$h1 -+ addu $d1,$d1,$h0 -+ sltu $h0,$d1,$h0 -+ -+ addu $d2,$d2,$h2 -+ addu $h1,$h1,$h0 # carry -+ sltu $h2,$d2,$h2 -+ addu $d2,$d2,$h1 -+ sltu $h1,$d2,$h1 -+ -+ addu $d3,$d3,$h3 -+ addu $h2,$h2,$h1 # carry -+ sltu $h3,$d3,$h3 -+ addu $d3,$d3,$h2 -+ -+#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) -+ multu $r0,$d0 # d0*r0 -+ sltu $h2,$d3,$h2 -+ maddu $rs3,$d1 # d1*s3 -+ addu $h3,$h3,$h2 # carry -+ maddu $rs2,$d2 # d2*s2 -+ addu $h4,$h4,$padbit -+ maddu $rs1,$d3 # d3*s1 -+ addu $h4,$h4,$h3 -+ mfhi $at -+ mflo $h0 -+ -+ multu $r1,$d0 # d0*r1 -+ maddu $r0,$d1 # d1*r0 -+ maddu $rs3,$d2 # d2*s3 -+ maddu $rs2,$d3 # d3*s2 -+ maddu $rs1,$h4 # h4*s1 -+ maddu $at,$one # hi*1 -+ mfhi $at -+ mflo $h1 -+ -+ multu $r2,$d0 # d0*r2 -+ maddu $r1,$d1 # d1*r1 -+ maddu $r0,$d2 # d2*r0 -+ maddu $rs3,$d3 # d3*s3 -+ maddu $rs2,$h4 # h4*s2 -+ maddu $at,$one # hi*1 -+ mfhi $at -+ mflo $h2 -+ -+ mul $t0,$r0,$h4 # h4*r0 -+ -+ multu $r3,$d0 # d0*r3 -+ maddu $r2,$d1 # d1*r2 -+ maddu $r1,$d2 # d2*r1 -+ maddu $r0,$d3 # d3*r0 -+ maddu $rs3,$h4 # h4*s3 -+ maddu $at,$one # hi*1 -+ mfhi $at -+ mflo $h3 -+ -+ addiu $inp,$inp,16 -+ -+ addu $h4,$t0,$at -+#else -+ multu ($r0,$d0) # d0*r0 -+ mflo ($h0,$r0,$d0) -+ mfhi ($h1,$r0,$d0) -+ -+ sltu $h2,$d3,$h2 -+ addu $h3,$h3,$h2 # carry -+ -+ multu ($rs3,$d1) # d1*s3 -+ mflo ($at,$rs3,$d1) -+ mfhi ($t0,$rs3,$d1) -+ -+ addu $h4,$h4,$padbit -+ addiu $inp,$inp,16 -+ addu $h4,$h4,$h3 -+ -+ multu ($rs2,$d2) # d2*s2 -+ mflo ($a3,$rs2,$d2) -+ mfhi ($t1,$rs2,$d2) -+ addu $h0,$h0,$at -+ addu $h1,$h1,$t0 -+ multu ($rs1,$d3) # d3*s1 -+ sltu $at,$h0,$at -+ addu $h1,$h1,$at -+ -+ mflo ($at,$rs1,$d3) -+ mfhi ($t0,$rs1,$d3) -+ addu $h0,$h0,$a3 -+ addu $h1,$h1,$t1 -+ multu ($r1,$d0) # d0*r1 -+ sltu $a3,$h0,$a3 -+ addu $h1,$h1,$a3 -+ -+ -+ mflo ($a3,$r1,$d0) -+ mfhi ($h2,$r1,$d0) -+ addu $h0,$h0,$at -+ addu $h1,$h1,$t0 -+ multu ($r0,$d1) # d1*r0 -+ sltu $at,$h0,$at -+ addu $h1,$h1,$at -+ -+ mflo ($at,$r0,$d1) -+ mfhi ($t0,$r0,$d1) -+ addu $h1,$h1,$a3 -+ sltu $a3,$h1,$a3 -+ multu ($rs3,$d2) # d2*s3 -+ addu $h2,$h2,$a3 -+ -+ mflo ($a3,$rs3,$d2) -+ mfhi ($t1,$rs3,$d2) -+ addu $h1,$h1,$at -+ addu $h2,$h2,$t0 -+ multu ($rs2,$d3) # d3*s2 -+ sltu $at,$h1,$at -+ addu $h2,$h2,$at -+ -+ mflo ($at,$rs2,$d3) -+ mfhi ($t0,$rs2,$d3) -+ addu $h1,$h1,$a3 -+ addu $h2,$h2,$t1 -+ multu ($rs1,$h4) # h4*s1 -+ sltu $a3,$h1,$a3 -+ addu $h2,$h2,$a3 -+ -+ mflo ($a3,$rs1,$h4) -+ addu $h1,$h1,$at -+ addu $h2,$h2,$t0 -+ multu ($r2,$d0) # d0*r2 -+ sltu $at,$h1,$at -+ addu $h2,$h2,$at -+ -+ -+ mflo ($at,$r2,$d0) -+ mfhi ($h3,$r2,$d0) -+ addu $h1,$h1,$a3 -+ sltu $a3,$h1,$a3 -+ multu ($r1,$d1) # d1*r1 -+ addu $h2,$h2,$a3 -+ -+ mflo ($a3,$r1,$d1) -+ mfhi ($t1,$r1,$d1) -+ addu $h2,$h2,$at -+ sltu $at,$h2,$at -+ multu ($r0,$d2) # d2*r0 -+ addu $h3,$h3,$at -+ -+ mflo ($at,$r0,$d2) -+ mfhi ($t0,$r0,$d2) -+ addu $h2,$h2,$a3 -+ addu $h3,$h3,$t1 -+ multu ($rs3,$d3) # d3*s3 -+ sltu $a3,$h2,$a3 -+ addu $h3,$h3,$a3 -+ -+ mflo ($a3,$rs3,$d3) -+ mfhi ($t1,$rs3,$d3) -+ addu $h2,$h2,$at -+ addu $h3,$h3,$t0 -+ multu ($rs2,$h4) # h4*s2 -+ sltu $at,$h2,$at -+ addu $h3,$h3,$at -+ -+ mflo ($at,$rs2,$h4) -+ addu $h2,$h2,$a3 -+ addu $h3,$h3,$t1 -+ multu ($r3,$d0) # d0*r3 -+ sltu $a3,$h2,$a3 -+ addu $h3,$h3,$a3 -+ -+ -+ mflo ($a3,$r3,$d0) -+ mfhi ($t1,$r3,$d0) -+ addu $h2,$h2,$at -+ sltu $at,$h2,$at -+ multu ($r2,$d1) # d1*r2 -+ addu $h3,$h3,$at -+ -+ mflo ($at,$r2,$d1) -+ mfhi ($t0,$r2,$d1) -+ addu $h3,$h3,$a3 -+ sltu $a3,$h3,$a3 -+ multu ($r0,$d3) # d3*r0 -+ addu $t1,$t1,$a3 -+ -+ mflo ($a3,$r0,$d3) -+ mfhi ($d3,$r0,$d3) -+ addu $h3,$h3,$at -+ addu $t1,$t1,$t0 -+ multu ($r1,$d2) # d2*r1 -+ sltu $at,$h3,$at -+ addu $t1,$t1,$at -+ -+ mflo ($at,$r1,$d2) -+ mfhi ($t0,$r1,$d2) -+ addu $h3,$h3,$a3 -+ addu $t1,$t1,$d3 -+ multu ($rs3,$h4) # h4*s3 -+ sltu $a3,$h3,$a3 -+ addu $t1,$t1,$a3 -+ -+ mflo ($a3,$rs3,$h4) -+ addu $h3,$h3,$at -+ addu $t1,$t1,$t0 -+ multu ($r0,$h4) # h4*r0 -+ sltu $at,$h3,$at -+ addu $t1,$t1,$at -+ -+ -+ mflo ($h4,$r0,$h4) -+ addu $h3,$h3,$a3 -+ sltu $a3,$h3,$a3 -+ addu $t1,$t1,$a3 -+ addu $h4,$h4,$t1 -+ -+ li $padbit,1 # if we loop, padbit is 1 -+#endif -+ bne $inp,$len,.Loop -+ -+ sw $h0,0($ctx) # store hash value -+ sw $h1,4($ctx) -+ sw $h2,8($ctx) -+ sw $h3,12($ctx) -+ sw $h4,16($ctx) -+ -+ .set noreorder -+.Labort: -+ lw $s11,4*11($sp) -+ lw $s10,4*10($sp) -+ lw $s9, 4*9($sp) -+ lw $s8, 4*8($sp) -+ lw $s7, 4*7($sp) -+ lw $s6, 4*6($sp) -+ lw $s5, 4*5($sp) -+ lw $s4, 4*4($sp) -+___ -+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue -+ lw $s3, 4*3($sp) -+ lw $s2, 4*2($sp) -+ lw $s1, 4*1($sp) -+ lw $s0, 4*0($sp) -+___ -+$code.=<<___; -+ jr $ra -+ addu $sp,$sp,4*12 -+.end poly1305_blocks -+___ -+} -+{ -+my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); -+ -+$code.=<<___; -+.align 5 -+.globl poly1305_emit -+.ent poly1305_emit -+poly1305_emit: -+ .frame $sp,0,$ra -+ .set reorder -+ -+ lw $tmp4,16($ctx) -+ lw $tmp0,0($ctx) -+ lw $tmp1,4($ctx) -+ lw $tmp2,8($ctx) -+ lw $tmp3,12($ctx) -+ -+ li $in0,-4 # final reduction -+ srl $ctx,$tmp4,2 -+ and $in0,$in0,$tmp4 -+ andi $tmp4,$tmp4,3 -+ addu $ctx,$ctx,$in0 -+ -+ addu $tmp0,$tmp0,$ctx -+ sltu $ctx,$tmp0,$ctx -+ addiu $in0,$tmp0,5 # compare to modulus -+ addu $tmp1,$tmp1,$ctx -+ sltiu $in1,$in0,5 -+ sltu $ctx,$tmp1,$ctx -+ addu $in1,$in1,$tmp1 -+ addu $tmp2,$tmp2,$ctx -+ sltu $in2,$in1,$tmp1 -+ sltu $ctx,$tmp2,$ctx -+ addu $in2,$in2,$tmp2 -+ addu $tmp3,$tmp3,$ctx -+ sltu $in3,$in2,$tmp2 -+ sltu $ctx,$tmp3,$ctx -+ addu $in3,$in3,$tmp3 -+ addu $tmp4,$tmp4,$ctx -+ sltu $ctx,$in3,$tmp3 -+ addu $ctx,$tmp4 -+ -+ srl $ctx,2 # see if it carried/borrowed -+ subu $ctx,$zero,$ctx -+ -+ xor $in0,$tmp0 -+ xor $in1,$tmp1 -+ xor $in2,$tmp2 -+ xor $in3,$tmp3 -+ and $in0,$ctx -+ and $in1,$ctx -+ and $in2,$ctx -+ and $in3,$ctx -+ xor $in0,$tmp0 -+ xor $in1,$tmp1 -+ xor $in2,$tmp2 -+ xor $in3,$tmp3 -+ -+ lw $tmp0,0($nonce) # load nonce -+ lw $tmp1,4($nonce) -+ lw $tmp2,8($nonce) -+ lw $tmp3,12($nonce) -+ -+ addu $in0,$tmp0 # accumulate nonce -+ sltu $ctx,$in0,$tmp0 -+ -+ addu $in1,$tmp1 -+ sltu $tmp1,$in1,$tmp1 -+ addu $in1,$ctx -+ sltu $ctx,$in1,$ctx -+ addu $ctx,$tmp1 -+ -+ addu $in2,$tmp2 -+ sltu $tmp2,$in2,$tmp2 -+ addu $in2,$ctx -+ sltu $ctx,$in2,$ctx -+ addu $ctx,$tmp2 -+ -+ addu $in3,$tmp3 -+ addu $in3,$ctx -+ -+ srl $tmp0,$in0,8 # write mac value -+ srl $tmp1,$in0,16 -+ srl $tmp2,$in0,24 -+ sb $in0, 0($mac) -+ sb $tmp0,1($mac) -+ srl $tmp0,$in1,8 -+ sb $tmp1,2($mac) -+ srl $tmp1,$in1,16 -+ sb $tmp2,3($mac) -+ srl $tmp2,$in1,24 -+ sb $in1, 4($mac) -+ sb $tmp0,5($mac) -+ srl $tmp0,$in2,8 -+ sb $tmp1,6($mac) -+ srl $tmp1,$in2,16 -+ sb $tmp2,7($mac) -+ srl $tmp2,$in2,24 -+ sb $in2, 8($mac) -+ sb $tmp0,9($mac) -+ srl $tmp0,$in3,8 -+ sb $tmp1,10($mac) -+ srl $tmp1,$in3,16 -+ sb $tmp2,11($mac) -+ srl $tmp2,$in3,24 -+ sb $in3, 12($mac) -+ sb $tmp0,13($mac) -+ sb $tmp1,14($mac) -+ sb $tmp2,15($mac) -+ -+ jr $ra -+.end poly1305_emit -+.rdata -+.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" -+.align 2 -+___ -+} -+}}} -+ -+$output=pop and open STDOUT,">$output"; -+print $code; -+close STDOUT; ---- a/crypto/Kconfig -+++ b/crypto/Kconfig -@@ -707,6 +707,11 @@ config CRYPTO_POLY1305_X86_64 - in IETF protocols. This is the x86_64 assembler implementation using SIMD - instructions. - -+config CRYPTO_POLY1305_MIPS -+ tristate "Poly1305 authenticator algorithm (MIPS optimized)" -+ depends on CPU_MIPS32 || (CPU_MIPS64 && 64BIT) -+ select CRYPTO_ARCH_HAVE_LIB_POLY1305 -+ - config CRYPTO_MD4 - tristate "MD4 digest algorithm" - select CRYPTO_HASH ---- a/lib/crypto/Kconfig -+++ b/lib/crypto/Kconfig -@@ -39,6 +39,7 @@ config CRYPTO_LIB_DES - - config CRYPTO_LIB_POLY1305_RSIZE - int -+ default 2 if MIPS - default 4 if X86_64 - default 9 if ARM || ARM64 - default 1 |