Diffstat (limited to 'tboot/vmac.c')
-rw-r--r--  tboot/vmac.c  1228
1 file changed, 1228 insertions, 0 deletions
diff --git a/tboot/vmac.c b/tboot/vmac.c
new file mode 100644
index 0000000..be234ed
--- /dev/null
+++ b/tboot/vmac.c
@@ -0,0 +1,1228 @@
+/* --------------------------------------------------------------------------
+ * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
+ * This implementation is hereby placed in the public domain.
+ * The authors offer no warranty. Use at your own risk.
+ * Please send bug reports to the authors.
+ * Last modified: 17 APR 08, 1700 PDT
+ * ----------------------------------------------------------------------- */
+/*
+ * Portions copyright (c) 2010, Intel Corporation
+ */
+
+//#include "vmac.h"
+//#include <string.h>
+//#include <stdio.h>
+/* start for tboot */
+#include <config.h>
+#include <efibase.h>
+#include <types.h>
+#include <vmac.h>
+/*#define UINT64_C(x) x##ULL*/
+/* end for tboot */
+
+/* Enable code tuned for 64-bit registers; otherwise tuned for 32-bit */
+#ifndef VMAC_ARCH_64
+#define VMAC_ARCH_64 (__x86_64__ || __ppc64__ || _M_X64)
+#endif
+
+/* Enable code tuned for Intel SSE2 instruction set */
+#if ((__SSE2__ || (_M_IX86_FP >= 2)) && ( ! VMAC_ARCH_64))
+#define VMAC_USE_SSE2 1
+#include <emmintrin.h>
+#endif
+
+/* Native word reads. Update (or define via compiler) if incorrect */
+#ifndef VMAC_ARCH_BIG_ENDIAN /* Assume big-endian unless on the list */
+#define VMAC_ARCH_BIG_ENDIAN \
+ (!(__x86_64__ || __i386__ || _M_IX86 || \
+ _M_X64 || __ARMEL__ || __MIPSEL__))
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* Constants and masks */
+
+const uint64_t p64 = UINT64_C(0xfffffffffffffeff); /* 2^64 - 257 prime */
+const uint64_t m62 = UINT64_C(0x3fffffffffffffff); /* 62-bit mask */
+const uint64_t m63 = UINT64_C(0x7fffffffffffffff); /* 63-bit mask */
+const uint64_t m64 = UINT64_C(0xffffffffffffffff); /* 64-bit mask */
+const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff); /* Poly key mask */
+
+/* ----------------------------------------------------------------------- *
+ * The following routines are used in this implementation. They are
+ * written via macros to simulate zero-overhead call-by-reference.
+ * All have default implementations for when they are not defined in an
+ * architecture-specific manner.
+ *
+ * MUL64: 64x64->128-bit multiplication
+ * PMUL64: assumes top bits cleared on inputs
+ * ADD128: 128x128->128-bit addition
+ * GET_REVERSED_64: load and byte-reverse 64-bit word
+ * ----------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+#if (__GNUC__ && (__x86_64__ || __amd64__))
+/* ----------------------------------------------------------------------- */
+
+#define ADD128(rh,rl,ih,il) \
+ asm ("addq %3, %1 \n\t" \
+ "adcq %2, %0" \
+ : "+r"(rh),"+r"(rl) \
+ : "r"(ih),"r"(il) : "cc");
+
+#define MUL64(rh,rl,i1,i2) \
+ asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "r"(i2) : "cc")
+
+#define PMUL64 MUL64
+
+#define GET_REVERSED_64(p) \
+ ({uint64_t x; \
+ asm ("bswapq %0" : "=r" (x) : "0"(*(uint64_t *)(p))); x;})
+
+/* ----------------------------------------------------------------------- */
+#elif (__GNUC__ && __i386__)
+/* ----------------------------------------------------------------------- */
+
+#define GET_REVERSED_64(p) \
+ ({ uint64_t x; \
+ uint32_t *tp = (uint32_t *)(p); \
+ asm ("bswap %%edx\n\t" \
+ "bswap %%eax" \
+ : "=A"(x) \
+ : "a"(tp[1]), "d"(tp[0])); \
+ x; })
+
+/* ----------------------------------------------------------------------- */
+#elif (__GNUC__ && __ppc64__)
+/* ----------------------------------------------------------------------- */
+
+#define ADD128(rh,rl,ih,il) \
+ asm volatile ( "addc %1, %1, %3 \n\t" \
+ "adde %0, %0, %2" \
+ : "+r"(rh),"+r"(rl) \
+ : "r"(ih),"r"(il));
+
+#define MUL64(rh,rl,i1,i2) \
+{ uint64_t _i1 = (i1), _i2 = (i2); \
+ rl = _i1 * _i2; \
+ asm volatile ("mulhdu %0, %1, %2" : "=r" (rh) : "r" (_i1), "r" (_i2));\
+}
+
+#define PMUL64 MUL64
+
+#define GET_REVERSED_64(p) \
+ ({ uint32_t hi, lo, *_p = (uint32_t *)(p); \
+ asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) ); \
+ asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) ); \
+ ((uint64_t)hi << 32) | (uint64_t)lo; } )
+
+/* ----------------------------------------------------------------------- */
+#elif (__GNUC__ && (__ppc__ || __PPC__))
+/* ----------------------------------------------------------------------- */
+
+#define GET_REVERSED_64(p) \
+ ({ uint32_t hi, lo, *_p = (uint32_t *)(p); \
+ asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) ); \
+ asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) ); \
+ ((uint64_t)hi << 32) | (uint64_t)lo; } )
+
+/* ----------------------------------------------------------------------- */
+#elif (__GNUC__ && (__ARMEL__ || __ARM__))
+/* ----------------------------------------------------------------------- */
+
+#define bswap32(v) \
+({ uint32_t tmp,out; \
+ asm volatile( \
+ "eor %1, %2, %2, ror #16\n" \
+ "bic %1, %1, #0x00ff0000\n" \
+ "mov %0, %2, ror #8\n" \
+ "eor %0, %0, %1, lsr #8" \
+ : "=r" (out), "=&r" (tmp) \
+ : "r" (v)); \
+ out;})
+
+/* ----------------------------------------------------------------------- */
+#elif _MSC_VER
+/* ----------------------------------------------------------------------- */
+
+#include <intrin.h>
+
+#if (_M_IA64 || _M_X64) && \
+ (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
+#define MUL64(rh,rl,i1,i2) (rl) = _umul128(i1,i2,&(rh));
+#pragma intrinsic(_umul128)
+#define PMUL64 MUL64
+#endif
+
+/* MSVC uses add, adc in this version */
+#define ADD128(rh,rl,ih,il) \
+ { uint64_t _il = (il); \
+ (rl) += (_il); \
+ (rh) += (ih) + ((rl) < (_il)); \
+ }
+
+#if _MSC_VER >= 1300
+#define GET_REVERSED_64(p) _byteswap_uint64(*(uint64_t *)(p))
+#pragma intrinsic(_byteswap_uint64)
+#endif
+
+#if _MSC_VER >= 1400 && \
+ (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
+#define MUL32(i1,i2) (__emulu((uint32_t)(i1),(uint32_t)(i2)))
+#pragma intrinsic(__emulu)
+#endif
+
+/* ----------------------------------------------------------------------- */
+#endif
+/* ----------------------------------------------------------------------- */
+
+#if __GNUC__
+/*#define ALIGN(n) __attribute__ ((aligned(n)))*/
+#define NOINLINE __attribute__ ((noinline))
+#define FASTCALL
+#elif _MSC_VER
+#define ALIGN(n) __declspec(align(n))
+#define NOINLINE __declspec(noinline)
+#define FASTCALL __fastcall
+#else
+#define ALIGN(n)
+#define NOINLINE
+#define FASTCALL
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* Default implementations, if not defined above */
+/* ----------------------------------------------------------------------- */
+
+#ifndef ADD128
+#define ADD128(rh,rl,ih,il) \
+ { uint64_t _il = (il); \
+ (rl) += (_il); \
+ if ((rl) < (_il)) (rh)++; \
+ (rh) += (ih); \
+ }
+#endif
+
+#ifndef MUL32
+#define MUL32(i1,i2) ((uint64_t)(uint32_t)(i1)*(uint32_t)(i2))
+#endif
+
+#ifndef PMUL64 /* rh must not be the same as i1 or i2 */
+#define PMUL64(rh,rl,i1,i2) /* Assumes m doesn't overflow */ \
+ { uint64_t _i1 = (i1), _i2 = (i2); \
+ uint64_t m = MUL32(_i1,_i2>>32) + MUL32(_i1>>32,_i2); \
+ rh = MUL32(_i1>>32,_i2>>32); \
+ rl = MUL32(_i1,_i2); \
+ ADD128(rh,rl,(m >> 32),(m << 32)); \
+ }
+#endif
+
+#ifndef MUL64
+#define MUL64(rh,rl,i1,i2) \
+ { uint64_t _i1 = (i1), _i2 = (i2); \
+ uint64_t m1= MUL32(_i1,_i2>>32); \
+ uint64_t m2= MUL32(_i1>>32,_i2); \
+ rh = MUL32(_i1>>32,_i2>>32); \
+ rl = MUL32(_i1,_i2); \
+ ADD128(rh,rl,(m1 >> 32),(m1 << 32)); \
+ ADD128(rh,rl,(m2 >> 32),(m2 << 32)); \
+ }
+#endif
+
+#ifndef GET_REVERSED_64
+#ifndef bswap64
+#ifndef bswap32
+#define bswap32(x) \
+ ({ uint32_t bsx = (x); \
+ ((((bsx) & 0xff000000u) >> 24) | (((bsx) & 0x00ff0000u) >> 8) | \
+ (((bsx) & 0x0000ff00u) << 8) | (((bsx) & 0x000000ffu) << 24)); })
+#endif
+#define bswap64(x) \
+ ({ union { uint64_t ll; uint32_t l[2]; } w, r; \
+ w.ll = (x); \
+ r.l[0] = bswap32 (w.l[1]); \
+ r.l[1] = bswap32 (w.l[0]); \
+ r.ll; })
+#endif
+#define GET_REVERSED_64(p) bswap64(*(uint64_t *)(p))
+#endif
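+
+/* Illustrative sketch, not used by the implementation: the macros above
+ * behave like call-by-reference routines, so accumulating 64x64-bit
+ * products into a 128-bit sum looks like the following (the names x, y,
+ * acc_hi and acc_lo are hypothetical).  Guarded out of the build. */
+#if 0
+static void mul_add_sketch(void)
+{
+    uint64_t x = UINT64_C(0x0123456789abcdef);
+    uint64_t y = UINT64_C(0xfedcba9876543210);
+    uint64_t hi, lo, acc_hi = 0, acc_lo = 0;
+
+    MUL64(hi, lo, x, y);            /* (hi,lo) = full 128-bit product x*y */
+    ADD128(acc_hi, acc_lo, hi, lo); /* (acc_hi,acc_lo) += (hi,lo)         */
+}
+#endif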
+
+/* ----------------------------------------------------------------------- */
+
+#if (VMAC_PREFER_BIG_ENDIAN)
+# define get64PE get64BE
+#else
+# define get64PE get64LE
+#endif
+
+#if (VMAC_ARCH_BIG_ENDIAN)
+# define get64BE(ptr) (*(uint64_t *)(ptr))
+# define get64LE(ptr) GET_REVERSED_64(ptr)
+#else /* assume little-endian */
+# define get64BE(ptr) GET_REVERSED_64(ptr)
+# define get64LE(ptr) (*(uint64_t *)(ptr))
+#endif
+
+
+/* --------------------------------------------------------------------- *
+ * For highest performance the L1 NH and L2 polynomial hashes should be
+ * carefully implemented to take advantage of one's target architecture.
+ * Here these two hash functions are defined multiple times: once for
+ * 64-bit architectures, once for 32-bit SSE2 architectures, and once
+ * for the remaining (32-bit) architectures.
+ * For each, nh_16 *must* be defined (works on multiples of 16 bytes).
+ * Optionally, nh_vmac_nhbytes can be defined (for multiples of
+ * VMAC_NHBYTES), and nh_16_2 and nh_vmac_nhbytes_2 (versions that do two
+ * NH computations at once).
+ * --------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+#if VMAC_ARCH_64
+/* ----------------------------------------------------------------------- */
+
+#define nh_16(mp, kp, nw, rh, rl) \
+{ int i; uint64_t th, tl; \
+ rh = rl = 0; \
+ for (i = 0; i < nw; i+= 2) { \
+ MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
+ ADD128(rh,rl,th,tl); \
+ } \
+}
+#define nh_16_2(mp, kp, nw, rh, rl, rh1, rl1) \
+{ int i; uint64_t th, tl; \
+ rh1 = rl1 = rh = rl = 0; \
+ for (i = 0; i < nw; i+= 2) { \
+ MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
+ ADD128(rh,rl,th,tl); \
+ MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
+ ADD128(rh1,rl1,th,tl); \
+ } \
+}
+
+#if (VMAC_NHBYTES >= 64) /* These versions process 64 bytes of message at a time */
+#define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \
+{ int i; uint64_t th, tl; \
+ rh = rl = 0; \
+ for (i = 0; i < nw; i+= 8) { \
+ MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
+ ADD128(rh,rl,th,tl); \
+ MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
+ ADD128(rh,rl,th,tl); \
+ MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
+ ADD128(rh,rl,th,tl); \
+ MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
+ ADD128(rh,rl,th,tl); \
+ } \
+}
+#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh1, rl1) \
+{ int i; uint64_t th, tl; \
+ rh1 = rl1 = rh = rl = 0; \
+ for (i = 0; i < nw; i+= 8) { \
+ MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
+ ADD128(rh,rl,th,tl); \
+ MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
+ ADD128(rh1,rl1,th,tl); \
+ MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
+ ADD128(rh,rl,th,tl); \
+ MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+4],get64PE((mp)+i+3)+(kp)[i+5]);\
+ ADD128(rh1,rl1,th,tl); \
+ MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
+ ADD128(rh,rl,th,tl); \
+ MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+6],get64PE((mp)+i+5)+(kp)[i+7]);\
+ ADD128(rh1,rl1,th,tl); \
+ MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
+ ADD128(rh,rl,th,tl); \
+ MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+8],get64PE((mp)+i+7)+(kp)[i+9]);\
+ ADD128(rh1,rl1,th,tl); \
+ } \
+}
+#endif
+
+#define poly_step(ah, al, kh, kl, mh, ml) \
+{ uint64_t t1h, t1l, t2h, t2l, t3h, t3l, z=0; \
+ /* compute ab*cd, put bd into result registers */ \
+ PMUL64(t3h,t3l,al,kh); \
+ PMUL64(t2h,t2l,ah,kl); \
+ PMUL64(t1h,t1l,ah,2*kh); \
+ PMUL64(ah,al,al,kl); \
+ /* add 2 * ac to result */ \
+ ADD128(ah,al,t1h,t1l); \
+ /* add together ad + bc */ \
+ ADD128(t2h,t2l,t3h,t3l); \
+ /* now (ah,al), (t2l,2*t2h) need summing */ \
+ /* first add the high registers, carrying into t2h */ \
+ ADD128(t2h,ah,z,t2l); \
+ /* double t2h and add top bit of ah */ \
+ t2h = 2 * t2h + (ah >> 63); \
+ ah &= m63; \
+ /* now add the low registers */ \
+ ADD128(ah,al,mh,ml); \
+ ADD128(ah,al,z,t2h); \
+}
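+
+/* In effect, each poly_step performs one L2 polynomial-hash step,
+ *     (ah,al) <- (ah,al)*(kh,kl) + (mh,ml)
+ * partially reduced mod 2^127-1, with the key limbs pre-masked by mpoly;
+ * l3hash below completes the reduction. */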
+
+/* ----------------------------------------------------------------------- */
+#elif VMAC_USE_SSE2
+/* ----------------------------------------------------------------------- */
+
+// macros from Crypto++ for sharing inline assembly code between MSVC and GNU C
+#if defined(__GNUC__)
+ // define these in two steps to allow arguments to be expanded
+ #define GNU_AS2(x, y) #x ", " #y ";"
+ #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";"
+ #define GNU_ASL(x) "\n" #x ":"
+ #define GNU_ASJ(x, y, z) #x " " #y #z ";"
+ #define AS2(x, y) GNU_AS2(x, y)
+ #define AS3(x, y, z) GNU_AS3(x, y, z)
+ #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
+ #define ASL(x) GNU_ASL(x)
+ #define ASJ(x, y, z) GNU_ASJ(x, y, z)
+#else
+ #define AS2(x, y) __asm {x, y}
+ #define AS3(x, y, z) __asm {x, y, z}
+ #define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)}
+ #define ASL(x) __asm {label##x:}
+ #define ASJ(x, y, z) __asm {x label##y}
+#endif
+
+static void NOINLINE nh_16_func(const uint64_t *mp, const uint64_t *kp, size_t nw, uint64_t *rh, uint64_t *rl)
+{
+ // This assembly version, using MMX registers, is just as fast as the
+ // intrinsics version (which uses XMM registers) on the Intel Core 2,
+ // but is much faster on the Pentium 4. In order to schedule multiplies
+ // as early as possible, the loop interleaves operations for the current
+ // block and the next block. To mask out high 32-bits, we use "movd"
+ // to move the lower 32-bits to the stack and then back. Surprisingly,
+ // this is faster than any other method.
+#ifdef __GNUC__
+ __asm__ __volatile__
+ (
+ ".intel_syntax noprefix;"
+#else
+ AS2( mov esi, mp)
+ AS2( mov edi, kp)
+ AS2( mov ecx, nw)
+ AS2( mov eax, rl)
+ AS2( mov edx, rh)
+#endif
+ AS2( sub esp, 12)
+ AS2( movq mm6, [esi])
+ AS2( paddq mm6, [edi])
+ AS2( movq mm5, [esi+8])
+ AS2( paddq mm5, [edi+8])
+ AS2( add esi, 16)
+ AS2( add edi, 16)
+ AS2( movq mm4, mm6)
+ ASS( pshufw mm2, mm6, 1, 0, 3, 2)
+ AS2( pmuludq mm6, mm5)
+ ASS( pshufw mm3, mm5, 1, 0, 3, 2)
+ AS2( pmuludq mm5, mm2)
+ AS2( pmuludq mm2, mm3)
+ AS2( pmuludq mm3, mm4)
+ AS2( pxor mm7, mm7)
+ AS2( movd [esp], mm6)
+ AS2( psrlq mm6, 32)
+ AS2( movd [esp+4], mm5)
+ AS2( psrlq mm5, 32)
+ AS2( sub ecx, 2)
+ ASJ( jz, 1, f)
+ ASL(0)
+ AS2( movq mm0, [esi])
+ AS2( paddq mm0, [edi])
+ AS2( movq mm1, [esi+8])
+ AS2( paddq mm1, [edi+8])
+ AS2( add esi, 16)
+ AS2( add edi, 16)
+ AS2( movq mm4, mm0)
+ AS2( paddq mm5, mm2)
+ ASS( pshufw mm2, mm0, 1, 0, 3, 2)
+ AS2( pmuludq mm0, mm1)
+ AS2( movd [esp+8], mm3)
+ AS2( psrlq mm3, 32)
+ AS2( paddq mm5, mm3)
+ ASS( pshufw mm3, mm1, 1, 0, 3, 2)
+ AS2( pmuludq mm1, mm2)
+ AS2( pmuludq mm2, mm3)
+ AS2( pmuludq mm3, mm4)
+ AS2( movd mm4, [esp])
+ AS2( paddq mm7, mm4)
+ AS2( movd mm4, [esp+4])
+ AS2( paddq mm6, mm4)
+ AS2( movd mm4, [esp+8])
+ AS2( paddq mm6, mm4)
+ AS2( movd [esp], mm0)
+ AS2( psrlq mm0, 32)
+ AS2( paddq mm6, mm0)
+ AS2( movd [esp+4], mm1)
+ AS2( psrlq mm1, 32)
+ AS2( paddq mm5, mm1)
+ AS2( sub ecx, 2)
+ ASJ( jnz, 0, b)
+ ASL(1)
+ AS2( paddq mm5, mm2)
+ AS2( movd [esp+8], mm3)
+ AS2( psrlq mm3, 32)
+ AS2( paddq mm5, mm3)
+ AS2( movd mm4, [esp])
+ AS2( paddq mm7, mm4)
+ AS2( movd mm4, [esp+4])
+ AS2( paddq mm6, mm4)
+ AS2( movd mm4, [esp+8])
+ AS2( paddq mm6, mm4)
+
+ ASS( pshufw mm0, mm7, 3, 2, 1, 0)
+ AS2( psrlq mm7, 32)
+ AS2( paddq mm6, mm7)
+ AS2( punpckldq mm0, mm6)
+ AS2( psrlq mm6, 32)
+ AS2( paddq mm5, mm6)
+ AS2( movq [eax], mm0)
+ AS2( movq [edx], mm5)
+ AS2( add esp, 12)
+#ifdef __GNUC__
+ ".att_syntax prefix;"
+ :
+ : "S" (mp), "D" (kp), "c" (nw), "a" (rl), "d" (rh)
+ : "memory", "cc"
+ );
+#endif
+}
+#define nh_16(mp, kp, nw, rh, rl) nh_16_func(mp, kp, nw, &(rh), &(rl));
+
+static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
+ const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
+{
+ // This code tries to schedule the multiplies as early as possible to overcome
+ // the long latencies on the Pentium 4. It also minimizes "movq" instructions
+ // which are very expensive on the P4.
+
+#define a0 [eax+0]
+#define a1 [eax+4]
+#define a2 [ebx+0]
+#define a3 [ebx+4]
+#define k0 [ecx+0]
+#define k1 [ecx+4]
+#define k2 [edx+0]
+#define k3 [edx+4]
+
+#ifdef __GNUC__
+ uint32_t temp;
+ __asm__ __volatile__
+ (
+ "mov %%ebx, %0;"
+ "mov %1, %%ebx;"
+ ".intel_syntax noprefix;"
+#else
+ AS2( mov ebx, ahi)
+ AS2( mov edx, kh)
+ AS2( mov eax, alo)
+ AS2( mov ecx, kl)
+ AS2( mov esi, mh)
+ AS2( mov edi, ml)
+#endif
+
+ AS2( movd mm0, a3)
+ AS2( movq mm4, mm0)
+ AS2( pmuludq mm0, k3) // a3*k3
+ AS2( movd mm1, a0)
+ AS2( pmuludq mm1, k2) // a0*k2
+ AS2( movd mm2, a1)
+ AS2( movd mm6, k1)
+ AS2( pmuludq mm2, mm6) // a1*k1
+ AS2( movd mm3, a2)
+ AS2( movq mm5, mm3)
+ AS2( movd mm7, k0)
+ AS2( pmuludq mm3, mm7) // a2*k0
+ AS2( pmuludq mm4, mm7) // a3*k0
+ AS2( pmuludq mm5, mm6) // a2*k1
+ AS2( psllq mm0, 1)
+ AS2( paddq mm0, [esi])
+ AS2( paddq mm0, mm1)
+ AS2( movd mm1, a1)
+ AS2( paddq mm4, mm5)
+ AS2( movq mm5, mm1)
+ AS2( pmuludq mm1, k2) // a1*k2
+ AS2( paddq mm0, mm2)
+ AS2( movd mm2, a0)
+ AS2( paddq mm0, mm3)
+ AS2( movq mm3, mm2)
+ AS2( pmuludq mm2, k3) // a0*k3
+ AS2( pmuludq mm3, mm7) // a0*k0
+ AS2( movd esi, mm0)
+ AS2( psrlq mm0, 32)
+ AS2( pmuludq mm7, mm5) // a1*k0
+ AS2( pmuludq mm5, k3) // a1*k3
+ AS2( paddq mm0, mm1)
+ AS2( movd mm1, a2)
+ AS2( pmuludq mm1, k2) // a2*k2
+ AS2( paddq mm0, mm2)
+ AS2( paddq mm0, mm4)
+ AS2( movq mm4, mm0)
+ AS2( movd mm2, a3)
+ AS2( pmuludq mm2, mm6) // a3*k1
+ AS2( pmuludq mm6, a0) // a0*k1
+ AS2( psrlq mm0, 31)
+ AS2( paddq mm0, mm3)
+ AS2( movd mm3, [edi])
+ AS2( paddq mm0, mm3)
+ AS2( movd mm3, a2)
+ AS2( pmuludq mm3, k3) // a2*k3
+ AS2( paddq mm5, mm1)
+ AS2( movd mm1, a3)
+ AS2( pmuludq mm1, k2) // a3*k2
+ AS2( paddq mm5, mm2)
+ AS2( movd mm2, [edi+4])
+ AS2( psllq mm5, 1)
+ AS2( paddq mm0, mm5)
+ AS2( movq mm5, mm0)
+ AS2( psllq mm4, 33)
+ AS2( psrlq mm0, 32)
+ AS2( paddq mm6, mm7)
+ AS2( movd mm7, esi)
+ AS2( paddq mm0, mm6)
+ AS2( paddq mm0, mm2)
+ AS2( paddq mm3, mm1)
+ AS2( psllq mm3, 1)
+ AS2( paddq mm0, mm3)
+ AS2( psrlq mm4, 1)
+ AS2( punpckldq mm5, mm0)
+ AS2( psrlq mm0, 32)
+ AS2( por mm4, mm7)
+ AS2( paddq mm0, mm4)
+ AS2( movq a0, mm5)
+ AS2( movq a2, mm0)
+#ifdef __GNUC__
+ ".att_syntax prefix;"
+ "mov %0, %%ebx;"
+ : "=m" (temp)
+ : "m" (ahi), "D" (ml), "d" (kh), "a" (alo), "S" (mh), "c" (kl)
+ : "memory", "cc"
+ );
+#endif
+
+
+#undef a0
+#undef a1
+#undef a2
+#undef a3
+#undef k0
+#undef k1
+#undef k2
+#undef k3
+}
+
+#define poly_step(ah, al, kh, kl, mh, ml) \
+ poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))
+
+/* ----------------------------------------------------------------------- */
+#else /* not VMAC_ARCH_64 and not SSE2 */
+/* ----------------------------------------------------------------------- */
+
+#ifndef nh_16
+#define nh_16(mp, kp, nw, rh, rl) \
+{ uint64_t t1,t2,m1,m2,t; \
+ int i; \
+ rh = rl = t = 0; \
+ for (i = 0; i < nw; i+=2) { \
+ t1 = get64PE(mp+i) + kp[i]; \
+ t2 = get64PE(mp+i+1) + kp[i+1]; \
+ m2 = MUL32(t1 >> 32, t2); \
+ m1 = MUL32(t1, t2 >> 32); \
+ ADD128(rh,rl,MUL32(t1 >> 32,t2 >> 32),MUL32(t1,t2)); \
+ rh += (uint64_t)(uint32_t)(m1 >> 32) + (uint32_t)(m2 >> 32); \
+ t += (uint64_t)(uint32_t)m1 + (uint32_t)m2; \
+ } \
+ ADD128(rh,rl,(t >> 32),(t << 32)); \
+}
+#endif
+
+static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
+ const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
+{
+
+#if VMAC_ARCH_BIG_ENDIAN
+#define INDEX_HIGH 0
+#define INDEX_LOW 1
+#else
+#define INDEX_HIGH 1
+#define INDEX_LOW 0
+#endif
+
+#define a0 *(((uint32_t*)alo)+INDEX_LOW)
+#define a1 *(((uint32_t*)alo)+INDEX_HIGH)
+#define a2 *(((uint32_t*)ahi)+INDEX_LOW)
+#define a3 *(((uint32_t*)ahi)+INDEX_HIGH)
+#define k0 *(((uint32_t*)kl)+INDEX_LOW)
+#define k1 *(((uint32_t*)kl)+INDEX_HIGH)
+#define k2 *(((uint32_t*)kh)+INDEX_LOW)
+#define k3 *(((uint32_t*)kh)+INDEX_HIGH)
+
+ uint64_t p, q, t;
+ uint32_t t2;
+
+ p = MUL32(a3, k3);
+ p += p;
+ p += *(uint64_t *)mh;
+ p += MUL32(a0, k2);
+ p += MUL32(a1, k1);
+ p += MUL32(a2, k0);
+ t = (uint32_t)(p);
+ p >>= 32;
+ p += MUL32(a0, k3);
+ p += MUL32(a1, k2);
+ p += MUL32(a2, k1);
+ p += MUL32(a3, k0);
+ t |= ((uint64_t)((uint32_t)p & 0x7fffffff)) << 32;
+ p >>= 31;
+ p += (uint64_t)(((uint32_t*)ml)[INDEX_LOW]);
+ p += MUL32(a0, k0);
+ q = MUL32(a1, k3);
+ q += MUL32(a2, k2);
+ q += MUL32(a3, k1);
+ q += q;
+ p += q;
+ t2 = (uint32_t)(p);
+ p >>= 32;
+ p += (uint64_t)(((uint32_t*)ml)[INDEX_HIGH]);
+ p += MUL32(a0, k1);
+ p += MUL32(a1, k0);
+ q = MUL32(a2, k3);
+ q += MUL32(a3, k2);
+ q += q;
+ p += q;
+ *(uint64_t *)(alo) = (p << 32) | t2;
+ p >>= 32;
+ *(uint64_t *)(ahi) = p + t;
+
+#undef a0
+#undef a1
+#undef a2
+#undef a3
+#undef k0
+#undef k1
+#undef k2
+#undef k3
+}
+
+#define poly_step(ah, al, kh, kl, mh, ml) \
+ poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))
+
+/* ----------------------------------------------------------------------- */
+#endif /* end of specialized NH and poly definitions */
+/* ----------------------------------------------------------------------- */
+
+/* At least nh_16 is defined. Define others as needed here */
+#ifndef nh_16_2
+#define nh_16_2(mp, kp, nw, rh, rl, rh2, rl2) \
+ nh_16(mp, kp, nw, rh, rl); \
+ nh_16(mp, ((kp)+2), nw, rh2, rl2);
+#endif
+#ifndef nh_vmac_nhbytes
+#define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \
+ nh_16(mp, kp, nw, rh, rl)
+#endif
+#ifndef nh_vmac_nhbytes_2
+#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh2, rl2) \
+ nh_vmac_nhbytes(mp, kp, nw, rh, rl); \
+ nh_vmac_nhbytes(mp, ((kp)+2), nw, rh2, rl2);
+#endif
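+
+/* Illustrative sketch, not used by the implementation: how the L1 NH hash
+ * is invoked on a single 16-byte chunk (two 64-bit words), mirroring the
+ * remainder handling in vhash() below.  The buffers msg and key are
+ * hypothetical.  Guarded out of the build. */
+#if 0
+static void nh_16_sketch(void)
+{
+    uint64_t msg[2] = { 1, 2 };   /* 16 bytes of message (two words)   */
+    uint64_t key[2] = { 3, 4 };   /* 16 bytes of NH key material       */
+    uint64_t rh, rl;
+
+    nh_16(msg, key, 2, rh, rl);   /* (rh,rl) = 128-bit NH(msg, key)    */
+    rh &= m62;                    /* VHASH keeps only the low 126 bits */
+}
+#endif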
+
+/* ----------------------------------------------------------------------- */
+
+void vhash_abort(vmac_ctx_t *ctx)
+{
+ ctx->polytmp[0] = ctx->polykey[0] ;
+ ctx->polytmp[1] = ctx->polykey[1] ;
+ #if (VMAC_TAG_LEN == 128)
+ ctx->polytmp[2] = ctx->polykey[2] ;
+ ctx->polytmp[3] = ctx->polykey[3] ;
+ #endif
+ ctx->first_block_processed = 0;
+}
+
+/* ----------------------------------------------------------------------- */
+static uint64_t l3hash(uint64_t p1, uint64_t p2,
+ uint64_t k1, uint64_t k2, uint64_t len)
+{
+ uint64_t rh, rl, t, z=0;
+
+ /* fully reduce (p1,p2)+(len,0) mod p127 */
+ t = p1 >> 63;
+ p1 &= m63;
+ ADD128(p1, p2, len, t);
+ /* At this point, (p1,p2) is at most 2^127+(len<<64) */
+ t = (p1 > m63) + ((p1 == m63) && (p2 == m64));
+ ADD128(p1, p2, z, t);
+ p1 &= m63;
+
+ /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
+ t = p1 + (p2 >> 32);
+ t += (t >> 32);
+ t += (uint32_t)t > 0xfffffffeu;
+ p1 += (t >> 32);
+ p2 += (p1 << 32);
+
+ /* compute (p1+k1)%p64 and (p2+k2)%p64 */
+ p1 += k1;
+ p1 += (0 - (p1 < k1)) & 257;
+ p2 += k2;
+ p2 += (0 - (p2 < k2)) & 257;
+
+ /* compute (p1+k1)*(p2+k2)%p64 */
+ MUL64(rh, rl, p1, p2);
+ t = rh >> 56;
+ ADD128(t, rl, z, rh);
+ rh <<= 8;
+ ADD128(t, rl, z, rh);
+ t += t << 8;
+ rl += t;
+ rl += (0 - (rl < t)) & 257;
+ rl += (0 - (rl > p64-1)) & 257;
+ return rl;
+}
+
+/* ----------------------------------------------------------------------- */
+
+void vhash_update(unsigned char *m,
+                  unsigned int mbytes, /* Positive multiple of VMAC_NHBYTES */
+ vmac_ctx_t *ctx)
+{
+ uint64_t rh, rl, *mptr;
+ const uint64_t *kptr = (uint64_t *)ctx->nhkey;
+ int i;
+ uint64_t ch, cl;
+ uint64_t pkh = ctx->polykey[0];
+ uint64_t pkl = ctx->polykey[1];
+ #if (VMAC_TAG_LEN == 128)
+ uint64_t ch2, cl2, rh2, rl2;
+ uint64_t pkh2 = ctx->polykey[2];
+ uint64_t pkl2 = ctx->polykey[3];
+ #endif
+
+ mptr = (uint64_t *)m;
+ i = mbytes / VMAC_NHBYTES; /* Must be non-zero */
+
+ ch = ctx->polytmp[0];
+ cl = ctx->polytmp[1];
+ #if (VMAC_TAG_LEN == 128)
+ ch2 = ctx->polytmp[2];
+ cl2 = ctx->polytmp[3];
+ #endif
+
+ if ( ! ctx->first_block_processed) {
+ ctx->first_block_processed = 1;
+ #if (VMAC_TAG_LEN == 64)
+ nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
+ #else
+ nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
+ rh2 &= m62;
+ ADD128(ch2,cl2,rh2,rl2);
+ #endif
+ rh &= m62;
+ ADD128(ch,cl,rh,rl);
+ mptr += (VMAC_NHBYTES/sizeof(uint64_t));
+ i--;
+ }
+
+ while (i--) {
+ #if (VMAC_TAG_LEN == 64)
+ nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
+ #else
+ nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
+ rh2 &= m62;
+ poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
+ #endif
+ rh &= m62;
+ poly_step(ch,cl,pkh,pkl,rh,rl);
+ mptr += (VMAC_NHBYTES/sizeof(uint64_t));
+ }
+
+ ctx->polytmp[0] = ch;
+ ctx->polytmp[1] = cl;
+ #if (VMAC_TAG_LEN == 128)
+ ctx->polytmp[2] = ch2;
+ ctx->polytmp[3] = cl2;
+ #endif
+ #if VMAC_USE_SSE2
+ _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
+ #endif
+}
+
+/* ----------------------------------------------------------------------- */
+
+uint64_t xvhash(unsigned char m[],
+ unsigned int mbytes,
+ uint64_t *tagl,
+ vmac_ctx_t *ctx)
+{
+ uint64_t ch, cl, rh, rl, *mptr;
+ #if (VMAC_TAG_LEN == 128)
+ uint64_t ch2, cl2, rh2, rl2;
+ #endif
+ const uint64_t *kptr = (uint64_t *)ctx->nhkey;
+ int i, remaining;
+ (void)tagl;
+
+ remaining = mbytes % VMAC_NHBYTES;
+ i = mbytes-remaining;
+ mptr = (uint64_t *)(m+i);
+ if (i) vhash_update(m,i,ctx);
+
+ ch = ctx->polytmp[0];
+ cl = ctx->polytmp[1];
+ #if (VMAC_TAG_LEN == 128)
+ ch2 = ctx->polytmp[2];
+ cl2 = ctx->polytmp[3];
+ #endif
+
+ if (remaining) {
+ #if (VMAC_TAG_LEN == 128)
+ nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2);
+ rh2 &= m62;
+ #else
+ nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl);
+ #endif
+ rh &= m62;
+ if (i) {
+ poly_step(ch,cl,ctx->polykey[0],ctx->polykey[1],rh,rl);
+ #if (VMAC_TAG_LEN == 128)
+ poly_step(ch2,cl2,ctx->polykey[2],ctx->polykey[3],rh2,rl2);
+ #endif
+ } else {
+ ADD128(ch,cl,rh,rl);
+ #if (VMAC_TAG_LEN == 128)
+ ADD128(ch2,cl2,rh2,rl2);
+ #endif
+ }
+ }
+
+ #if VMAC_USE_SSE2
+ _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
+ #endif
+ vhash_abort(ctx);
+ remaining *= 8;
+#if (VMAC_TAG_LEN == 128)
+ *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining);
+#endif
+ return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining);
+}
+
+uint64_t vhash(unsigned char m[],
+ unsigned int mbytes,
+ uint64_t *tagl,
+ vmac_ctx_t *ctx)
+{
+ uint64_t rh, rl, *mptr;
+ const uint64_t *kptr = (uint64_t *)ctx->nhkey;
+ int i, remaining;
+ uint64_t ch, cl;
+ uint64_t pkh = ctx->polykey[0];
+ uint64_t pkl = ctx->polykey[1];
+ #if (VMAC_TAG_LEN == 128)
+ uint64_t ch2, cl2, rh2, rl2;
+ uint64_t pkh2 = ctx->polykey[2];
+ uint64_t pkl2 = ctx->polykey[3];
+ #endif
+ (void)tagl;
+
+ mptr = (uint64_t *)m;
+ i = mbytes / VMAC_NHBYTES;
+ remaining = mbytes % VMAC_NHBYTES;
+
+ if (ctx->first_block_processed)
+ {
+ ch = ctx->polytmp[0];
+ cl = ctx->polytmp[1];
+ #if (VMAC_TAG_LEN == 128)
+ ch2 = ctx->polytmp[2];
+ cl2 = ctx->polytmp[3];
+ #endif
+ }
+ else if (i)
+ {
+ #if (VMAC_TAG_LEN == 64)
+ nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,ch,cl);
+ #else
+ nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,ch,cl,ch2,cl2);
+ ch2 &= m62;
+ ADD128(ch2,cl2,pkh2,pkl2);
+ #endif
+ ch &= m62;
+ ADD128(ch,cl,pkh,pkl);
+ mptr += (VMAC_NHBYTES/sizeof(uint64_t));
+ i--;
+ }
+ else if (remaining)
+ {
+ #if (VMAC_TAG_LEN == 64)
+ nh_16(mptr,kptr,2*((remaining+15)/16),ch,cl);
+ #else
+ nh_16_2(mptr,kptr,2*((remaining+15)/16),ch,cl,ch2,cl2);
+ ch2 &= m62;
+ ADD128(ch2,cl2,pkh2,pkl2);
+ #endif
+ ch &= m62;
+ ADD128(ch,cl,pkh,pkl);
+ mptr += (VMAC_NHBYTES/sizeof(uint64_t));
+ goto do_l3;
+ }
+ else /* Empty String */
+ {
+ ch = pkh; cl = pkl;
+ #if (VMAC_TAG_LEN == 128)
+ ch2 = pkh2; cl2 = pkl2;
+ #endif
+ goto do_l3;
+ }
+
+ while (i--) {
+ #if (VMAC_TAG_LEN == 64)
+ nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
+ #else
+ nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
+ rh2 &= m62;
+ poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
+ #endif
+ rh &= m62;
+ poly_step(ch,cl,pkh,pkl,rh,rl);
+ mptr += (VMAC_NHBYTES/sizeof(uint64_t));
+ }
+ if (remaining) {
+ #if (VMAC_TAG_LEN == 64)
+ nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl);
+ #else
+ nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2);
+ rh2 &= m62;
+ poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
+ #endif
+ rh &= m62;
+ poly_step(ch,cl,pkh,pkl,rh,rl);
+ }
+
+do_l3:
+ #if VMAC_USE_SSE2
+ _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
+ #endif
+ vhash_abort(ctx);
+ remaining *= 8;
+#if (VMAC_TAG_LEN == 128)
+ *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining);
+#endif
+ return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining);
+}
+
+/* ----------------------------------------------------------------------- */
+
+uint64_t vmac(unsigned char m[],
+ unsigned int mbytes,
+ unsigned char n[16],
+ uint64_t *tagl,
+ vmac_ctx_t *ctx)
+{
+#if (VMAC_TAG_LEN == 64)
+ uint64_t *in_n, *out_p;
+ uint64_t p, h;
+ int i;
+ (void)tagl;
+
+ #if VMAC_CACHE_NONCES
+ in_n = ctx->cached_nonce;
+ out_p = ctx->cached_aes;
+ #else
+ uint64_t tmp[2];
+ in_n = out_p = tmp;
+ #endif
+
+ i = n[15] & 1;
+ #if VMAC_CACHE_NONCES
+ if ((*(uint64_t *)(n+8) != in_n[1]) ||
+ (*(uint64_t *)(n ) != in_n[0])) {
+ #endif
+
+ in_n[0] = *(uint64_t *)(n );
+ in_n[1] = *(uint64_t *)(n+8);
+ ((unsigned char *)in_n)[15] &= 0xFE;
+ aes_encryption(in_n, out_p, &ctx->cipher_key);
+
+ #if VMAC_CACHE_NONCES
+ ((unsigned char *)in_n)[15] |= (unsigned char)(1-i);
+ }
+ #endif
+ p = get64BE(out_p + i);
+ h = vhash(m, mbytes, (uint64_t *)0, ctx);
+ return p + h;
+#else
+ uint64_t tmp[2];
+ uint64_t th,tl;
+ aes_encryption(n, (unsigned char *)tmp, &ctx->cipher_key);
+ th = vhash(m, mbytes, &tl, ctx);
+ th += get64BE(tmp);
+ *tagl = tl + get64BE(tmp+1);
+ return th;
+#endif
+}
+
+/* ----------------------------------------------------------------------- */
+
+void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx)
+{
+ uint64_t in[2] = {0}, out[2];
+ unsigned i;
+ aes_key_setup(user_key, &ctx->cipher_key);
+
+ /* Fill nh key */
+ ((unsigned char *)in)[0] = 0x80;
+ for (i = 0; i < sizeof(ctx->nhkey)/8; i+=2) {
+ aes_encryption((unsigned char *)in, (unsigned char *)out,
+ &ctx->cipher_key);
+ ctx->nhkey[i ] = get64BE(out);
+ ctx->nhkey[i+1] = get64BE(out+1);
+ ((unsigned char *)in)[15] += 1;
+ }
+
+ /* Fill poly key */
+ ((unsigned char *)in)[0] = 0xC0;
+ in[1] = 0;
+ for (i = 0; i < sizeof(ctx->polykey)/8; i+=2) {
+ aes_encryption((unsigned char *)in, (unsigned char *)out,
+ &ctx->cipher_key);
+ ctx->polytmp[i ] = ctx->polykey[i ] = get64BE(out) & mpoly;
+ ctx->polytmp[i+1] = ctx->polykey[i+1] = get64BE(out+1) & mpoly;
+ ((unsigned char *)in)[15] += 1;
+ }
+
+ /* Fill ip key */
+ ((unsigned char *)in)[0] = 0xE0;
+ in[1] = 0;
+ for (i = 0; i < sizeof(ctx->l3key)/8; i+=2) {
+ do {
+ aes_encryption((unsigned char *)in, (unsigned char *)out,
+ &ctx->cipher_key);
+ ctx->l3key[i ] = get64BE(out);
+ ctx->l3key[i+1] = get64BE(out+1);
+ ((unsigned char *)in)[15] += 1;
+ } while (ctx->l3key[i] >= p64 || ctx->l3key[i+1] >= p64);
+ }
+
+ /* Invalidate nonce/aes cache and reset other elements */
+ #if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES)
+ ctx->cached_nonce[0] = (uint64_t)-1; /* Ensure illegal nonce */
+ ctx->cached_nonce[1] = (uint64_t)0; /* Ensure illegal nonce */
+ #endif
+ ctx->first_block_processed = 0;
+}
+
+/* ----------------------------------------------------------------------- */
+
+
+#if VMAC_RUN_TESTS
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <string.h>
+
+unsigned prime(void) /* Wake variable-speed CPU, get a rough speed estimate */
+{
+ volatile uint64_t i;
+ volatile uint64_t j=1;
+ unsigned cnt=0;
+ volatile clock_t ticks = clock();
+ do {
+ for (i = 0; i < 500000; i++) {
+ uint64_t x = get64PE(&j);
+ j = x * x + (uint64_t)ticks;
+ }
+ cnt++;
+ } while (clock() - ticks < (CLOCKS_PER_SEC/2));
+ return cnt; /* cnt is millions of iterations per second */
+}
+
+int main(void)
+{
+ ALIGN(16) vmac_ctx_t ctx, ctx_aio, ctx_inc1, ctx_inc2;
+ uint64_t res, tagl;
+ void *p;
+ unsigned char *m;
+ ALIGN(4) unsigned char key[] = "abcdefghijklmnop";
+ ALIGN(4) unsigned char nonce[] = "\0\0\0\0\0\0\0\0bcdefghi";
+ unsigned int vector_lengths[] = {0,3,48,300,3000000};
+ #if (VMAC_TAG_LEN == 64)
+ ALIGN(4) char *should_be[] = {"2576BE1C56D8B81B","2D376CF5B1813CE5",
+ "E8421F61D573D298","4492DF6C5CAC1BBE",
+ "09BA597DD7601113"};
+ #else
+ ALIGN(4) char *should_be[] = {"472766C70F74ED23481D6D7DE4E80DAC",
+ "4EE815A06A1D71EDD36FC75D51188A42",
+ "09F2C80C8E1007A0C12FAE19FE4504AE",
+ "66438817154850C61D8A412164803BCB",
+ "2B6B02288FFC461B75485DE893C629DC"};
+ #endif
+ unsigned speed_lengths[] = {16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+ unsigned i, j, *speed_iters;
+ clock_t ticks;
+ double cpb;
+ const unsigned int buf_len = 3 * (1 << 20);
+
+ j = prime();
+ i = sizeof(speed_lengths)/sizeof(speed_lengths[0]);
+ speed_iters = (unsigned *)malloc(i*sizeof(speed_iters[0]));
+ speed_iters[i-1] = j * (1 << 12);
+ while (--i) speed_iters[i-1] = (unsigned)(1.3 * speed_iters[i]);
+
+ /* Initialize context and message buffer, all 16-byte aligned */
+ p = malloc(buf_len + 32);
+ m = (unsigned char *)(((size_t)p + 16) & ~((size_t)15));
+ memset(m, 0, buf_len + 16);
+ vmac_set_key(key, &ctx);
+
+ /* Test incremental and all-in-one interfaces for correctness */
+ vmac_set_key(key, &ctx_aio);
+ vmac_set_key(key, &ctx_inc1);
+ vmac_set_key(key, &ctx_inc2);
+
+
+ /*
+ for (i = 0; i <= 512; i++) {
+ vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
+ tagh = vmac(m+(i/VMAC_NHBYTES)*VMAC_NHBYTES, i%VMAC_NHBYTES,
+ nonce, &tagl, &ctx);
+ vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
+ for (j = 0; j < vector_lengths[i]; j++)
+ m[j] = (unsigned char)('a'+j%3);
+
+ }
+ */
+
+ /* Generate vectors */
+ for (i = 0; i < sizeof(vector_lengths)/sizeof(unsigned int); i++) {
+ for (j = 0; j < vector_lengths[i]; j++)
+ m[j] = (unsigned char)('a'+j%3);
+ res = vmac(m, vector_lengths[i], nonce, &tagl, &ctx);
+ #if (VMAC_TAG_LEN == 64)
+ printf("\'abc\' * %7u: %016llX Should be: %s\n",
+ vector_lengths[i]/3,res,should_be[i]);
+ #else
+ printf("\'abc\' * %7u: %016llX%016llX\nShould be : %s\n",
+ vector_lengths[i]/3,res,tagl,should_be[i]);
+ #endif
+ }
+
+ /* Speed test */
+ for (i = 0; i < sizeof(speed_lengths)/sizeof(unsigned int); i++) {
+ ticks = clock();
+ for (j = 0; j < speed_iters[i]; j++) {
+ #if HASH_ONLY
+ res = vhash(m, speed_lengths[i], &tagl, &ctx);
+ #else
+ res = vmac(m, speed_lengths[i], nonce, &tagl, &ctx);
+ nonce[7]++;
+ #endif
+ }
+ ticks = clock() - ticks;
+ cpb = ((ticks*VMAC_HZ)/
+ ((double)CLOCKS_PER_SEC*speed_lengths[i]*speed_iters[i]));
+ printf("%4u bytes, %2.2f cpb\n", speed_lengths[i], cpb);
+ }
+ return 1;
+}
+
+#endif