From e98c8b548b279e7f7067fa09619aca28659a02e3 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 9 Feb 2012 11:33:29 +0000 Subject: arm: bit manipulation, copy and division libraries Bit manipulation, division and memcpy & friends implementations for the ARM architecture, shamelessly taken from Linux. Signed-off-by: Stefano Stabellini Signed-off-by: Ian Campbell Signed-off-by: Tim Deegan Committed-by: Ian Campbell --- xen/arch/arm/lib/Makefile | 5 + xen/arch/arm/lib/assembler.h | 49 +++++ xen/arch/arm/lib/bitops.h | 87 +++++++++ xen/arch/arm/lib/changebit.S | 18 ++ xen/arch/arm/lib/clearbit.S | 19 ++ xen/arch/arm/lib/copy_template.S | 267 +++++++++++++++++++++++++++ xen/arch/arm/lib/div64.S | 211 +++++++++++++++++++++ xen/arch/arm/lib/findbit.S | 198 ++++++++++++++++++++ xen/arch/arm/lib/lib1funcs.S | 389 +++++++++++++++++++++++++++++++++++++++ xen/arch/arm/lib/memcpy.S | 63 +++++++ xen/arch/arm/lib/memmove.S | 200 ++++++++++++++++++++ xen/arch/arm/lib/memset.S | 129 +++++++++++++ xen/arch/arm/lib/memzero.S | 127 +++++++++++++ xen/arch/arm/lib/setbit.S | 18 ++ xen/arch/arm/lib/testchangebit.S | 18 ++ xen/arch/arm/lib/testclearbit.S | 18 ++ xen/arch/arm/lib/testsetbit.S | 18 ++ xen/include/asm-arm/config.h | 3 + 18 files changed, 1837 insertions(+) create mode 100644 xen/arch/arm/lib/Makefile create mode 100644 xen/arch/arm/lib/assembler.h create mode 100644 xen/arch/arm/lib/bitops.h create mode 100644 xen/arch/arm/lib/changebit.S create mode 100644 xen/arch/arm/lib/clearbit.S create mode 100644 xen/arch/arm/lib/copy_template.S create mode 100644 xen/arch/arm/lib/div64.S create mode 100644 xen/arch/arm/lib/findbit.S create mode 100644 xen/arch/arm/lib/lib1funcs.S create mode 100644 xen/arch/arm/lib/memcpy.S create mode 100644 xen/arch/arm/lib/memmove.S create mode 100644 xen/arch/arm/lib/memset.S create mode 100644 xen/arch/arm/lib/memzero.S create mode 100644 xen/arch/arm/lib/setbit.S create mode 100644 xen/arch/arm/lib/testchangebit.S create mode 100644 xen/arch/arm/lib/testclearbit.S create mode 100644 xen/arch/arm/lib/testsetbit.S diff --git a/xen/arch/arm/lib/Makefile b/xen/arch/arm/lib/Makefile new file mode 100644 index 0000000000..cbbed682f7 --- /dev/null +++ b/xen/arch/arm/lib/Makefile @@ -0,0 +1,5 @@ +obj-y += memcpy.o memmove.o memset.o memzero.o +obj-y += findbit.o setbit.o +obj-y += setbit.o clearbit.o changebit.o +obj-y += testsetbit.o testclearbit.o testchangebit.o +obj-y += lib1funcs.o div64.o diff --git a/xen/arch/arm/lib/assembler.h b/xen/arch/arm/lib/assembler.h new file mode 100644 index 0000000000..f8f0961a00 --- /dev/null +++ b/xen/arch/arm/lib/assembler.h @@ -0,0 +1,49 @@ +#ifndef __ARCH_ARM_LIB_ASSEMBLER_H__ +#define __ARCH_ARM_LIB_ASSEMBLER_H__ + +/* From Linux arch/arm/include/asm/assembler.h */ +/* + * Data preload for architectures that support it + */ +#define PLD(code...) code + +/* + * This can be used to enable code to cacheline align the destination + * pointer when bulk writing to memory. Experiments on StrongARM and + * XScale didn't show this a worthwhile thing to do when the cache is not + * set to write-allocate (this would need further testing on XScale when WA + * is used). + * + * On Feroceon there is much to gain however, regardless of cache mode. + */ +#ifdef CONFIG_CPU_FEROCEON /* Not in Xen... */ +#define CALGN(code...) code +#else +#define CALGN(code...) +#endif + +// No Thumb, hence: +#define W(instr) instr +#define ARM(instr...) instr +#define THUMB(instr...) + +#ifdef CONFIG_ARM_UNWIND +#define UNWIND(code...) 
code +#else +#define UNWIND(code...) +#endif + +#define pull lsl +#define push lsr +#define get_byte_0 lsr #24 +#define get_byte_1 lsr #16 +#define get_byte_2 lsr #8 +#define get_byte_3 lsl #0 +#define put_byte_0 lsl #24 +#define put_byte_1 lsl #16 +#define put_byte_2 lsl #8 +#define put_byte_3 lsl #0 + +#define smp_dmb dmb + +#endif /* __ARCH_ARM_LIB_ASSEMBLER_H__ */ diff --git a/xen/arch/arm/lib/bitops.h b/xen/arch/arm/lib/bitops.h new file mode 100644 index 0000000000..77c7909986 --- /dev/null +++ b/xen/arch/arm/lib/bitops.h @@ -0,0 +1,87 @@ +#include + +#if __LINUX_ARM_ARCH__ >= 6 + .macro bitop, instr + ands ip, r1, #3 + strneb r1, [ip] @ assert word-aligned + mov r2, #1 + and r3, r0, #31 @ Get bit offset + mov r0, r0, lsr #5 + add r1, r1, r0, lsl #2 @ Get word offset + mov r3, r2, lsl r3 +1: ldrex r2, [r1] + \instr r2, r2, r3 + strex r0, r2, [r1] + cmp r0, #0 + bne 1b + bx lr + .endm + + .macro testop, instr, store + ands ip, r1, #3 + strneb r1, [ip] @ assert word-aligned + mov r2, #1 + and r3, r0, #31 @ Get bit offset + mov r0, r0, lsr #5 + add r1, r1, r0, lsl #2 @ Get word offset + mov r3, r2, lsl r3 @ create mask + smp_dmb +1: ldrex r2, [r1] + ands r0, r2, r3 @ save old value of bit + \instr r2, r2, r3 @ toggle bit + strex ip, r2, [r1] + cmp ip, #0 + bne 1b + smp_dmb + cmp r0, #0 + movne r0, #1 +2: bx lr + .endm +#else + .macro bitop, name, instr +ENTRY( \name ) +UNWIND( .fnstart ) + ands ip, r1, #3 + strneb r1, [ip] @ assert word-aligned + and r2, r0, #31 + mov r0, r0, lsr #5 + mov r3, #1 + mov r3, r3, lsl r2 + save_and_disable_irqs ip + ldr r2, [r1, r0, lsl #2] + \instr r2, r2, r3 + str r2, [r1, r0, lsl #2] + restore_irqs ip + mov pc, lr +UNWIND( .fnend ) +ENDPROC(\name ) + .endm + +/** + * testop - implement a test_and_xxx_bit operation. + * @instr: operational instruction + * @store: store instruction + * + * Note: we can trivially conditionalise the store instruction + * to avoid dirtying the data cache. + */ + .macro testop, name, instr, store +ENTRY( \name ) +UNWIND( .fnstart ) + ands ip, r1, #3 + strneb r1, [ip] @ assert word-aligned + and r3, r0, #31 + mov r0, r0, lsr #5 + save_and_disable_irqs ip + ldr r2, [r1, r0, lsl #2]! + mov r0, #1 + tst r2, r0, lsl r3 + \instr r2, r2, r0, lsl r3 + \store r2, [r1] + moveq r0, #0 + restore_irqs ip + mov pc, lr +UNWIND( .fnend ) +ENDPROC(\name ) + .endm +#endif diff --git a/xen/arch/arm/lib/changebit.S b/xen/arch/arm/lib/changebit.S new file mode 100644 index 0000000000..624d6b5b31 --- /dev/null +++ b/xen/arch/arm/lib/changebit.S @@ -0,0 +1,18 @@ +/* + * linux/arch/arm/lib/changebit.S + * + * Copyright (C) 1995-1996 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include + +#include "assembler.h" +#include "bitops.h" + .text + +ENTRY(_change_bit) + bitop eor +ENDPROC(_change_bit) diff --git a/xen/arch/arm/lib/clearbit.S b/xen/arch/arm/lib/clearbit.S new file mode 100644 index 0000000000..f406d9af08 --- /dev/null +++ b/xen/arch/arm/lib/clearbit.S @@ -0,0 +1,19 @@ +/* + * linux/arch/arm/lib/clearbit.S + * + * Copyright (C) 1995-1996 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include + +#include "assembler.h" +#include "bitops.h" + .text + +ENTRY(_clear_bit) + bitop bic +ENDPROC(_clear_bit) diff --git a/xen/arch/arm/lib/copy_template.S b/xen/arch/arm/lib/copy_template.S new file mode 100644 index 0000000000..5adf2bc737 --- /dev/null +++ b/xen/arch/arm/lib/copy_template.S @@ -0,0 +1,267 @@ +/* + * linux/arch/arm/lib/copy_template.s + * + * Code template for optimized memory copy functions + * + * Author: Nicolas Pitre + * Created: Sep 28, 2005 + * Copyright: MontaVista Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Theory of operation + * ------------------- + * + * This file provides the core code for a forward memory copy used in + * the implementation of memcopy(), copy_to_user() and copy_from_user(). + * + * The including file must define the following accessor macros + * according to the need of the given function: + * + * ldr1w ptr reg abort + * + * This loads one word from 'ptr', stores it in 'reg' and increments + * 'ptr' to the next word. The 'abort' argument is used for fixup tables. + * + * ldr4w ptr reg1 reg2 reg3 reg4 abort + * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + * + * This loads four or eight words starting from 'ptr', stores them + * in provided registers and increments 'ptr' past those words. + * The'abort' argument is used for fixup tables. + * + * ldr1b ptr reg cond abort + * + * Similar to ldr1w, but it loads a byte and increments 'ptr' one byte. + * It also must apply the condition code if provided, otherwise the + * "al" condition is assumed by default. + * + * str1w ptr reg abort + * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + * str1b ptr reg cond abort + * + * Same as their ldr* counterparts, but data is stored to 'ptr' location + * rather than being loaded. + * + * enter reg1 reg2 + * + * Preserve the provided registers on the stack plus any additional + * data as needed by the implementation including this code. Called + * upon code entry. + * + * exit reg1 reg2 + * + * Restore registers with the values previously saved with the + * 'preserv' macro. Called upon code termination. 
+ * + * LDR1W_SHIFT + * STR1W_SHIFT + * + * Correction to be applied to the "ip" register when branching into + * the ldr1w or str1w instructions (some of these macros may expand to + * than one 32bit instruction in Thumb-2) + */ + + + enter r4, lr + + subs r2, r2, #4 + blt 8f + ands ip, r0, #3 + PLD( pld [r1, #0] ) + bne 9f + ands ip, r1, #3 + bne 10f + +1: subs r2, r2, #(28) + stmfd sp!, {r5 - r8} + blt 5f + + CALGN( ands ip, r0, #31 ) + CALGN( rsb r3, ip, #32 ) + CALGN( sbcnes r4, r3, r2 ) @ C is always set here + CALGN( bcs 2f ) + CALGN( adr r4, 6f ) + CALGN( subs r2, r2, r3 ) @ C gets set + CALGN( add pc, r4, ip ) + + PLD( pld [r1, #0] ) +2: PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 4f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +3: PLD( pld [r1, #124] ) +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + subs r2, r2, #32 + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + bge 3b + PLD( cmn r2, #96 ) + PLD( bge 4b ) + +5: ands ip, r2, #28 + rsb ip, ip, #32 +#if LDR1W_SHIFT > 0 + lsl ip, ip, #LDR1W_SHIFT +#endif + addne pc, pc, ip @ C is always clear here + b 7f +6: + .rept (1 << LDR1W_SHIFT) + W(nop) + .endr + ldr1w r1, r3, abort=20f + ldr1w r1, r4, abort=20f + ldr1w r1, r5, abort=20f + ldr1w r1, r6, abort=20f + ldr1w r1, r7, abort=20f + ldr1w r1, r8, abort=20f + ldr1w r1, lr, abort=20f + +#if LDR1W_SHIFT < STR1W_SHIFT + lsl ip, ip, #STR1W_SHIFT - LDR1W_SHIFT +#elif LDR1W_SHIFT > STR1W_SHIFT + lsr ip, ip, #LDR1W_SHIFT - STR1W_SHIFT +#endif + add pc, pc, ip + nop + .rept (1 << STR1W_SHIFT) + W(nop) + .endr + str1w r0, r3, abort=20f + str1w r0, r4, abort=20f + str1w r0, r5, abort=20f + str1w r0, r6, abort=20f + str1w r0, r7, abort=20f + str1w r0, r8, abort=20f + str1w r0, lr, abort=20f + + CALGN( bcs 2b ) + +7: ldmfd sp!, {r5 - r8} + +8: movs r2, r2, lsl #31 + ldr1b r1, r3, ne, abort=21f + ldr1b r1, r4, cs, abort=21f + ldr1b r1, ip, cs, abort=21f + str1b r0, r3, ne, abort=21f + str1b r0, r4, cs, abort=21f + str1b r0, ip, cs, abort=21f + + exit r4, pc + +9: rsb ip, ip, #4 + cmp ip, #2 + ldr1b r1, r3, gt, abort=21f + ldr1b r1, r4, ge, abort=21f + ldr1b r1, lr, abort=21f + str1b r0, r3, gt, abort=21f + str1b r0, r4, ge, abort=21f + subs r2, r2, ip + str1b r0, lr, abort=21f + blt 8b + ands ip, r1, #3 + beq 1b + +10: bic r1, r1, #3 + cmp ip, #2 + ldr1w r1, lr, abort=21f + beq 17f + bgt 18f + + + .macro forward_copy_shift pull push + + subs r2, r2, #28 + blt 14f + + CALGN( ands ip, r0, #31 ) + CALGN( rsb ip, ip, #32 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( subcc r2, r2, ip ) + CALGN( bcc 15f ) + +11: stmfd sp!, {r5 - r9} + + PLD( pld [r1, #0] ) + PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 13f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +12: PLD( pld [r1, #124] ) +13: ldr4w r1, r4, r5, r6, r7, abort=19f + mov r3, lr, pull #\pull + subs r2, r2, #32 + ldr4w r1, r8, r9, ip, lr, abort=19f + orr r3, r3, r4, push #\push + mov r4, r4, pull #\pull + orr r4, r4, r5, push #\push + mov r5, r5, pull #\pull + orr r5, r5, r6, push #\push + mov r6, r6, pull #\pull + orr r6, r6, r7, push #\push + mov r7, r7, pull #\pull + orr r7, r7, r8, push #\push + mov r8, r8, pull #\pull + orr r8, r8, r9, push #\push + mov r9, r9, pull #\pull + orr r9, r9, ip, push #\push + mov ip, ip, pull #\pull + orr ip, ip, lr, push #\push + str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f + bge 12b + PLD( cmn r2, #96 ) + PLD( bge 13b ) + + ldmfd sp!, {r5 - r9} + +14: ands ip, r2, #28 + beq 16f + +15: mov r3, lr, pull #\pull + ldr1w r1, lr, abort=21f + 
subs ip, ip, #4 + orr r3, r3, lr, push #\push + str1w r0, r3, abort=21f + bgt 15b + CALGN( cmp r2, #0 ) + CALGN( bge 11b ) + +16: sub r1, r1, #(\push / 8) + b 8b + + .endm + + + forward_copy_shift pull=8 push=24 + +17: forward_copy_shift pull=16 push=16 + +18: forward_copy_shift pull=24 push=8 + + +/* + * Abort preamble and completion macros. + * If a fixup handler is required then those macros must surround it. + * It is assumed that the fixup code will handle the private part of + * the exit macro. + */ + + .macro copy_abort_preamble +19: ldmfd sp!, {r5 - r9} + b 21f +20: ldmfd sp!, {r5 - r8} +21: + .endm + + .macro copy_abort_end + ldmfd sp!, {r4, pc} + .endm + diff --git a/xen/arch/arm/lib/div64.S b/xen/arch/arm/lib/div64.S new file mode 100644 index 0000000000..2d364b354b --- /dev/null +++ b/xen/arch/arm/lib/div64.S @@ -0,0 +1,211 @@ +/* + * linux/arch/arm/lib/div64.S + * + * Optimized computation of 64-bit dividend / 32-bit divisor + * + * Author: Nicolas Pitre + * Created: Oct 5, 2003 + * Copyright: Monta Vista Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include "assembler.h" + +#ifdef __ARMEB__ +#define xh r0 +#define xl r1 +#define yh r2 +#define yl r3 +#else +#define xl r0 +#define xh r1 +#define yl r2 +#define yh r3 +#endif + +/* + * __do_div64: perform a division with 64-bit dividend and 32-bit divisor. + * + * Note: Calling convention is totally non standard for optimal code. + * This is meant to be used by do_div() from include/asm/div64.h only. + * + * Input parameters: + * xh-xl = dividend (clobbered) + * r4 = divisor (preserved) + * + * Output values: + * yh-yl = result + * xh = remainder + * + * Clobbered regs: xl, ip + */ + +ENTRY(__do_div64) +UNWIND(.fnstart) + + @ Test for easy paths first. + subs ip, r4, #1 + bls 9f @ divisor is 0 or 1 + tst ip, r4 + beq 8f @ divisor is power of 2 + + @ See if we need to handle upper 32-bit result. + cmp xh, r4 + mov yh, #0 + blo 3f + + @ Align divisor with upper part of dividend. + @ The aligned divisor is stored in yl preserving the original. + @ The bit position is stored in ip. + +#if __LINUX_ARM_ARCH__ >= 5 + + clz yl, r4 + clz ip, xh + sub yl, yl, ip + mov ip, #1 + mov ip, ip, lsl yl + mov yl, r4, lsl yl + +#else + + mov yl, r4 + mov ip, #1 +1: cmp yl, #0x80000000 + cmpcc yl, xh + movcc yl, yl, lsl #1 + movcc ip, ip, lsl #1 + bcc 1b + +#endif + + @ The division loop for needed upper bit positions. + @ Break out early if dividend reaches 0. +2: cmp xh, yl + orrcs yh, yh, ip + subcss xh, xh, yl + movnes ip, ip, lsr #1 + mov yl, yl, lsr #1 + bne 2b + + @ See if we need to handle lower 32-bit result. +3: cmp xh, #0 + mov yl, #0 + cmpeq xl, r4 + movlo xh, xl + movlo pc, lr + + @ The division loop for lower bit positions. + @ Here we shift remainer bits leftwards rather than moving the + @ divisor for comparisons, considering the carry-out bit as well. + mov ip, #0x80000000 +4: movs xl, xl, lsl #1 + adcs xh, xh, xh + beq 6f + cmpcc xh, r4 +5: orrcs yl, yl, ip + subcs xh, xh, r4 + movs ip, ip, lsr #1 + bne 4b + mov pc, lr + + @ The top part of remainder became zero. If carry is set + @ (the 33th bit) this is a false positive so resume the loop. + @ Otherwise, if lower part is also null then we are done. +6: bcs 5b + cmp xl, #0 + moveq pc, lr + + @ We still have remainer bits in the low part. Bring them up. 
+ +#if __LINUX_ARM_ARCH__ >= 5 + + clz xh, xl @ we know xh is zero here so... + add xh, xh, #1 + mov xl, xl, lsl xh + mov ip, ip, lsr xh + +#else + +7: movs xl, xl, lsl #1 + mov ip, ip, lsr #1 + bcc 7b + +#endif + + @ Current remainder is now 1. It is worthless to compare with + @ divisor at this point since divisor can not be smaller than 3 here. + @ If possible, branch for another shift in the division loop. + @ If no bit position left then we are done. + movs ip, ip, lsr #1 + mov xh, #1 + bne 4b + mov pc, lr + +8: @ Division by a power of 2: determine what that divisor order is + @ then simply shift values around + +#if __LINUX_ARM_ARCH__ >= 5 + + clz ip, r4 + rsb ip, ip, #31 + +#else + + mov yl, r4 + cmp r4, #(1 << 16) + mov ip, #0 + movhs yl, yl, lsr #16 + movhs ip, #16 + + cmp yl, #(1 << 8) + movhs yl, yl, lsr #8 + addhs ip, ip, #8 + + cmp yl, #(1 << 4) + movhs yl, yl, lsr #4 + addhs ip, ip, #4 + + cmp yl, #(1 << 2) + addhi ip, ip, #3 + addls ip, ip, yl, lsr #1 + +#endif + + mov yh, xh, lsr ip + mov yl, xl, lsr ip + rsb ip, ip, #32 + ARM( orr yl, yl, xh, lsl ip ) + THUMB( lsl xh, xh, ip ) + THUMB( orr yl, yl, xh ) + mov xh, xl, lsl ip + mov xh, xh, lsr ip + mov pc, lr + + @ eq -> division by 1: obvious enough... +9: moveq yl, xl + moveq yh, xh + moveq xh, #0 + moveq pc, lr +UNWIND(.fnend) + +UNWIND(.fnstart) +UNWIND(.pad #4) +UNWIND(.save {lr}) +Ldiv0_64: + @ Division by 0: + str lr, [sp, #-8]! + bl __div0 + + @ as wrong as it could be... + mov yl, #0 + mov yh, #0 + mov xh, #0 + ldr pc, [sp], #8 + +UNWIND(.fnend) +ENDPROC(__do_div64) diff --git a/xen/arch/arm/lib/findbit.S b/xen/arch/arm/lib/findbit.S new file mode 100644 index 0000000000..55ba552ba2 --- /dev/null +++ b/xen/arch/arm/lib/findbit.S @@ -0,0 +1,198 @@ +/* + * linux/arch/arm/lib/findbit.S + * + * Copyright (C) 1995-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16th March 2001 - John Ripley + * Fixed so that "size" is an exclusive not an inclusive quantity. + * All users of these functions expect exclusive sizes, and may + * also call with zero size. + * Reworked by rmk. + */ + +#include + +#include "assembler.h" + .text + +/* + * Purpose : Find a 'zero' bit + * Prototype: int find_first_zero_bit(void *addr, unsigned int maxbit); + */ +ENTRY(_find_first_zero_bit_le) + teq r1, #0 + beq 3f + mov r2, #0 +1: + ARM( ldrb r3, [r0, r2, lsr #3] ) + THUMB( lsr r3, r2, #3 ) + THUMB( ldrb r3, [r0, r3] ) + eors r3, r3, #0xff @ invert bits + bne .L_found @ any now set - found zero bit + add r2, r2, #8 @ next bit pointer +2: cmp r2, r1 @ any more? 
+ blo 1b +3: mov r0, r1 @ no free bits + mov pc, lr +ENDPROC(_find_first_zero_bit_le) + +/* + * Purpose : Find next 'zero' bit + * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset) + */ +ENTRY(_find_next_zero_bit_le) + teq r1, #0 + beq 3b + ands ip, r2, #7 + beq 1b @ If new byte, goto old routine + ARM( ldrb r3, [r0, r2, lsr #3] ) + THUMB( lsr r3, r2, #3 ) + THUMB( ldrb r3, [r0, r3] ) + eor r3, r3, #0xff @ now looking for a 1 bit + movs r3, r3, lsr ip @ shift off unused bits + bne .L_found + orr r2, r2, #7 @ if zero, then no bits here + add r2, r2, #1 @ align bit pointer + b 2b @ loop for next bit +ENDPROC(_find_next_zero_bit_le) + +/* + * Purpose : Find a 'one' bit + * Prototype: int find_first_bit(const unsigned long *addr, unsigned int maxbit); + */ +ENTRY(_find_first_bit_le) + teq r1, #0 + beq 3f + mov r2, #0 +1: + ARM( ldrb r3, [r0, r2, lsr #3] ) + THUMB( lsr r3, r2, #3 ) + THUMB( ldrb r3, [r0, r3] ) + movs r3, r3 + bne .L_found @ any now set - found zero bit + add r2, r2, #8 @ next bit pointer +2: cmp r2, r1 @ any more? + blo 1b +3: mov r0, r1 @ no free bits + mov pc, lr +ENDPROC(_find_first_bit_le) + +/* + * Purpose : Find next 'one' bit + * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset) + */ +ENTRY(_find_next_bit_le) + teq r1, #0 + beq 3b + ands ip, r2, #7 + beq 1b @ If new byte, goto old routine + ARM( ldrb r3, [r0, r2, lsr #3] ) + THUMB( lsr r3, r2, #3 ) + THUMB( ldrb r3, [r0, r3] ) + movs r3, r3, lsr ip @ shift off unused bits + bne .L_found + orr r2, r2, #7 @ if zero, then no bits here + add r2, r2, #1 @ align bit pointer + b 2b @ loop for next bit +ENDPROC(_find_next_bit_le) + +#ifdef __ARMEB__ + +ENTRY(_find_first_zero_bit_be) + teq r1, #0 + beq 3f + mov r2, #0 +1: eor r3, r2, #0x18 @ big endian byte ordering + ARM( ldrb r3, [r0, r3, lsr #3] ) + THUMB( lsr r3, #3 ) + THUMB( ldrb r3, [r0, r3] ) + eors r3, r3, #0xff @ invert bits + bne .L_found @ any now set - found zero bit + add r2, r2, #8 @ next bit pointer +2: cmp r2, r1 @ any more? + blo 1b +3: mov r0, r1 @ no free bits + mov pc, lr +ENDPROC(_find_first_zero_bit_be) + +ENTRY(_find_next_zero_bit_be) + teq r1, #0 + beq 3b + ands ip, r2, #7 + beq 1b @ If new byte, goto old routine + eor r3, r2, #0x18 @ big endian byte ordering + ARM( ldrb r3, [r0, r3, lsr #3] ) + THUMB( lsr r3, #3 ) + THUMB( ldrb r3, [r0, r3] ) + eor r3, r3, #0xff @ now looking for a 1 bit + movs r3, r3, lsr ip @ shift off unused bits + bne .L_found + orr r2, r2, #7 @ if zero, then no bits here + add r2, r2, #1 @ align bit pointer + b 2b @ loop for next bit +ENDPROC(_find_next_zero_bit_be) + +ENTRY(_find_first_bit_be) + teq r1, #0 + beq 3f + mov r2, #0 +1: eor r3, r2, #0x18 @ big endian byte ordering + ARM( ldrb r3, [r0, r3, lsr #3] ) + THUMB( lsr r3, #3 ) + THUMB( ldrb r3, [r0, r3] ) + movs r3, r3 + bne .L_found @ any now set - found zero bit + add r2, r2, #8 @ next bit pointer +2: cmp r2, r1 @ any more? + blo 1b +3: mov r0, r1 @ no free bits + mov pc, lr +ENDPROC(_find_first_bit_be) + +ENTRY(_find_next_bit_be) + teq r1, #0 + beq 3b + ands ip, r2, #7 + beq 1b @ If new byte, goto old routine + eor r3, r2, #0x18 @ big endian byte ordering + ARM( ldrb r3, [r0, r3, lsr #3] ) + THUMB( lsr r3, #3 ) + THUMB( ldrb r3, [r0, r3] ) + movs r3, r3, lsr ip @ shift off unused bits + bne .L_found + orr r2, r2, #7 @ if zero, then no bits here + add r2, r2, #1 @ align bit pointer + b 2b @ loop for next bit +ENDPROC(_find_next_bit_be) + +#endif + +/* + * One or more bits in the LSB of r3 are assumed to be set. 
+ */ +.L_found: +#if __LINUX_ARM_ARCH__ >= 5 + rsb r0, r3, #0 + and r3, r3, r0 + clz r3, r3 + rsb r3, r3, #31 + add r0, r2, r3 +#else + tst r3, #0x0f + addeq r2, r2, #4 + movne r3, r3, lsl #4 + tst r3, #0x30 + addeq r2, r2, #2 + movne r3, r3, lsl #2 + tst r3, #0x40 + addeq r2, r2, #1 + mov r0, r2 +#endif + cmp r1, r0 @ Clamp to maxbit + movlo r0, r1 + mov pc, lr + diff --git a/xen/arch/arm/lib/lib1funcs.S b/xen/arch/arm/lib/lib1funcs.S new file mode 100644 index 0000000000..343c2c8576 --- /dev/null +++ b/xen/arch/arm/lib/lib1funcs.S @@ -0,0 +1,389 @@ +/* + * linux/arch/arm/lib/lib1funcs.S: Optimized ARM division routines + * + * Author: Nicolas Pitre + * - contributed to gcc-3.4 on Sep 30, 2003 + * - adapted for the Linux kernel on Oct 2, 2003 + */ + +/* Copyright 1995, 1996, 1998, 1999, 2000, 2003 Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2, or (at your option) any +later version. + +In addition to the permissions in the GNU General Public License, the +Free Software Foundation gives you unlimited permission to link the +compiled version of this file into combinations with other programs, +and to distribute those combinations without any restriction coming +from the use of this file. (The General Public License restrictions +do apply in other respects; for example, they cover modification of +the file, and distribution when not linked into a combine +executable.) + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; see the file COPYING. If not, write to +the Free Software Foundation, 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + + +#include +#include "assembler.h" + +.macro ARM_DIV_BODY dividend, divisor, result, curbit + +#if __LINUX_ARM_ARCH__ >= 5 + + clz \curbit, \divisor + clz \result, \dividend + sub \result, \curbit, \result + mov \curbit, #1 + mov \divisor, \divisor, lsl \result + mov \curbit, \curbit, lsl \result + mov \result, #0 + +#else + + @ Initially shift the divisor left 3 bits if possible, + @ set curbit accordingly. This allows for curbit to be located + @ at the left end of each 4 bit nibbles in the division loop + @ to save one loop in most cases. + tst \divisor, #0xe0000000 + moveq \divisor, \divisor, lsl #3 + moveq \curbit, #8 + movne \curbit, #1 + + @ Unless the divisor is very big, shift it up in multiples of + @ four bits, since this is the amount of unwinding in the main + @ division loop. Continue shifting until the divisor is + @ larger than the dividend. +1: cmp \divisor, #0x10000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #4 + movlo \curbit, \curbit, lsl #4 + blo 1b + + @ For very big divisors, we must shift it a bit at a time, or + @ we will be in danger of overflowing. 
+1: cmp \divisor, #0x80000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #1 + movlo \curbit, \curbit, lsl #1 + blo 1b + + mov \result, #0 + +#endif + + @ Division loop +1: cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor + orrhs \result, \result, \curbit + cmp \dividend, \divisor, lsr #1 + subhs \dividend, \dividend, \divisor, lsr #1 + orrhs \result, \result, \curbit, lsr #1 + cmp \dividend, \divisor, lsr #2 + subhs \dividend, \dividend, \divisor, lsr #2 + orrhs \result, \result, \curbit, lsr #2 + cmp \dividend, \divisor, lsr #3 + subhs \dividend, \dividend, \divisor, lsr #3 + orrhs \result, \result, \curbit, lsr #3 + cmp \dividend, #0 @ Early termination? + movnes \curbit, \curbit, lsr #4 @ No, any more bits to do? + movne \divisor, \divisor, lsr #4 + bne 1b + +.endm + + +.macro ARM_DIV2_ORDER divisor, order + +#if __LINUX_ARM_ARCH__ >= 5 + + clz \order, \divisor + rsb \order, \order, #31 + +#else + + cmp \divisor, #(1 << 16) + movhs \divisor, \divisor, lsr #16 + movhs \order, #16 + movlo \order, #0 + + cmp \divisor, #(1 << 8) + movhs \divisor, \divisor, lsr #8 + addhs \order, \order, #8 + + cmp \divisor, #(1 << 4) + movhs \divisor, \divisor, lsr #4 + addhs \order, \order, #4 + + cmp \divisor, #(1 << 2) + addhi \order, \order, #3 + addls \order, \order, \divisor, lsr #1 + +#endif + +.endm + + +.macro ARM_MOD_BODY dividend, divisor, order, spare + +#if __LINUX_ARM_ARCH__ >= 5 + + clz \order, \divisor + clz \spare, \dividend + sub \order, \order, \spare + mov \divisor, \divisor, lsl \order + +#else + + mov \order, #0 + + @ Unless the divisor is very big, shift it up in multiples of + @ four bits, since this is the amount of unwinding in the main + @ division loop. Continue shifting until the divisor is + @ larger than the dividend. +1: cmp \divisor, #0x10000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #4 + addlo \order, \order, #4 + blo 1b + + @ For very big divisors, we must shift it a bit at a time, or + @ we will be in danger of overflowing. +1: cmp \divisor, #0x80000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #1 + addlo \order, \order, #1 + blo 1b + +#endif + + @ Perform all needed substractions to keep only the reminder. + @ Do comparisons in batch of 4 first. + subs \order, \order, #3 @ yes, 3 is intended here + blt 2f + +1: cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor + cmp \dividend, \divisor, lsr #1 + subhs \dividend, \dividend, \divisor, lsr #1 + cmp \dividend, \divisor, lsr #2 + subhs \dividend, \dividend, \divisor, lsr #2 + cmp \dividend, \divisor, lsr #3 + subhs \dividend, \dividend, \divisor, lsr #3 + cmp \dividend, #1 + mov \divisor, \divisor, lsr #4 + subges \order, \order, #4 + bge 1b + + tst \order, #3 + teqne \dividend, #0 + beq 5f + + @ Either 1, 2 or 3 comparison/substractions are left. 
+2: cmn \order, #2 + blt 4f + beq 3f + cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor + mov \divisor, \divisor, lsr #1 +3: cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor + mov \divisor, \divisor, lsr #1 +4: cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor +5: +.endm + + +ENTRY(__udivsi3) +ENTRY(__aeabi_uidiv) +UNWIND(.fnstart) + + subs r2, r1, #1 + moveq pc, lr + bcc Ldiv0 + cmp r0, r1 + bls 11f + tst r1, r2 + beq 12f + + ARM_DIV_BODY r0, r1, r2, r3 + + mov r0, r2 + mov pc, lr + +11: moveq r0, #1 + movne r0, #0 + mov pc, lr + +12: ARM_DIV2_ORDER r1, r2 + + mov r0, r0, lsr r2 + mov pc, lr + +UNWIND(.fnend) +ENDPROC(__udivsi3) +ENDPROC(__aeabi_uidiv) + +ENTRY(__umodsi3) +UNWIND(.fnstart) + + subs r2, r1, #1 @ compare divisor with 1 + bcc Ldiv0 + cmpne r0, r1 @ compare dividend with divisor + moveq r0, #0 + tsthi r1, r2 @ see if divisor is power of 2 + andeq r0, r0, r2 + movls pc, lr + + ARM_MOD_BODY r0, r1, r2, r3 + + mov pc, lr + +UNWIND(.fnend) +ENDPROC(__umodsi3) + +ENTRY(__divsi3) +ENTRY(__aeabi_idiv) +UNWIND(.fnstart) + + cmp r1, #0 + eor ip, r0, r1 @ save the sign of the result. + beq Ldiv0 + rsbmi r1, r1, #0 @ loops below use unsigned. + subs r2, r1, #1 @ division by 1 or -1 ? + beq 10f + movs r3, r0 + rsbmi r3, r0, #0 @ positive dividend value + cmp r3, r1 + bls 11f + tst r1, r2 @ divisor is power of 2 ? + beq 12f + + ARM_DIV_BODY r3, r1, r0, r2 + + cmp ip, #0 + rsbmi r0, r0, #0 + mov pc, lr + +10: teq ip, r0 @ same sign ? + rsbmi r0, r0, #0 + mov pc, lr + +11: movlo r0, #0 + moveq r0, ip, asr #31 + orreq r0, r0, #1 + mov pc, lr + +12: ARM_DIV2_ORDER r1, r2 + + cmp ip, #0 + mov r0, r3, lsr r2 + rsbmi r0, r0, #0 + mov pc, lr + +UNWIND(.fnend) +ENDPROC(__divsi3) +ENDPROC(__aeabi_idiv) + +ENTRY(__modsi3) +UNWIND(.fnstart) + + cmp r1, #0 + beq Ldiv0 + rsbmi r1, r1, #0 @ loops below use unsigned. + movs ip, r0 @ preserve sign of dividend + rsbmi r0, r0, #0 @ if negative make positive + subs r2, r1, #1 @ compare divisor with 1 + cmpne r0, r1 @ compare dividend with divisor + moveq r0, #0 + tsthi r1, r2 @ see if divisor is power of 2 + andeq r0, r0, r2 + bls 10f + + ARM_MOD_BODY r0, r1, r2, r3 + +10: cmp ip, #0 + rsbmi r0, r0, #0 + mov pc, lr + +UNWIND(.fnend) +ENDPROC(__modsi3) + +#ifdef CONFIG_AEABI + +ENTRY(__aeabi_uidivmod) +UNWIND(.fnstart) +UNWIND(.save {r0, r1, ip, lr} ) + + stmfd sp!, {r0, r1, ip, lr} + bl __aeabi_uidiv + ldmfd sp!, {r1, r2, ip, lr} + mul r3, r0, r2 + sub r1, r1, r3 + mov pc, lr + +UNWIND(.fnend) +ENDPROC(__aeabi_uidivmod) + +ENTRY(__aeabi_idivmod) +UNWIND(.fnstart) +UNWIND(.save {r0, r1, ip, lr} ) + stmfd sp!, {r0, r1, ip, lr} + bl __aeabi_idiv + ldmfd sp!, {r1, r2, ip, lr} + mul r3, r0, r2 + sub r1, r1, r3 + mov pc, lr + +UNWIND(.fnend) +ENDPROC(__aeabi_idivmod) + +ENTRY(__aeabi_uldivmod) +UNWIND(.fnstart) +UNWIND(.save {lr} ) + sub sp, sp, #8 + stmfd sp!, {sp, lr} + bl __qdivrem + ldr lr, [sp, #4] + add sp, sp, #8 + ldmfd sp!, {r2, r3} + mov pc, lr + +UNWIND(.fnend) +ENDPROC(__aeabi_uldivmod) + +ENTRY(__aeabi_ldivmod) +UNWIND(.fnstart) +UNWIND(.save {lr} ) + sub sp, sp, #16 + stmfd sp!, {sp, lr} + bl __ldivmod_helper + ldr lr, [sp, #4] + add sp, sp, #16 + ldmfd sp!, {r2, r3} + mov pc, lr + +UNWIND(.fnend) +ENDPROC(__aeabi_ldivmod) +#endif + +Ldiv0: +UNWIND(.fnstart) +UNWIND(.pad #4) +UNWIND(.save {lr}) + str lr, [sp, #-8]! + bl __div0 + mov r0, #0 @ About as wrong as it could be. 
+ ldr pc, [sp], #8 +UNWIND(.fnend) +ENDPROC(Ldiv0) diff --git a/xen/arch/arm/lib/memcpy.S b/xen/arch/arm/lib/memcpy.S new file mode 100644 index 0000000000..538d34f536 --- /dev/null +++ b/xen/arch/arm/lib/memcpy.S @@ -0,0 +1,63 @@ +/* + * linux/arch/arm/lib/memcpy.S + * + * Author: Nicolas Pitre + * Created: Sep 28, 2005 + * Copyright: MontaVista Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include "assembler.h" + +#define LDR1W_SHIFT 0 +#define STR1W_SHIFT 0 + + .macro ldr1w ptr reg abort + W(ldr) \reg, [\ptr], #4 + .endm + + .macro ldr4w ptr reg1 reg2 reg3 reg4 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} + .endm + + .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm + + .macro ldr1b ptr reg cond=al abort + ldr\cond\()b \reg, [\ptr], #1 + .endm + + .macro str1w ptr reg abort + W(str) \reg, [\ptr], #4 + .endm + + .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm + + .macro str1b ptr reg cond=al abort + str\cond\()b \reg, [\ptr], #1 + .endm + + .macro enter reg1 reg2 + stmdb sp!, {r0, \reg1, \reg2} + .endm + + .macro exit reg1 reg2 + ldmfd sp!, {r0, \reg1, \reg2} + .endm + + .text + +/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ + +ENTRY(memcpy) + +#include "copy_template.S" + +ENDPROC(memcpy) diff --git a/xen/arch/arm/lib/memmove.S b/xen/arch/arm/lib/memmove.S new file mode 100644 index 0000000000..5970f5d0b9 --- /dev/null +++ b/xen/arch/arm/lib/memmove.S @@ -0,0 +1,200 @@ +/* + * linux/arch/arm/lib/memmove.S + * + * Author: Nicolas Pitre + * Created: Sep 28, 2005 + * Copyright: (C) MontaVista Software Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +#include "assembler.h" + + .text + +/* + * Prototype: void *memmove(void *dest, const void *src, size_t n); + * + * Note: + * + * If the memory regions don't overlap, we simply branch to memcpy which is + * normally a bit faster. Otherwise the copy is done going downwards. This + * is a transposition of the code from copy_template.S but with the copy + * occurring in the opposite direction. + */ + +ENTRY(memmove) + + subs ip, r0, r1 + cmphi r2, ip + bls memcpy + + stmfd sp!, {r0, r4, lr} + add r1, r1, r2 + add r0, r0, r2 + subs r2, r2, #4 + blt 8f + ands ip, r0, #3 + PLD( pld [r1, #-4] ) + bne 9f + ands ip, r1, #3 + bne 10f + +1: subs r2, r2, #(28) + stmfd sp!, {r5 - r8} + blt 5f + + CALGN( ands ip, r0, #31 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( bcs 2f ) + CALGN( adr r4, 6f ) + CALGN( subs r2, r2, ip ) @ C is set here + CALGN( rsb ip, ip, #32 ) + CALGN( add pc, r4, ip ) + + PLD( pld [r1, #-4] ) +2: PLD( subs r2, r2, #96 ) + PLD( pld [r1, #-32] ) + PLD( blt 4f ) + PLD( pld [r1, #-64] ) + PLD( pld [r1, #-96] ) + +3: PLD( pld [r1, #-128] ) +4: ldmdb r1!, {r3, r4, r5, r6, r7, r8, ip, lr} + subs r2, r2, #32 + stmdb r0!, {r3, r4, r5, r6, r7, r8, ip, lr} + bge 3b + PLD( cmn r2, #96 ) + PLD( bge 4b ) + +5: ands ip, r2, #28 + rsb ip, ip, #32 + addne pc, pc, ip @ C is always clear here + b 7f +6: W(nop) + W(ldr) r3, [r1, #-4]! + W(ldr) r4, [r1, #-4]! 
+ W(ldr) r5, [r1, #-4]! + W(ldr) r6, [r1, #-4]! + W(ldr) r7, [r1, #-4]! + W(ldr) r8, [r1, #-4]! + W(ldr) lr, [r1, #-4]! + + add pc, pc, ip + nop + W(nop) + W(str) r3, [r0, #-4]! + W(str) r4, [r0, #-4]! + W(str) r5, [r0, #-4]! + W(str) r6, [r0, #-4]! + W(str) r7, [r0, #-4]! + W(str) r8, [r0, #-4]! + W(str) lr, [r0, #-4]! + + CALGN( bcs 2b ) + +7: ldmfd sp!, {r5 - r8} + +8: movs r2, r2, lsl #31 + ldrneb r3, [r1, #-1]! + ldrcsb r4, [r1, #-1]! + ldrcsb ip, [r1, #-1] + strneb r3, [r0, #-1]! + strcsb r4, [r0, #-1]! + strcsb ip, [r0, #-1] + ldmfd sp!, {r0, r4, pc} + +9: cmp ip, #2 + ldrgtb r3, [r1, #-1]! + ldrgeb r4, [r1, #-1]! + ldrb lr, [r1, #-1]! + strgtb r3, [r0, #-1]! + strgeb r4, [r0, #-1]! + subs r2, r2, ip + strb lr, [r0, #-1]! + blt 8b + ands ip, r1, #3 + beq 1b + +10: bic r1, r1, #3 + cmp ip, #2 + ldr r3, [r1, #0] + beq 17f + blt 18f + + + .macro backward_copy_shift push pull + + subs r2, r2, #28 + blt 14f + + CALGN( ands ip, r0, #31 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( subcc r2, r2, ip ) + CALGN( bcc 15f ) + +11: stmfd sp!, {r5 - r9} + + PLD( pld [r1, #-4] ) + PLD( subs r2, r2, #96 ) + PLD( pld [r1, #-32] ) + PLD( blt 13f ) + PLD( pld [r1, #-64] ) + PLD( pld [r1, #-96] ) + +12: PLD( pld [r1, #-128] ) +13: ldmdb r1!, {r7, r8, r9, ip} + mov lr, r3, push #\push + subs r2, r2, #32 + ldmdb r1!, {r3, r4, r5, r6} + orr lr, lr, ip, pull #\pull + mov ip, ip, push #\push + orr ip, ip, r9, pull #\pull + mov r9, r9, push #\push + orr r9, r9, r8, pull #\pull + mov r8, r8, push #\push + orr r8, r8, r7, pull #\pull + mov r7, r7, push #\push + orr r7, r7, r6, pull #\pull + mov r6, r6, push #\push + orr r6, r6, r5, pull #\pull + mov r5, r5, push #\push + orr r5, r5, r4, pull #\pull + mov r4, r4, push #\push + orr r4, r4, r3, pull #\pull + stmdb r0!, {r4 - r9, ip, lr} + bge 12b + PLD( cmn r2, #96 ) + PLD( bge 13b ) + + ldmfd sp!, {r5 - r9} + +14: ands ip, r2, #28 + beq 16f + +15: mov lr, r3, push #\push + ldr r3, [r1, #-4]! + subs ip, ip, #4 + orr lr, lr, r3, pull #\pull + str lr, [r0, #-4]! + bgt 15b + CALGN( cmp r2, #0 ) + CALGN( bge 11b ) + +16: add r1, r1, #(\pull / 8) + b 8b + + .endm + + + backward_copy_shift push=8 pull=24 + +17: backward_copy_shift push=16 pull=16 + +18: backward_copy_shift push=24 pull=8 + +ENDPROC(memmove) diff --git a/xen/arch/arm/lib/memset.S b/xen/arch/arm/lib/memset.S new file mode 100644 index 0000000000..7433789ce4 --- /dev/null +++ b/xen/arch/arm/lib/memset.S @@ -0,0 +1,129 @@ +/* + * linux/arch/arm/lib/memset.S + * + * Copyright (C) 1995-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * ASM optimised string functions + */ + +#include + +#include "assembler.h" + + .text + .align 5 + .word 0 + +1: subs r2, r2, #4 @ 1 do we have enough + blt 5f @ 1 bytes to align with? + cmp r3, #2 @ 1 + strltb r1, [r0], #1 @ 1 + strleb r1, [r0], #1 @ 1 + strb r1, [r0], #1 @ 1 + add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) +/* + * The pointer is now aligned and the length is adjusted. Try doing the + * memset again. + */ + +ENTRY(memset) + ands r3, r0, #3 @ 1 unaligned? + bne 1b @ 1 +/* + * we know that the pointer in r0 is aligned to a word boundary. + */ + orr r1, r1, r1, lsl #8 + orr r1, r1, r1, lsl #16 + mov r3, r1 + cmp r2, #16 + blt 4f + +#if ! CALGN(1)+0 + +/* + * We need an extra register for this loop - save the return address and + * use the LR + */ + str lr, [sp, #-4]! 
+ mov ip, r1 + mov lr, r1 + +2: subs r2, r2, #64 + stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. + stmgeia r0!, {r1, r3, ip, lr} + stmgeia r0!, {r1, r3, ip, lr} + stmgeia r0!, {r1, r3, ip, lr} + bgt 2b + ldmeqfd sp!, {pc} @ Now <64 bytes to go. +/* + * No need to correct the count; we're only testing bits from now on + */ + tst r2, #32 + stmneia r0!, {r1, r3, ip, lr} + stmneia r0!, {r1, r3, ip, lr} + tst r2, #16 + stmneia r0!, {r1, r3, ip, lr} + ldr lr, [sp], #4 + +#else + +/* + * This version aligns the destination pointer in order to write + * whole cache lines at once. + */ + + stmfd sp!, {r4-r7, lr} + mov r4, r1 + mov r5, r1 + mov r6, r1 + mov r7, r1 + mov ip, r1 + mov lr, r1 + + cmp r2, #96 + tstgt r0, #31 + ble 3f + + and ip, r0, #31 + rsb ip, ip, #32 + sub r2, r2, ip + movs ip, ip, lsl #(32 - 4) + stmcsia r0!, {r4, r5, r6, r7} + stmmiia r0!, {r4, r5} + tst ip, #(1 << 30) + mov ip, r1 + strne r1, [r0], #4 + +3: subs r2, r2, #64 + stmgeia r0!, {r1, r3-r7, ip, lr} + stmgeia r0!, {r1, r3-r7, ip, lr} + bgt 3b + ldmeqfd sp!, {r4-r7, pc} + + tst r2, #32 + stmneia r0!, {r1, r3-r7, ip, lr} + tst r2, #16 + stmneia r0!, {r4-r7} + ldmfd sp!, {r4-r7, lr} + +#endif + +4: tst r2, #8 + stmneia r0!, {r1, r3} + tst r2, #4 + strne r1, [r0], #4 +/* + * When we get here, we've got less than 4 bytes to zero. We + * may have an unaligned pointer as well. + */ +5: tst r2, #2 + strneb r1, [r0], #1 + strneb r1, [r0], #1 + tst r2, #1 + strneb r1, [r0], #1 + mov pc, lr +ENDPROC(memset) diff --git a/xen/arch/arm/lib/memzero.S b/xen/arch/arm/lib/memzero.S new file mode 100644 index 0000000000..ce6cdda584 --- /dev/null +++ b/xen/arch/arm/lib/memzero.S @@ -0,0 +1,127 @@ +/* + * linux/arch/arm/lib/memzero.S + * + * Copyright (C) 1995-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +#include "assembler.h" + + .text + .align 5 + .word 0 +/* + * Align the pointer in r0. r3 contains the number of bytes that we are + * mis-aligned by, and r1 is the number of bytes. If r1 < 4, then we + * don't bother; we use byte stores instead. + */ +1: subs r1, r1, #4 @ 1 do we have enough + blt 5f @ 1 bytes to align with? + cmp r3, #2 @ 1 + strltb r2, [r0], #1 @ 1 + strleb r2, [r0], #1 @ 1 + strb r2, [r0], #1 @ 1 + add r1, r1, r3 @ 1 (r1 = r1 - (4 - r3)) +/* + * The pointer is now aligned and the length is adjusted. Try doing the + * memzero again. + */ + +ENTRY(__memzero) + mov r2, #0 @ 1 + ands r3, r0, #3 @ 1 unaligned? + bne 1b @ 1 +/* + * r3 = 0, and we know that the pointer in r0 is aligned to a word boundary. + */ + cmp r1, #16 @ 1 we can skip this chunk if we + blt 4f @ 1 have < 16 bytes + +#if ! CALGN(1)+0 + +/* + * We need an extra register for this loop - save the return address and + * use the LR + */ + str lr, [sp, #-4]! @ 1 + mov ip, r2 @ 1 + mov lr, r2 @ 1 + +3: subs r1, r1, #64 @ 1 write 32 bytes out per loop + stmgeia r0!, {r2, r3, ip, lr} @ 4 + stmgeia r0!, {r2, r3, ip, lr} @ 4 + stmgeia r0!, {r2, r3, ip, lr} @ 4 + stmgeia r0!, {r2, r3, ip, lr} @ 4 + bgt 3b @ 1 + ldmeqfd sp!, {pc} @ 1/2 quick exit +/* + * No need to correct the count; we're only testing bits from now on + */ + tst r1, #32 @ 1 + stmneia r0!, {r2, r3, ip, lr} @ 4 + stmneia r0!, {r2, r3, ip, lr} @ 4 + tst r1, #16 @ 1 16 bytes or more? 
+ stmneia r0!, {r2, r3, ip, lr} @ 4 + ldr lr, [sp], #4 @ 1 + +#else + +/* + * This version aligns the destination pointer in order to write + * whole cache lines at once. + */ + + stmfd sp!, {r4-r7, lr} + mov r4, r2 + mov r5, r2 + mov r6, r2 + mov r7, r2 + mov ip, r2 + mov lr, r2 + + cmp r1, #96 + andgts ip, r0, #31 + ble 3f + + rsb ip, ip, #32 + sub r1, r1, ip + movs ip, ip, lsl #(32 - 4) + stmcsia r0!, {r4, r5, r6, r7} + stmmiia r0!, {r4, r5} + movs ip, ip, lsl #2 + strcs r2, [r0], #4 + +3: subs r1, r1, #64 + stmgeia r0!, {r2-r7, ip, lr} + stmgeia r0!, {r2-r7, ip, lr} + bgt 3b + ldmeqfd sp!, {r4-r7, pc} + + tst r1, #32 + stmneia r0!, {r2-r7, ip, lr} + tst r1, #16 + stmneia r0!, {r4-r7} + ldmfd sp!, {r4-r7, lr} + +#endif + +4: tst r1, #8 @ 1 8 bytes or more? + stmneia r0!, {r2, r3} @ 2 + tst r1, #4 @ 1 4 bytes or more? + strne r2, [r0], #4 @ 1 +/* + * When we get here, we've got less than 4 bytes to zero. We + * may have an unaligned pointer as well. + */ +5: tst r1, #2 @ 1 2 bytes or more? + strneb r2, [r0], #1 @ 1 + strneb r2, [r0], #1 @ 1 + tst r1, #1 @ 1 a byte left over + strneb r2, [r0], #1 @ 1 + mov pc, lr @ 1 +ENDPROC(__memzero) diff --git a/xen/arch/arm/lib/setbit.S b/xen/arch/arm/lib/setbit.S new file mode 100644 index 0000000000..ab00cf9287 --- /dev/null +++ b/xen/arch/arm/lib/setbit.S @@ -0,0 +1,18 @@ +/* + * linux/arch/arm/lib/setbit.S + * + * Copyright (C) 1995-1996 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include + +#include "assembler.h" +#include "bitops.h" + .text + +ENTRY(_set_bit) + bitop orr +ENDPROC(_set_bit) diff --git a/xen/arch/arm/lib/testchangebit.S b/xen/arch/arm/lib/testchangebit.S new file mode 100644 index 0000000000..53975dff35 --- /dev/null +++ b/xen/arch/arm/lib/testchangebit.S @@ -0,0 +1,18 @@ +/* + * linux/arch/arm/lib/testchangebit.S + * + * Copyright (C) 1995-1996 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include + +#include "assembler.h" +#include "bitops.h" + .text + +ENTRY(_test_and_change_bit) + testop eor, str +ENDPROC(_test_and_change_bit) diff --git a/xen/arch/arm/lib/testclearbit.S b/xen/arch/arm/lib/testclearbit.S new file mode 100644 index 0000000000..a0dd4fb4d4 --- /dev/null +++ b/xen/arch/arm/lib/testclearbit.S @@ -0,0 +1,18 @@ +/* + * linux/arch/arm/lib/testclearbit.S + * + * Copyright (C) 1995-1996 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include + +#include "assembler.h" +#include "bitops.h" + .text + +ENTRY(_test_and_clear_bit) + testop bicne, strne +ENDPROC(_test_and_clear_bit) diff --git a/xen/arch/arm/lib/testsetbit.S b/xen/arch/arm/lib/testsetbit.S new file mode 100644 index 0000000000..94e99ddc34 --- /dev/null +++ b/xen/arch/arm/lib/testsetbit.S @@ -0,0 +1,18 @@ +/* + * linux/arch/arm/lib/testsetbit.S + * + * Copyright (C) 1995-1996 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */
+#include
+
+#include "assembler.h"
+#include "bitops.h"
+        .text
+
+ENTRY(_test_and_set_bit)
+        testop  orreq, streq
+ENDPROC(_test_and_set_bit)
diff --git a/xen/include/asm-arm/config.h b/xen/include/asm-arm/config.h
index 12285dd19c..9294f8f71f 100644
--- a/xen/include/asm-arm/config.h
+++ b/xen/include/asm-arm/config.h
@@ -30,6 +30,9 @@
 
 #define asmlinkage /* Nothing needed */
 
+#define __LINUX_ARM_ARCH__ 7
+#define CONFIG_AEABI
+
 /* Linkage for ARM */
 #define __ALIGN .align 2
 #define __ALIGN_STR ".align 2"
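
A note on the bit operations above, since only the assembly is in the patch. The bitop/testop macros take a bit number in r0 and a word-aligned pointer in r1 and act on bit (nr % 32) of the 32-bit word at index (nr / 32); the deliberate strneb store at the top faults if the pointer is not word-aligned, and with __LINUX_ARM_ARCH__ set to 7 the read-modify-write runs inside an ldrex/strex loop, the testop variants additionally returning the old bit value (0 or 1) in r0 between dmb barriers. A minimal, non-atomic C sketch of the addressing only (sketch_set_bit is an illustrative name, not a symbol from this patch):

    #include <stdint.h>

    /* Non-atomic illustration of what "bitop orr" (_set_bit) does to memory;
     * the real routine performs the same update inside an ldrex/strex loop. */
    static inline void sketch_set_bit(int nr, volatile uint32_t *words)
    {
        volatile uint32_t *w = words + (nr >> 5);   /* word index: nr / 32 */

        *w |= (uint32_t)1 << (nr & 31);             /* bit in word: nr % 32 */
    }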
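
The find_*_bit entry points keep the convention described in the findbit.S header: maxbit is exclusive, the return value is the index of the first matching bit, and maxbit itself comes back (via the clamp at .L_found) when no bit in range qualifies. A plain C reference for the little-endian first-zero case, mirroring the byte-wise scan; the function name is made up for illustration:

    #include <stdint.h>

    /* Reference semantics of _find_first_zero_bit_le: scan maxbit bits
     * (exclusive) and return the index of the first clear bit, or maxbit
     * if every bit in range is set. */
    static unsigned int find_first_zero_bit_sketch(const uint8_t *addr,
                                                   unsigned int maxbit)
    {
        unsigned int bit;

        for ( bit = 0; bit < maxbit; bit++ )
            if ( !(addr[bit >> 3] & (1u << (bit & 7))) )
                return bit;

        return maxbit;    /* no zero bit below maxbit */
    }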
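
__do_div64 intentionally uses a non-standard convention (dividend in xh-xl, divisor preserved in r4, quotient in yh-yl, remainder left in xh) and, as its comment says, is only meant to be reached through a do_div()-style macro rather than called directly. That macro is not part of this patch, so the following is only a C stand-in for the caller-visible contract it implements:

    #include <stdint.h>

    /* do_div(n, base) contract that __do_div64 implements underneath:
     * divide a 64-bit value by a 32-bit divisor in place and return the
     * 32-bit remainder. */
    static inline uint32_t do_div_sketch(uint64_t *n, uint32_t base)
    {
        uint32_t remainder = (uint32_t)(*n % base);

        *n /= base;
        return remainder;
    }

    /* e.g. splitting nanoseconds into whole seconds plus the leftover:
     *   uint64_t ns = 2500000000ULL;
     *   uint32_t frac = do_div_sketch(&ns, 1000000000);
     *   // ns == 2, frac == 500000000
     */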
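
The lib1funcs.S entry points are not normally called by name: __aeabi_uidiv/__aeabi_idiv and __aeabi_uidivmod/__aeabi_idivmod are the run-time helpers the compiler emits for 32-bit / and % when no usable hardware divide instruction is available, returning the quotient in r0 and, for the *divmod variants, the remainder in r1 (which is what the mul/sub sequence after bl __aeabi_uidiv computes). Ordinary C like the sketch below is enough to pull them in; the function is illustrative, assuming a build without UDIV/SDIV:

    #include <stdint.h>

    /* With no hardware divider, the compiler lowers these operators to calls
     * into lib1funcs.S: '/' becomes __aeabi_uidiv and the quotient+remainder
     * pair becomes __aeabi_uidivmod (r0 = quotient, r1 = remainder). */
    uint32_t ticks_to_seconds(uint32_t ticks, uint32_t hz, uint32_t *leftover)
    {
        *leftover = ticks % hz;   /* remainder */
        return ticks / hz;        /* quotient  */
    }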