From 716ca530e1c4515d8683c9d5be3d56b301758b66 Mon Sep 17 00:00:00 2001 From: James <> Date: Wed, 4 Nov 2015 11:49:21 +0000 Subject: trunk-47381 --- ...opy_to_user-and-__copy_from_user-performa.patch | 1496 ++++++++++++++++++++ 1 file changed, 1496 insertions(+) create mode 100644 target/linux/brcm2708/patches-4.1/0070-Improve-__copy_to_user-and-__copy_from_user-performa.patch (limited to 'target/linux/brcm2708/patches-4.1/0070-Improve-__copy_to_user-and-__copy_from_user-performa.patch') diff --git a/target/linux/brcm2708/patches-4.1/0070-Improve-__copy_to_user-and-__copy_from_user-performa.patch b/target/linux/brcm2708/patches-4.1/0070-Improve-__copy_to_user-and-__copy_from_user-performa.patch new file mode 100644 index 0000000..3e12a13 --- /dev/null +++ b/target/linux/brcm2708/patches-4.1/0070-Improve-__copy_to_user-and-__copy_from_user-performa.patch @@ -0,0 +1,1496 @@ +From 5929da3d7ce831709884de3e3564de028fafc28c Mon Sep 17 00:00:00 2001 +From: Phil Elwell +Date: Mon, 13 Oct 2014 11:47:53 +0100 +Subject: [PATCH 070/203] Improve __copy_to_user and __copy_from_user + performance + +Provide a __copy_from_user that uses memcpy. On BCM2708, use +optimised memcpy/memmove/memcmp/memset implementations. 
+--- + arch/arm/include/asm/string.h | 5 + + arch/arm/include/asm/uaccess.h | 1 + + arch/arm/lib/Makefile | 15 +- + arch/arm/lib/arm-mem.h | 159 ++++++++++++ + arch/arm/lib/copy_from_user.S | 4 +- + arch/arm/lib/exports_rpi.c | 37 +++ + arch/arm/lib/memcmp_rpi.S | 285 +++++++++++++++++++++ + arch/arm/lib/memcpy_rpi.S | 59 +++++ + arch/arm/lib/memcpymove.h | 506 +++++++++++++++++++++++++++++++++++++ + arch/arm/lib/memmove_rpi.S | 61 +++++ + arch/arm/lib/memset_rpi.S | 121 +++++++++ + arch/arm/lib/uaccess_with_memcpy.c | 112 +++++++- + 12 files changed, 1359 insertions(+), 6 deletions(-) + create mode 100644 arch/arm/lib/arm-mem.h + create mode 100644 arch/arm/lib/exports_rpi.c + create mode 100644 arch/arm/lib/memcmp_rpi.S + create mode 100644 arch/arm/lib/memcpy_rpi.S + create mode 100644 arch/arm/lib/memcpymove.h + create mode 100644 arch/arm/lib/memmove_rpi.S + create mode 100644 arch/arm/lib/memset_rpi.S + +--- a/arch/arm/include/asm/string.h ++++ b/arch/arm/include/asm/string.h +@@ -24,6 +24,11 @@ extern void * memchr(const void *, int, + #define __HAVE_ARCH_MEMSET + extern void * memset(void *, int, __kernel_size_t); + ++#ifdef CONFIG_MACH_BCM2708 ++#define __HAVE_ARCH_MEMCMP ++extern int memcmp(const void *, const void *, size_t); ++#endif ++ + extern void __memzero(void *ptr, __kernel_size_t n); + + #define memset(p,v,n) \ +--- a/arch/arm/include/asm/uaccess.h ++++ b/arch/arm/include/asm/uaccess.h +@@ -475,6 +475,7 @@ do { \ + + #ifdef CONFIG_MMU + extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n); ++extern unsigned long __must_check __copy_from_user_std(void *to, const void __user *from, unsigned long n); + extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n); + extern unsigned long __must_check __copy_to_user_std(void __user *to, const void *from, unsigned long n); + extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); +--- 
a/arch/arm/lib/Makefile ++++ b/arch/arm/lib/Makefile +@@ -6,9 +6,8 @@ + + lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \ + csumpartialcopy.o csumpartialcopyuser.o clearbit.o \ +- delay.o delay-loop.o findbit.o memchr.o memcpy.o \ +- memmove.o memset.o memzero.o setbit.o \ +- strchr.o strrchr.o \ ++ delay.o delay-loop.o findbit.o memchr.o memzero.o \ ++ setbit.o strchr.o strrchr.o \ + testchangebit.o testclearbit.o testsetbit.o \ + ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \ + ucmpdi2.o lib1funcs.o div64.o \ +@@ -18,6 +17,16 @@ lib-y := backtrace.o changebit.o csumip + mmu-y := clear_user.o copy_page.o getuser.o putuser.o \ + copy_from_user.o copy_to_user.o + ++# Choose optimised implementations for Raspberry Pi ++ifeq ($(CONFIG_MACH_BCM2708),y) ++ CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600 ++ CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672 ++ obj-$(CONFIG_MODULES) += exports_rpi.o ++ lib-y += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o ++else ++ lib-y += memcpy.o memmove.o memset.o ++endif ++ + # using lib_ here won't override already available weak symbols + obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o + +--- /dev/null ++++ b/arch/arm/lib/arm-mem.h +@@ -0,0 +1,159 @@ ++/* ++Copyright (c) 2013, Raspberry Pi Foundation ++Copyright (c) 2013, RISC OS Open Ltd ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. 
++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++.macro myfunc fname ++ .func fname ++ .global fname ++fname: ++.endm ++ ++.macro preload_leading_step1 backwards, ptr, base ++/* If the destination is already 16-byte aligned, then we need to preload ++ * between 0 and prefetch_distance (inclusive) cache lines ahead so there ++ * are no gaps when the inner loop starts. ++ */ ++ .if backwards ++ sub ptr, base, #1 ++ bic ptr, ptr, #31 ++ .else ++ bic ptr, base, #31 ++ .endif ++ .set OFFSET, 0 ++ .rept prefetch_distance+1 ++ pld [ptr, #OFFSET] ++ .if backwards ++ .set OFFSET, OFFSET-32 ++ .else ++ .set OFFSET, OFFSET+32 ++ .endif ++ .endr ++.endm ++ ++.macro preload_leading_step2 backwards, ptr, base, leading_bytes, tmp ++/* However, if the destination is not 16-byte aligned, we may need to ++ * preload one more cache line than that. The question we need to ask is: ++ * are the leading bytes more than the amount by which the source ++ * pointer will be rounded down for preloading, and if so, by how many ++ * cache lines? 
++ */ ++ .if backwards ++/* Here we compare against how many bytes we are into the ++ * cache line, counting down from the highest such address. ++ * Effectively, we want to calculate ++ * leading_bytes = dst&15 ++ * cacheline_offset = 31-((src-leading_bytes-1)&31) ++ * extra_needed = leading_bytes - cacheline_offset ++ * and test if extra_needed is <= 0, or rearranging: ++ * leading_bytes + (src-leading_bytes-1)&31 <= 31 ++ */ ++ mov tmp, base, lsl #32-5 ++ sbc tmp, tmp, leading_bytes, lsl #32-5 ++ adds tmp, tmp, leading_bytes, lsl #32-5 ++ bcc 61f ++ pld [ptr, #-32*(prefetch_distance+1)] ++ .else ++/* Effectively, we want to calculate ++ * leading_bytes = (-dst)&15 ++ * cacheline_offset = (src+leading_bytes)&31 ++ * extra_needed = leading_bytes - cacheline_offset ++ * and test if extra_needed is <= 0. ++ */ ++ mov tmp, base, lsl #32-5 ++ add tmp, tmp, leading_bytes, lsl #32-5 ++ rsbs tmp, tmp, leading_bytes, lsl #32-5 ++ bls 61f ++ pld [ptr, #32*(prefetch_distance+1)] ++ .endif ++61: ++.endm ++ ++.macro preload_trailing backwards, base, remain, tmp ++ /* We need either 0, 1 or 2 extra preloads */ ++ .if backwards ++ rsb tmp, base, #0 ++ mov tmp, tmp, lsl #32-5 ++ .else ++ mov tmp, base, lsl #32-5 ++ .endif ++ adds tmp, tmp, remain, lsl #32-5 ++ adceqs tmp, tmp, #0 ++ /* The instruction above has two effects: ensures Z is only ++ * set if C was clear (so Z indicates that both shifted quantities ++ * were 0), and clears C if Z was set (so C indicates that the sum ++ * of the shifted quantities was greater and not equal to 32) */ ++ beq 82f ++ .if backwards ++ sub tmp, base, #1 ++ bic tmp, tmp, #31 ++ .else ++ bic tmp, base, #31 ++ .endif ++ bcc 81f ++ .if backwards ++ pld [tmp, #-32*(prefetch_distance+1)] ++81: ++ pld [tmp, #-32*prefetch_distance] ++ .else ++ pld [tmp, #32*(prefetch_distance+2)] ++81: ++ pld [tmp, #32*(prefetch_distance+1)] ++ .endif ++82: ++.endm ++ ++.macro preload_all backwards, narrow_case, shift, base, remain, tmp0, tmp1 ++ .if backwards ++ 
sub tmp0, base, #1 ++ bic tmp0, tmp0, #31 ++ pld [tmp0] ++ sub tmp1, base, remain, lsl #shift ++ .else ++ bic tmp0, base, #31 ++ pld [tmp0] ++ add tmp1, base, remain, lsl #shift ++ sub tmp1, tmp1, #1 ++ .endif ++ bic tmp1, tmp1, #31 ++ cmp tmp1, tmp0 ++ beq 92f ++ .if narrow_case ++ /* In this case, all the data fits in either 1 or 2 cache lines */ ++ pld [tmp1] ++ .else ++91: ++ .if backwards ++ sub tmp0, tmp0, #32 ++ .else ++ add tmp0, tmp0, #32 ++ .endif ++ cmp tmp0, tmp1 ++ pld [tmp0] ++ bne 91b ++ .endif ++92: ++.endm +--- a/arch/arm/lib/copy_from_user.S ++++ b/arch/arm/lib/copy_from_user.S +@@ -89,11 +89,13 @@ + + .text + +-ENTRY(__copy_from_user) ++ENTRY(__copy_from_user_std) ++WEAK(__copy_from_user) + + #include "copy_template.S" + + ENDPROC(__copy_from_user) ++ENDPROC(__copy_from_user_std) + + .pushsection .fixup,"ax" + .align 0 +--- /dev/null ++++ b/arch/arm/lib/exports_rpi.c +@@ -0,0 +1,37 @@ ++/** ++ * Copyright (c) 2014, Raspberry Pi (Trading) Ltd. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions, and the following disclaimer, ++ * without modification. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The names of the above-listed copyright holders may not be used ++ * to endorse or promote products derived from this software without ++ * specific prior written permission. ++ * ++ * ALTERNATIVELY, this software may be distributed under the terms of the ++ * GNU General Public License ("GPL") version 2, as published by the Free ++ * Software Foundation. 
++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS ++ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ++ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/module.h> ++ ++EXPORT_SYMBOL(memcmp); +--- /dev/null ++++ b/arch/arm/lib/memcmp_rpi.S +@@ -0,0 +1,285 @@ ++/* ++Copyright (c) 2013, Raspberry Pi Foundation ++Copyright (c) 2013, RISC OS Open Ltd ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission.
++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#include <linux/linkage.h> ++#include "arm-mem.h" ++ ++/* Prevent the stack from becoming executable */ ++#if defined(__linux__) && defined(__ELF__) ++.section .note.GNU-stack,"",%progbits ++#endif ++ ++ .text ++ .arch armv6 ++ .object_arch armv4 ++ .arm ++ .altmacro ++ .p2align 2 ++ ++.macro memcmp_process_head unaligned ++ .if unaligned ++ ldr DAT0, [S_1], #4 ++ ldr DAT1, [S_1], #4 ++ ldr DAT2, [S_1], #4 ++ ldr DAT3, [S_1], #4 ++ .else ++ ldmia S_1!, {DAT0, DAT1, DAT2, DAT3} ++ .endif ++ ldmia S_2!, {DAT4, DAT5, DAT6, DAT7} ++.endm ++ ++.macro memcmp_process_tail ++ cmp DAT0, DAT4 ++ cmpeq DAT1, DAT5 ++ cmpeq DAT2, DAT6 ++ cmpeq DAT3, DAT7 ++ bne 200f ++.endm ++ ++.macro memcmp_leading_31bytes ++ movs DAT0, OFF, lsl #31 ++ ldrmib DAT0, [S_1], #1 ++ ldrcsh DAT1, [S_1], #2 ++ ldrmib DAT4, [S_2], #1 ++ ldrcsh DAT5, [S_2], #2 ++ movpl DAT0, #0 ++ movcc DAT1, #0 ++ movpl DAT4, #0 ++ movcc DAT5, #0 ++ submi N, N, #1 ++ subcs N, N, #2 ++ cmp DAT0, DAT4 ++ cmpeq DAT1, DAT5 ++ bne 200f ++ movs DAT0, OFF, lsl #29 ++ ldrmi DAT0, [S_1], #4 ++ ldrcs DAT1, [S_1], #4 ++ ldrcs DAT2, [S_1], #4 ++ ldrmi DAT4, [S_2], #4 ++ ldmcsia S_2!, {DAT5, DAT6} ++ movpl DAT0, #0 ++ movcc DAT1, #0 ++ movcc DAT2, #0 ++ movpl DAT4, #0 ++
movcc DAT5, #0 ++ movcc DAT6, #0 ++ submi N, N, #4 ++ subcs N, N, #8 ++ cmp DAT0, DAT4 ++ cmpeq DAT1, DAT5 ++ cmpeq DAT2, DAT6 ++ bne 200f ++ tst OFF, #16 ++ beq 105f ++ memcmp_process_head 1 ++ sub N, N, #16 ++ memcmp_process_tail ++105: ++.endm ++ ++.macro memcmp_trailing_15bytes unaligned ++ movs N, N, lsl #29 ++ .if unaligned ++ ldrcs DAT0, [S_1], #4 ++ ldrcs DAT1, [S_1], #4 ++ .else ++ ldmcsia S_1!, {DAT0, DAT1} ++ .endif ++ ldrmi DAT2, [S_1], #4 ++ ldmcsia S_2!, {DAT4, DAT5} ++ ldrmi DAT6, [S_2], #4 ++ movcc DAT0, #0 ++ movcc DAT1, #0 ++ movpl DAT2, #0 ++ movcc DAT4, #0 ++ movcc DAT5, #0 ++ movpl DAT6, #0 ++ cmp DAT0, DAT4 ++ cmpeq DAT1, DAT5 ++ cmpeq DAT2, DAT6 ++ bne 200f ++ movs N, N, lsl #2 ++ ldrcsh DAT0, [S_1], #2 ++ ldrmib DAT1, [S_1] ++ ldrcsh DAT4, [S_2], #2 ++ ldrmib DAT5, [S_2] ++ movcc DAT0, #0 ++ movpl DAT1, #0 ++ movcc DAT4, #0 ++ movpl DAT5, #0 ++ cmp DAT0, DAT4 ++ cmpeq DAT1, DAT5 ++ bne 200f ++.endm ++ ++.macro memcmp_long_inner_loop unaligned ++110: ++ memcmp_process_head unaligned ++ pld [S_2, #prefetch_distance*32 + 16] ++ memcmp_process_tail ++ memcmp_process_head unaligned ++ pld [S_1, OFF] ++ memcmp_process_tail ++ subs N, N, #32 ++ bhs 110b ++ /* Just before the final (prefetch_distance+1) 32-byte blocks, ++ * deal with final preloads */ ++ preload_trailing 0, S_1, N, DAT0 ++ preload_trailing 0, S_2, N, DAT0 ++ add N, N, #(prefetch_distance+2)*32 - 16 ++120: ++ memcmp_process_head unaligned ++ memcmp_process_tail ++ subs N, N, #16 ++ bhs 120b ++ /* Trailing words and bytes */ ++ tst N, #15 ++ beq 199f ++ memcmp_trailing_15bytes unaligned ++199: /* Reached end without detecting a difference */ ++ mov a1, #0 ++ setend le ++ pop {DAT1-DAT6, pc} ++.endm ++ ++.macro memcmp_short_inner_loop unaligned ++ subs N, N, #16 /* simplifies inner loop termination */ ++ blo 122f ++120: ++ memcmp_process_head unaligned ++ memcmp_process_tail ++ subs N, N, #16 ++ bhs 120b ++122: /* Trailing words and bytes */ ++ tst N, #15 ++ beq 199f ++ 
memcmp_trailing_15bytes unaligned ++199: /* Reached end without detecting a difference */ ++ mov a1, #0 ++ setend le ++ pop {DAT1-DAT6, pc} ++.endm ++ ++/* ++ * int memcmp(const void *s1, const void *s2, size_t n); ++ * On entry: ++ * a1 = pointer to buffer 1 ++ * a2 = pointer to buffer 2 ++ * a3 = number of bytes to compare (as unsigned chars) ++ * On exit: ++ * a1 = >0/=0/<0 if s1 >/=/< s2 ++ */ ++ ++.set prefetch_distance, 2 ++ ++ENTRY(memcmp) ++ S_1 .req a1 ++ S_2 .req a2 ++ N .req a3 ++ DAT0 .req a4 ++ DAT1 .req v1 ++ DAT2 .req v2 ++ DAT3 .req v3 ++ DAT4 .req v4 ++ DAT5 .req v5 ++ DAT6 .req v6 ++ DAT7 .req ip ++ OFF .req lr ++ ++ push {DAT1-DAT6, lr} ++ setend be /* lowest-addressed bytes are most significant */ ++ ++ /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */ ++ cmp N, #(prefetch_distance+3)*32 - 1 ++ blo 170f ++ ++ /* Long case */ ++ /* Adjust N so that the decrement instruction can also test for ++ * inner loop termination. We want it to stop when there are ++ * (prefetch_distance+1) complete blocks to go. */ ++ sub N, N, #(prefetch_distance+2)*32 ++ preload_leading_step1 0, DAT0, S_1 ++ preload_leading_step1 0, DAT1, S_2 ++ tst S_2, #31 ++ beq 154f ++ rsb OFF, S_2, #0 /* no need to AND with 15 here */ ++ preload_leading_step2 0, DAT0, S_1, OFF, DAT2 ++ preload_leading_step2 0, DAT1, S_2, OFF, DAT2 ++ memcmp_leading_31bytes ++154: /* Second source now cacheline (32-byte) aligned; we have at ++ * least one prefetch to go. 
*/ ++ /* Prefetch offset is best selected such that it lies in the ++ * first 8 of each 32 bytes - but it's just as easy to aim for ++ * the first one */ ++ and OFF, S_1, #31 ++ rsb OFF, OFF, #32*prefetch_distance ++ tst S_1, #3 ++ bne 140f ++ memcmp_long_inner_loop 0 ++140: memcmp_long_inner_loop 1 ++ ++170: /* Short case */ ++ teq N, #0 ++ beq 199f ++ preload_all 0, 0, 0, S_1, N, DAT0, DAT1 ++ preload_all 0, 0, 0, S_2, N, DAT0, DAT1 ++ tst S_2, #3 ++ beq 174f ++172: subs N, N, #1 ++ blo 199f ++ ldrb DAT0, [S_1], #1 ++ ldrb DAT4, [S_2], #1 ++ cmp DAT0, DAT4 ++ bne 200f ++ tst S_2, #3 ++ bne 172b ++174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */ ++ tst S_1, #3 ++ bne 140f ++ memcmp_short_inner_loop 0 ++140: memcmp_short_inner_loop 1 ++ ++200: /* Difference found: determine sign. */ ++ movhi a1, #1 ++ movlo a1, #-1 ++ setend le ++ pop {DAT1-DAT6, pc} ++ ++ .unreq S_1 ++ .unreq S_2 ++ .unreq N ++ .unreq DAT0 ++ .unreq DAT1 ++ .unreq DAT2 ++ .unreq DAT3 ++ .unreq DAT4 ++ .unreq DAT5 ++ .unreq DAT6 ++ .unreq DAT7 ++ .unreq OFF ++ENDPROC(memcmp) +--- /dev/null ++++ b/arch/arm/lib/memcpy_rpi.S +@@ -0,0 +1,59 @@ ++/* ++Copyright (c) 2013, Raspberry Pi Foundation ++Copyright (c) 2013, RISC OS Open Ltd ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. 
++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#include <linux/linkage.h> ++#include "arm-mem.h" ++#include "memcpymove.h" ++ ++/* Prevent the stack from becoming executable */ ++#if defined(__linux__) && defined(__ELF__) ++.section .note.GNU-stack,"",%progbits ++#endif ++ ++ .text ++ .arch armv6 ++ .object_arch armv4 ++ .arm ++ .altmacro ++ .p2align 2 ++ ++/* ++ * void *memcpy(void * restrict s1, const void * restrict s2, size_t n); ++ * On entry: ++ * a1 = pointer to destination ++ * a2 = pointer to source ++ * a3 = number of bytes to copy ++ * On exit: ++ * a1 preserved ++ */ ++ ++.set prefetch_distance, 3 ++ ++ENTRY(memcpy) ++ memcpy 0 ++ENDPROC(memcpy) +--- /dev/null ++++ b/arch/arm/lib/memcpymove.h +@@ -0,0 +1,506 @@ ++/* ++Copyright (c) 2013, Raspberry Pi Foundation ++Copyright (c) 2013, RISC OS Open Ltd ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8 ++ .if words == 1 ++ .if backwards ++ mov r1, r0, lsl #32-align*8 ++ ldr r0, [S, #-4]! ++ orr r1, r1, r0, lsr #align*8 ++ str r1, [D, #-4]! ++ .else ++ mov r0, r1, lsr #align*8 ++ ldr r1, [S, #4]! ++ orr r0, r0, r1, lsl #32-align*8 ++ str r0, [D], #4 ++ .endif ++ .elseif words == 2 ++ .if backwards ++ ldr r1, [S, #-4]! ++ mov r2, r0, lsl #32-align*8 ++ ldr r0, [S, #-4]! ++ orr r2, r2, r1, lsr #align*8 ++ mov r1, r1, lsl #32-align*8 ++ orr r1, r1, r0, lsr #align*8 ++ stmdb D!, {r1, r2} ++ .else ++ ldr r1, [S, #4]! ++ mov r0, r2, lsr #align*8 ++ ldr r2, [S, #4]! 
++ orr r0, r0, r1, lsl #32-align*8 ++ mov r1, r1, lsr #align*8 ++ orr r1, r1, r2, lsl #32-align*8 ++ stmia D!, {r0, r1} ++ .endif ++ .elseif words == 4 ++ .if backwards ++ ldmdb S!, {r2, r3} ++ mov r4, r0, lsl #32-align*8 ++ ldmdb S!, {r0, r1} ++ orr r4, r4, r3, lsr #align*8 ++ mov r3, r3, lsl #32-align*8 ++ orr r3, r3, r2, lsr #align*8 ++ mov r2, r2, lsl #32-align*8 ++ orr r2, r2, r1, lsr #align*8 ++ mov r1, r1, lsl #32-align*8 ++ orr r1, r1, r0, lsr #align*8 ++ stmdb D!, {r1, r2, r3, r4} ++ .else ++ ldmib S!, {r1, r2} ++ mov r0, r4, lsr #align*8 ++ ldmib S!, {r3, r4} ++ orr r0, r0, r1, lsl #32-align*8 ++ mov r1, r1, lsr #align*8 ++ orr r1, r1, r2, lsl #32-align*8 ++ mov r2, r2, lsr #align*8 ++ orr r2, r2, r3, lsl #32-align*8 ++ mov r3, r3, lsr #align*8 ++ orr r3, r3, r4, lsl #32-align*8 ++ stmia D!, {r0, r1, r2, r3} ++ .endif ++ .elseif words == 8 ++ .if backwards ++ ldmdb S!, {r4, r5, r6, r7} ++ mov r8, r0, lsl #32-align*8 ++ ldmdb S!, {r0, r1, r2, r3} ++ .if use_pld ++ pld [S, OFF] ++ .endif ++ orr r8, r8, r7, lsr #align*8 ++ mov r7, r7, lsl #32-align*8 ++ orr r7, r7, r6, lsr #align*8 ++ mov r6, r6, lsl #32-align*8 ++ orr r6, r6, r5, lsr #align*8 ++ mov r5, r5, lsl #32-align*8 ++ orr r5, r5, r4, lsr #align*8 ++ mov r4, r4, lsl #32-align*8 ++ orr r4, r4, r3, lsr #align*8 ++ mov r3, r3, lsl #32-align*8 ++ orr r3, r3, r2, lsr #align*8 ++ mov r2, r2, lsl #32-align*8 ++ orr r2, r2, r1, lsr #align*8 ++ mov r1, r1, lsl #32-align*8 ++ orr r1, r1, r0, lsr #align*8 ++ stmdb D!, {r5, r6, r7, r8} ++ stmdb D!, {r1, r2, r3, r4} ++ .else ++ ldmib S!, {r1, r2, r3, r4} ++ mov r0, r8, lsr #align*8 ++ ldmib S!, {r5, r6, r7, r8} ++ .if use_pld ++ pld [S, OFF] ++ .endif ++ orr r0, r0, r1, lsl #32-align*8 ++ mov r1, r1, lsr #align*8 ++ orr r1, r1, r2, lsl #32-align*8 ++ mov r2, r2, lsr #align*8 ++ orr r2, r2, r3, lsl #32-align*8 ++ mov r3, r3, lsr #align*8 ++ orr r3, r3, r4, lsl #32-align*8 ++ mov r4, r4, lsr #align*8 ++ orr r4, r4, r5, lsl #32-align*8 ++ mov r5, r5, lsr #align*8 ++ 
orr r5, r5, r6, lsl #32-align*8 ++ mov r6, r6, lsr #align*8 ++ orr r6, r6, r7, lsl #32-align*8 ++ mov r7, r7, lsr #align*8 ++ orr r7, r7, r8, lsl #32-align*8 ++ stmia D!, {r0, r1, r2, r3} ++ stmia D!, {r4, r5, r6, r7} ++ .endif ++ .endif ++.endm ++ ++.macro memcpy_leading_15bytes backwards, align ++ movs DAT1, DAT2, lsl #31 ++ sub N, N, DAT2 ++ .if backwards ++ ldrmib DAT0, [S, #-1]! ++ ldrcsh DAT1, [S, #-2]! ++ strmib DAT0, [D, #-1]! ++ strcsh DAT1, [D, #-2]! ++ .else ++ ldrmib DAT0, [S], #1 ++ ldrcsh DAT1, [S], #2 ++ strmib DAT0, [D], #1 ++ strcsh DAT1, [D], #2 ++ .endif ++ movs DAT1, DAT2, lsl #29 ++ .if backwards ++ ldrmi DAT0, [S, #-4]! ++ .if align == 0 ++ ldmcsdb S!, {DAT1, DAT2} ++ .else ++ ldrcs DAT2, [S, #-4]! ++ ldrcs DAT1, [S, #-4]! ++ .endif ++ strmi DAT0, [D, #-4]! ++ stmcsdb D!, {DAT1, DAT2} ++ .else ++ ldrmi DAT0, [S], #4 ++ .if align == 0 ++ ldmcsia S!, {DAT1, DAT2} ++ .else ++ ldrcs DAT1, [S], #4 ++ ldrcs DAT2, [S], #4 ++ .endif ++ strmi DAT0, [D], #4 ++ stmcsia D!, {DAT1, DAT2} ++ .endif ++.endm ++ ++.macro memcpy_trailing_15bytes backwards, align ++ movs N, N, lsl #29 ++ .if backwards ++ .if align == 0 ++ ldmcsdb S!, {DAT0, DAT1} ++ .else ++ ldrcs DAT1, [S, #-4]! ++ ldrcs DAT0, [S, #-4]! ++ .endif ++ ldrmi DAT2, [S, #-4]! ++ stmcsdb D!, {DAT0, DAT1} ++ strmi DAT2, [D, #-4]! ++ .else ++ .if align == 0 ++ ldmcsia S!, {DAT0, DAT1} ++ .else ++ ldrcs DAT0, [S], #4 ++ ldrcs DAT1, [S], #4 ++ .endif ++ ldrmi DAT2, [S], #4 ++ stmcsia D!, {DAT0, DAT1} ++ strmi DAT2, [D], #4 ++ .endif ++ movs N, N, lsl #2 ++ .if backwards ++ ldrcsh DAT0, [S, #-2]! ++ ldrmib DAT1, [S, #-1] ++ strcsh DAT0, [D, #-2]! ++ strmib DAT1, [D, #-1] ++ .else ++ ldrcsh DAT0, [S], #2 ++ ldrmib DAT1, [S] ++ strcsh DAT0, [D], #2 ++ strmib DAT1, [D] ++ .endif ++.endm ++ ++.macro memcpy_long_inner_loop backwards, align ++ .if align != 0 ++ .if backwards ++ ldr DAT0, [S, #-align]! ++ .else ++ ldr LAST, [S, #-align]! 
++ .endif ++ .endif ++110: ++ .if align == 0 ++ .if backwards ++ ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} ++ pld [S, OFF] ++ stmdb D!, {DAT4, DAT5, DAT6, LAST} ++ stmdb D!, {DAT0, DAT1, DAT2, DAT3} ++ .else ++ ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} ++ pld [S, OFF] ++ stmia D!, {DAT0, DAT1, DAT2, DAT3} ++ stmia D!, {DAT4, DAT5, DAT6, LAST} ++ .endif ++ .else ++ unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST ++ .endif ++ subs N, N, #32 ++ bhs 110b ++ /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ ++ preload_trailing backwards, S, N, OFF ++ add N, N, #(prefetch_distance+2)*32 - 32 ++120: ++ .if align == 0 ++ .if backwards ++ ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} ++ stmdb D!, {DAT4, DAT5, DAT6, LAST} ++ stmdb D!, {DAT0, DAT1, DAT2, DAT3} ++ .else ++ ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} ++ stmia D!, {DAT0, DAT1, DAT2, DAT3} ++ stmia D!, {DAT4, DAT5, DAT6, LAST} ++ .endif ++ .else ++ unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST ++ .endif ++ subs N, N, #32 ++ bhs 120b ++ tst N, #16 ++ .if align == 0 ++ .if backwards ++ ldmnedb S!, {DAT0, DAT1, DAT2, LAST} ++ stmnedb D!, {DAT0, DAT1, DAT2, LAST} ++ .else ++ ldmneia S!, {DAT0, DAT1, DAT2, LAST} ++ stmneia D!, {DAT0, DAT1, DAT2, LAST} ++ .endif ++ .else ++ beq 130f ++ unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST ++130: ++ .endif ++ /* Trailing words and bytes */ ++ tst N, #15 ++ beq 199f ++ .if align != 0 ++ add S, S, #align ++ .endif ++ memcpy_trailing_15bytes backwards, align ++199: ++ pop {DAT3, DAT4, DAT5, DAT6, DAT7} ++ pop {D, DAT1, DAT2, pc} ++.endm ++ ++.macro memcpy_medium_inner_loop backwards, align ++120: ++ .if backwards ++ .if align == 0 ++ ldmdb S!, {DAT0, DAT1, DAT2, LAST} ++ .else ++ ldr LAST, [S, #-4]! ++ ldr DAT2, [S, #-4]! ++ ldr DAT1, [S, #-4]! ++ ldr DAT0, [S, #-4]! 
++  .endif
++        stmdb   D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++  .if align == 0
++        ldmia   S!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldr     DAT0, [S], #4
++        ldr     DAT1, [S], #4
++        ldr     DAT2, [S], #4
++        ldr     LAST, [S], #4
++  .endif
++        stmia   D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++        subs    N, N, #16
++        bhs     120b
++        /* Trailing words and bytes */
++        tst     N, #15
++        beq     199f
++        memcpy_trailing_15bytes  backwards, align
++199:
++        pop     {D, DAT1, DAT2, pc}
++.endm
++
++.macro memcpy_short_inner_loop  backwards, align  /* copies at most one 16-byte block plus a tail, then returns; align == 0 means S is word-aligned */
++        tst     N, #16                  /* Z clear (NE) when bit 4 of N is set, i.e. one whole 16-byte block remains */
++ .if backwards
++  .if align == 0
++        ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldrne   LAST, [S, #-4]!         /* highest word first, so the registers end up in ascending address order for stmnedb */
++        ldrne   DAT2, [S, #-4]!
++        ldrne   DAT1, [S, #-4]!
++        ldrne   DAT0, [S, #-4]!
++  .endif
++        stmnedb D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++  .if align == 0
++        ldmneia S!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldrne   DAT0, [S], #4           /* word-by-word loads instead of LDM - presumably because S is not word-aligned here; confirm */
++        ldrne   DAT1, [S], #4
++        ldrne   DAT2, [S], #4
++        ldrne   LAST, [S], #4
++  .endif
++        stmneia D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++        memcpy_trailing_15bytes  backwards, align  /* helper defined outside this excerpt; by its name it handles the final N & 15 bytes */
++199:
++        pop     {D, DAT1, DAT2, pc}     /* reloads a1 with the destination saved by the push at the top of the memcpy macro and returns via pc */
++.endm
++
++.macro memcpy backwards                 /* body shared by the copy routines; backwards != 0 selects a descending copy */
++    D       .req    a1                  /* destination pointer */
++    S       .req    a2                  /* source pointer */
++    N       .req    a3                  /* byte count */
++    DAT0    .req    a4
++    DAT1    .req    v1
++    DAT2    .req    v2
++    DAT3    .req    v3
++    DAT4    .req    v4
++    DAT5    .req    v5
++    DAT6    .req    v6
++    DAT7    .req    sl
++    LAST    .req    ip
++    OFF     .req    lr                  /* lr is pushed below, so it can be reused as a scratch/offset register */
++
++        .cfi_startproc
++
++        push    {D, DAT1, DAT2, lr}     /* D is saved so the final pop can hand the original destination back in a1 */
++
++        .cfi_def_cfa_offset 16
++        .cfi_rel_offset D, 0
++        .cfi_undefined  S
++        .cfi_undefined  N
++        .cfi_undefined  DAT0
++        .cfi_rel_offset DAT1, 4
++        .cfi_rel_offset DAT2, 8
++        .cfi_undefined  LAST
++        .cfi_rel_offset lr, 12
++
++ .if backwards
++        add     D, D, N                 /* descending copy: start from one past the end of each buffer */
++        add     S, S, N
++ .endif
++
++        /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
++        cmp     N, #31
++        blo     170f                    /* fewer than 31 bytes: short case */
++        /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
++        cmp     N, #(prefetch_distance+3)*32 - 1
++        blo     160f                    /* not enough data for the prefetching loop: medium case */
++
++        /* Long case */
++        push    {DAT3, DAT4, DAT5, DAT6, DAT7}  /* five more registers: frame grows from 16 to 36 bytes */
++
++        .cfi_def_cfa_offset 36
++        .cfi_rel_offset D, 20
++        .cfi_rel_offset DAT1, 24
++        .cfi_rel_offset DAT2, 28
++        .cfi_rel_offset DAT3, 0
++        .cfi_rel_offset DAT4, 4
++        .cfi_rel_offset DAT5, 8
++        .cfi_rel_offset DAT6, 12
++        .cfi_rel_offset DAT7, 16
++        .cfi_rel_offset lr, 32
++
++        /* Adjust N so that the decrement instruction can also test for
++         * inner loop termination. We want it to stop when there are
++         * (prefetch_distance+1) complete blocks to go. */
++        sub     N, N, #(prefetch_distance+2)*32
++        preload_leading_step1  backwards, DAT0, S
++ .if backwards
++        /* Bug in GAS: it accepts, but mis-assembles the instruction
++         * ands    DAT2, D, #60, 2
++         * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
++         */
++        .word   0xE210513C
++        beq     154f
++ .else
++        ands    DAT2, D, #15
++        beq     154f
++        rsb     DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
++ .endif
++        preload_leading_step2  backwards, DAT0, S, DAT2, OFF
++        memcpy_leading_15bytes backwards, 1
++154:    /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
++        /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
++ .if backwards
++        rsb     OFF, S, #3
++        and     OFF, OFF, #28
++        sub     OFF, OFF, #32*(prefetch_distance+1)
++ .else
++        and     OFF, S, #28
++        rsb     OFF, OFF, #32*prefetch_distance
++ .endif
++        movs    DAT0, S, lsl #31        /* N flag = bit 0 of S, C flag = bit 1 of S, Z set when bit 0 is clear */
++        bhi     157f                    /* C set and Z clear: S & 3 == 3 */
++        bcs     156f                    /* C set, bit 0 clear: S & 3 == 2 */
++        bmi     155f                    /* bit 0 set, bit 1 clear: S & 3 == 1 */
++        memcpy_long_inner_loop  backwards, 0  /* defined outside this excerpt; each expansion is expected to return by itself, as the medium/short loops do - confirm */
++155:    memcpy_long_inner_loop  backwards, 1
++156:    memcpy_long_inner_loop  backwards, 2
++157:    memcpy_long_inner_loop  backwards, 3
++
++        .cfi_def_cfa_offset 16          /* the medium and short paths are reached before the second push, so only the 16-byte frame exists */
++        .cfi_rel_offset D, 0
++        .cfi_rel_offset DAT1, 4
++        .cfi_rel_offset DAT2, 8
++        .cfi_same_value DAT3
++        .cfi_same_value DAT4
++        .cfi_same_value DAT5
++        .cfi_same_value DAT6
++        .cfi_same_value DAT7
++        .cfi_rel_offset lr, 12
++
++160:    /* Medium case */
++        preload_all  backwards, 0, 0, S, N, DAT2, OFF
++        sub     N, N, #16     /* simplifies inner loop termination */
++ .if backwards
++        ands    DAT2, D, #15            /* descending copy: D & 15 is already the number of leading bytes */
++        beq     164f
++ .else
++        ands    DAT2, D, #15
++        beq     164f
++        rsb     DAT2, DAT2, #16         /* ascending copy: 16 - (D & 15) leading bytes */
++ .endif
++        memcpy_leading_15bytes backwards, align  /* NOTE(review): `align` is not a parameter of this macro (the long case passes a literal 1); how it resolves depends on memcpy_leading_15bytes, which is outside this excerpt - confirm */
++164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
++        tst     S, #3
++        bne     140f                    /* source not word-aligned: use the align=1 expansion */
++        memcpy_medium_inner_loop  backwards, 0
++140:    memcpy_medium_inner_loop  backwards, 1
++
++170:    /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
++        teq     N, #0
++        beq     199f                    /* nothing to copy: jump to the next 199 label (pop and return) */
++        preload_all  backwards, 1, 0, S, N, DAT2, LAST
++        tst     D, #3
++        beq     174f
++172:    subs    N, N, #1                /* copy single bytes until D is 4-byte aligned ... */
++        blo     199f                    /* ... or the count runs out first */
++ .if backwards
++        ldrb    DAT0, [S, #-1]!
++        strb    DAT0, [D, #-1]!
++ .else
++        ldrb    DAT0, [S], #1
++        strb    DAT0, [D], #1
++ .endif
++        tst     D, #3
++        bne     172b
++174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
++        tst     S, #3
++        bne     140f                    /* numeric local label reused: this targets the 140 defined just below */
++        memcpy_short_inner_loop  backwards, 0
++140:    memcpy_short_inner_loop  backwards, 1
++
++        .cfi_endproc
++
++    .unreq  D
++    .unreq  S
++    .unreq  N
++    .unreq  DAT0
++    .unreq  DAT1
++    .unreq  DAT2
++    .unreq  DAT3
++    .unreq  DAT4
++    .unreq  DAT5
++    .unreq  DAT6
++    .unreq  DAT7
++    .unreq  LAST
++    .unreq  OFF
++.endm
+--- /dev/null
++++ b/arch/arm/lib/memmove_rpi.S
+@@ -0,0 +1,61 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>              /* ENTRY() / ENDPROC() */
++#include "arm-mem.h"
++#include "memcpymove.h"                 /* the memcpy copy-body macro expanded below */
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++    .text
++    .arch armv6
++    .object_arch armv4
++    .arm
++    .altmacro
++    .p2align 2
++
++/*
++ * void *memmove(void *s1, const void *s2, size_t n);
++ * On entry:
++ * a1 = pointer to destination
++ * a2 = pointer to source
++ * a3 = number of bytes to copy
++ * On exit:
++ * a1 preserved
++ */
++
++.set prefetch_distance, 3               /* read by the memcpy macro when it sizes its long/medium thresholds and prefetch offset */
++
++ENTRY(memmove)
++        cmp     a2, a1                  /* flags from source - destination */
++        bpl     memcpy  /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
++        memcpy  1                       /* fall-through when the difference is negative: expand the copy macro with backwards=1 (descending copy) */
++ENDPROC(memmove)
+--- /dev/null
++++ b/arch/arm/lib/memset_rpi.S
+@@ -0,0 +1,121 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*/
++
++#include <linux/linkage.h>              /* ENTRY() / ENDPROC() */
++#include "arm-mem.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++    .text
++    .arch armv6
++    .object_arch armv4
++    .arm
++    .altmacro
++    .p2align 2
++
++/*
++ * void *memset(void *s, int c, size_t n);
++ * On entry:
++ * a1 = pointer to buffer to fill
++ * a2 = byte pattern to fill with (caller-narrowed)
++ * a3 = number of bytes to fill
++ * On exit:
++ * a1 preserved
++ */
++ENTRY(memset)
++    S       .req    a1
++    DAT0    .req    a2
++    N       .req    a3
++    DAT1    .req    a4
++    DAT2    .req    ip
++    DAT3    .req    lr
++
++        orr     DAT0, DAT0, lsl #8      /* replicate the fill byte into the low halfword; NOTE(review): a2 is not masked to 8 bits, so any higher bits would end up in the pattern */
++        push    {S, lr}                 /* keep the original pointer for the return value; lr becomes usable as DAT3 */
++        orr     DAT0, DAT0, lsl #16     /* ... and into all four bytes of the word */
++        mov     DAT1, DAT0
++
++        /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
++        cmp     N, #31
++        blo     170f
++
++161:    sub     N, N, #16     /* simplifies inner loop termination */
++        /* Leading words and bytes */
++        tst     S, #15
++        beq     164f
++        rsb     DAT3, S, #0   /* bits 0-3 = number of leading bytes until aligned */
++        movs    DAT2, DAT3, lsl #31     /* N flag = bit 0, C flag = bit 1 of the leading count */
++        submi   N, N, #1
++        strmib  DAT0, [S], #1           /* bit 0 set: one byte */
++        subcs   N, N, #2
++        strcsh  DAT0, [S], #2           /* bit 1 set: one halfword */
++        movs    DAT2, DAT3, lsl #29     /* N flag = bit 2, C flag = bit 3 of the leading count */
++        submi   N, N, #4
++        strmi   DAT0, [S], #4           /* bit 2 set: one word */
++        subcs   N, N, #8
++        stmcsia S!, {DAT0, DAT1}        /* bit 3 set: two words */
++164:    /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
++        mov     DAT2, DAT0
++        mov     DAT3, DAT0
++        /* Now the inner loop of 16-byte stores */
++165:    stmia   S!, {DAT0, DAT1, DAT2, DAT3}
++        subs    N, N, #16
++        bhs     165b                    /* keep going until the subtraction borrows */
++166:    /* Trailing words and bytes */
++        movs    N, N, lsl #29           /* C flag = bit 3, N flag = bit 2 of the remaining count */
++        stmcsia S!, {DAT0, DAT1}
++        strmi   DAT0, [S], #4
++        movs    N, N, lsl #2            /* C flag = original bit 1, N flag = original bit 0 */
++        strcsh  DAT0, [S], #2
++        strmib  DAT0, [S]
++199:    pop     {S, pc}                 /* reload a1 with the original buffer pointer and return */
++
++170:    /* Short case */
++        mov     DAT2, DAT0
++        mov     DAT3, DAT0
++        tst     S, #3
++        beq     174f
++172:    subs    N, N, #1                /* single bytes until S is 4-byte aligned ... */
++        blo     199b                    /* ... or the count runs out first */
++        strb    DAT0, [S], #1
++        tst     S, #3
++        bne     172b
++174:    tst     N, #16                  /* bit 4 set: one whole 16-byte block remains */
++        stmneia S!, {DAT0, DAT1, DAT2, DAT3}
++        b       166b                    /* shared tail stores the low four bits of the count */
++
++    .unreq  S
++    .unreq  DAT0
++    .unreq  N
++    .unreq  DAT1
++    .unreq  DAT2
++    .unreq  DAT3
++ENDPROC(memset)
+--- a/arch/arm/lib/uaccess_with_memcpy.c
++++ b/arch/arm/lib/uaccess_with_memcpy.c
+@@ -22,6 +22,14 @@
+ #include <asm/current.h>
+ #include <asm/page.h>
+ 
++#ifndef COPY_FROM_USER_THRESHOLD
++#define COPY_FROM_USER_THRESHOLD 64	/* below this many bytes __copy_from_user() uses __copy_from_user_std() */
++#endif
++
++#ifndef COPY_TO_USER_THRESHOLD
++#define COPY_TO_USER_THRESHOLD 64	/* below this many bytes __copy_to_user() uses __copy_to_user_std() */
++#endif
++
+ static int
+ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
+ {
+@@ -85,7 +93,44 @@ pin_page_for_write(const void __user *_a
+ 	return 1;
+ }
+ 
+-static unsigned long noinline
++static int	/* 1: page usable, *ptep mapped and *ptlp held; 0: caller must touch the page and retry */
++pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
++{
++	unsigned long addr = (unsigned long)_addr;
++	pgd_t *pgd;
++	pmd_t *pmd;
++	pte_t *pte;
++	pud_t *pud;
++	spinlock_t *ptl;
++
++	pgd = pgd_offset(current->mm, addr);	/* walk the current task's page tables: pgd -> pud -> pmd -> pte */
++	if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
++	{
++		return 0;
++	}
++	pud = pud_offset(pgd, addr);
++	if (unlikely(pud_none(*pud) || pud_bad(*pud)))
++	{
++		return 0;
++	}
++
++	pmd = pmd_offset(pud, addr);
++	if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
++		return 0;
++
++	pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
++	if (unlikely(!pte_present(*pte) || !pte_young(*pte))) {	/* not present or not marked young: drop the lock and report failure */
++		pte_unmap_unlock(pte, ptl);
++		return 0;
++	}
++
++	*ptep = pte;	/* success: the caller releases these with pte_unmap_unlock() */
++	*ptlp = ptl;
++
++	return 1;
++}
++
++unsigned long noinline
+ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
+ {
+ 	int atomic;
+@@ -135,6 +180,54 @@ out:
+ 	return n;
+ }
+ 
++unsigned long noinline	/* returns the number of bytes NOT copied (0 on success) */
++__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
++{
++	int atomic;
++
++	if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {	/* address limit is KERNEL_DS: plain memcpy, no page pinning */
++		memcpy(to, (const void *)from, n);
++		return 0;
++	}
++
++	/* the mmap semaphore is taken only if not in an atomic context */
++	atomic = in_atomic();
++
++	if (!atomic)
++		down_read(&current->mm->mmap_sem);
++	while (n) {	/* one iteration per source page */
++		pte_t *pte;
++		spinlock_t *ptl;
++		int tocopy;
++
++		while (!pin_page_for_read(from, &pte, &ptl)) {	/* pin failed: touch the page with a one-byte read, then retry */
++			char temp;
++			if (!atomic)
++				up_read(&current->mm->mmap_sem);
++			if (__get_user(temp, (char __user *)from))
++				goto out;	/* mmap_sem already dropped above; the rest of 'to' is left unwritten - NOTE(review): confirm callers do not expect it zero-filled */
++			if (!atomic)
++				down_read(&current->mm->mmap_sem);
++		}
++
++		tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1;	/* bytes from 'from' up to the end of its page */
++		if (tocopy > n)
++			tocopy = n;
++
++		memcpy(to, (const void *)from, tocopy);	/* runs with the pte lock from pin_page_for_read() held */
++		to += tocopy;
++		from += tocopy;
++		n -= tocopy;
++
++		pte_unmap_unlock(pte, ptl);
++	}
++	if (!atomic)
++		up_read(&current->mm->mmap_sem);
++
++out:
++	return n;
++}
++
+ unsigned long
+ __copy_to_user(void __user *to, const void *from, unsigned long n)
+ {
+@@ -145,10 +238,25 @@ __copy_to_user(void __user *to, const vo
+ 	 * With frame pointer disabled, tail call optimization kicks in
+ 	 * as well making this test almost invisible.
+ 	 */
+-	if (n < 64)
++	if (n < COPY_TO_USER_THRESHOLD)
+ 		return __copy_to_user_std(to, from, n);
+ 	return __copy_to_user_memcpy(to, from, n);
+ }
++
++unsigned long	/* returns the number of bytes NOT copied, as both callees do */
++__copy_from_user(void *to, const void __user *from, unsigned long n)
++{
++	/*
++	 * This test is stubbed out of the main function above to keep
++	 * the overhead for small copies low by avoiding a large
++	 * register dump on the stack just to reload them right away.
++	 * With frame pointer disabled, tail call optimization kicks in
++	 * as well making this test almost invisible.
++	 */
++	if (n < COPY_FROM_USER_THRESHOLD)
++		return __copy_from_user_std(to, from, n);
++	return __copy_from_user_memcpy(to, from, n);
++}
+ 
+ static unsigned long noinline
+ __clear_user_memset(void __user *addr, unsigned long n)
-- cgit v1.2.3