From 5929da3d7ce831709884de3e3564de028fafc28c Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Mon, 13 Oct 2014 11:47:53 +0100 Subject: [PATCH 070/203] Improve __copy_to_user and __copy_from_user performance Provide a __copy_from_user that uses memcpy. On BCM2708, use optimised memcpy/memmove/memcmp/memset implementations. --- arch/arm/include/asm/string.h | 5 + arch/arm/include/asm/uaccess.h | 1 + arch/arm/lib/Makefile | 15 +- arch/arm/lib/arm-mem.h | 159 ++++++++++++ arch/arm/lib/copy_from_user.S | 4 +- arch/arm/lib/exports_rpi.c | 37 +++ arch/arm/lib/memcmp_rpi.S | 285 +++++++++++++++++++++ arch/arm/lib/memcpy_rpi.S | 59 +++++ arch/arm/lib/memcpymove.h | 506 +++++++++++++++++++++++++++++++++++++ arch/arm/lib/memmove_rpi.S | 61 +++++ arch/arm/lib/memset_rpi.S | 121 +++++++++ arch/arm/lib/uaccess_with_memcpy.c | 112 +++++++- 12 files changed, 1359 insertions(+), 6 deletions(-) create mode 100644 arch/arm/lib/arm-mem.h create mode 100644 arch/arm/lib/exports_rpi.c create mode 100644 arch/arm/lib/memcmp_rpi.S create mode 100644 arch/arm/lib/memcpy_rpi.S create mode 100644 arch/arm/lib/memcpymove.h create mode 100644 arch/arm/lib/memmove_rpi.S create mode 100644 arch/arm/lib/memset_rpi.S --- a/arch/arm/include/asm/string.h +++ b/arch/arm/include/asm/string.h @@ -24,6 +24,11 @@ extern void * memchr(const void *, int, #define __HAVE_ARCH_MEMSET extern void * memset(void *, int, __kernel_size_t); +#ifdef CONFIG_MACH_BCM2708 +#define __HAVE_ARCH_MEMCMP +extern int memcmp(const void *, const void *, size_t); +#endif + extern void __memzero(void *ptr, __kernel_size_t n); #define memset(p,v,n) \ --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -475,6 +475,7 @@ do { \ #ifdef CONFIG_MMU extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n); +extern unsigned long __must_check __copy_from_user_std(void *to, const void __user *from, unsigned long n); extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n); extern unsigned long __must_check __copy_to_user_std(void __user *to, const void *from, unsigned long n); extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -6,9 +6,8 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \ csumpartialcopy.o csumpartialcopyuser.o clearbit.o \ - delay.o delay-loop.o findbit.o memchr.o memcpy.o \ - memmove.o memset.o memzero.o setbit.o \ - strchr.o strrchr.o \ + delay.o delay-loop.o findbit.o memchr.o memzero.o \ + setbit.o strchr.o strrchr.o \ testchangebit.o testclearbit.o testsetbit.o \ ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \ ucmpdi2.o lib1funcs.o div64.o \ @@ -18,6 +17,16 @@ lib-y := backtrace.o changebit.o csumip mmu-y := clear_user.o copy_page.o getuser.o putuser.o \ copy_from_user.o copy_to_user.o +# Choose optimised implementations for Raspberry Pi +ifeq ($(CONFIG_MACH_BCM2708),y) + CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672 + obj-$(CONFIG_MODULES) += exports_rpi.o + lib-y += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o +else + lib-y += memcpy.o memmove.o memset.o +endif + # using lib_ here won't override already available weak symbols obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o --- /dev/null +++ b/arch/arm/lib/arm-mem.h @@ -0,0 +1,159 @@ +/* +Copyright (c) 2013, Raspberry Pi Foundation +Copyright (c) 2013, RISC OS Open Ltd +All rights 
reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +.macro myfunc fname + .func fname + .global fname +fname: +.endm + +.macro preload_leading_step1 backwards, ptr, base +/* If the destination is already 16-byte aligned, then we need to preload + * between 0 and prefetch_distance (inclusive) cache lines ahead so there + * are no gaps when the inner loop starts. + */ + .if backwards + sub ptr, base, #1 + bic ptr, ptr, #31 + .else + bic ptr, base, #31 + .endif + .set OFFSET, 0 + .rept prefetch_distance+1 + pld [ptr, #OFFSET] + .if backwards + .set OFFSET, OFFSET-32 + .else + .set OFFSET, OFFSET+32 + .endif + .endr +.endm + +.macro preload_leading_step2 backwards, ptr, base, leading_bytes, tmp +/* However, if the destination is not 16-byte aligned, we may need to + * preload one more cache line than that. The question we need to ask is: + * are the leading bytes more than the amount by which the source + * pointer will be rounded down for preloading, and if so, by how many + * cache lines? + */ + .if backwards +/* Here we compare against how many bytes we are into the + * cache line, counting down from the highest such address. + * Effectively, we want to calculate + * leading_bytes = dst&15 + * cacheline_offset = 31-((src-leading_bytes-1)&31) + * extra_needed = leading_bytes - cacheline_offset + * and test if extra_needed is <= 0, or rearranging: + * leading_bytes + (src-leading_bytes-1)&31 <= 31 + */ + mov tmp, base, lsl #32-5 + sbc tmp, tmp, leading_bytes, lsl #32-5 + adds tmp, tmp, leading_bytes, lsl #32-5 + bcc 61f + pld [ptr, #-32*(prefetch_distance+1)] + .else +/* Effectively, we want to calculate + * leading_bytes = (-dst)&15 + * cacheline_offset = (src+leading_bytes)&31 + * extra_needed = leading_bytes - cacheline_offset + * and test if extra_needed is <= 0. 
+ */ + mov tmp, base, lsl #32-5 + add tmp, tmp, leading_bytes, lsl #32-5 + rsbs tmp, tmp, leading_bytes, lsl #32-5 + bls 61f + pld [ptr, #32*(prefetch_distance+1)] + .endif +61: +.endm + +.macro preload_trailing backwards, base, remain, tmp + /* We need either 0, 1 or 2 extra preloads */ + .if backwards + rsb tmp, base, #0 + mov tmp, tmp, lsl #32-5 + .else + mov tmp, base, lsl #32-5 + .endif + adds tmp, tmp, remain, lsl #32-5 + adceqs tmp, tmp, #0 + /* The instruction above has two effects: ensures Z is only + * set if C was clear (so Z indicates that both shifted quantities + * were 0), and clears C if Z was set (so C indicates that the sum + * of the shifted quantities was greater and not equal to 32) */ + beq 82f + .if backwards + sub tmp, base, #1 + bic tmp, tmp, #31 + .else + bic tmp, base, #31 + .endif + bcc 81f + .if backwards + pld [tmp, #-32*(prefetch_distance+1)] +81: + pld [tmp, #-32*prefetch_distance] + .else + pld [tmp, #32*(prefetch_distance+2)] +81: + pld [tmp, #32*(prefetch_distance+1)] + .endif +82: +.endm + +.macro preload_all backwards, narrow_case, shift, base, remain, tmp0, tmp1 + .if backwards + sub tmp0, base, #1 + bic tmp0, tmp0, #31 + pld [tmp0] + sub tmp1, base, remain, lsl #shift + .else + bic tmp0, base, #31 + pld [tmp0] + add tmp1, base, remain, lsl #shift + sub tmp1, tmp1, #1 + .endif + bic tmp1, tmp1, #31 + cmp tmp1, tmp0 + beq 92f + .if narrow_case + /* In this case, all the data fits in either 1 or 2 cache lines */ + pld [tmp1] + .else +91: + .if backwards + sub tmp0, tmp0, #32 + .else + add tmp0, tmp0, #32 + .endif + cmp tmp0, tmp1 + pld [tmp0] + bne 91b + .endif +92: +.endm --- a/arch/arm/lib/copy_from_user.S +++ b/arch/arm/lib/copy_from_user.S @@ -89,11 +89,13 @@ .text -ENTRY(__copy_from_user) +ENTRY(__copy_from_user_std) +WEAK(__copy_from_user) #include "copy_template.S" ENDPROC(__copy_from_user) +ENDPROC(__copy_from_user_std) .pushsection .fixup,"ax" .align 0 --- /dev/null +++ b/arch/arm/lib/exports_rpi.c @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2014, Raspberry Pi (Trading) Ltd. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+EXPORT_SYMBOL(memcmp);
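A note on the implementation that follows: memcmp executes SETEND BE before its word loads, so the lowest-addressed byte lands in the most significant bits of each register, and a single unsigned word comparison then ranks the buffers in memory order. A minimal C sketch of the same trick (illustration only, not part of the patch; assumes a little-endian host and the GCC/Clang __builtin_bswap32 intrinsic):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Byte-reversing two words before an unsigned compare orders them the
       same way byte-wise memcmp would - the effect the assembler gets by
       loading while SETEND BE is in force. */
    static int cmp_words_like_memcmp(const unsigned char *a, const unsigned char *b)
    {
        uint32_t wa, wb;
        memcpy(&wa, a, 4);
        memcpy(&wb, b, 4);
        wa = __builtin_bswap32(wa);
        wb = __builtin_bswap32(wb);
        return (wa > wb) - (wa < wb);
    }

    int main(void)
    {
        const unsigned char x[4] = { 1, 2, 3, 4 };
        const unsigned char y[4] = { 1, 2, 4, 3 };
        assert((memcmp(x, y, 4) < 0) == (cmp_words_like_memcmp(x, y) < 0));
        return 0;
    }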
--- /dev/null
+++ b/arch/arm/lib/memcmp_rpi.S
@@ -0,0 +1,285 @@
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <linux/linkage.h>
+#include "arm-mem.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+        .text
+        .arch armv6
+        .object_arch armv4
+        .arm
+        .altmacro
+        .p2align 2
+
+.macro memcmp_process_head unaligned
+ .if unaligned
+        ldr     DAT0, [S_1], #4
+        ldr     DAT1, [S_1], #4
+        ldr     DAT2, [S_1], #4
+        ldr     DAT3, [S_1], #4
+ .else
+        ldmia   S_1!, {DAT0, DAT1, DAT2, DAT3}
+ .endif
+        ldmia   S_2!, {DAT4, DAT5, DAT6, DAT7}
+.endm
+
+.macro memcmp_process_tail
+        cmp     DAT0, DAT4
+        cmpeq   DAT1, DAT5
+        cmpeq   DAT2, DAT6
+        cmpeq   DAT3, DAT7
+        bne     200f
+.endm
+
+.macro memcmp_leading_31bytes
+        movs    DAT0, OFF, lsl #31
+        ldrmib  DAT0, [S_1], #1
+        ldrcsh  DAT1, [S_1], #2
+        ldrmib  DAT4, [S_2], #1
+        ldrcsh  DAT5, [S_2], #2
+        movpl   DAT0, #0
+        movcc   DAT1, #0
+        movpl   DAT4, #0
+        movcc   DAT5, #0
+        submi   N, N, #1
+        subcs   N, N, #2
+        cmp     DAT0, DAT4
+        cmpeq   DAT1, DAT5
+        bne     200f
+        movs    DAT0, OFF, lsl #29
+        ldrmi   DAT0, [S_1], #4
+        ldrcs   DAT1, [S_1], #4
+        ldrcs   DAT2, [S_1], #4
+        ldrmi   DAT4, [S_2], #4
+        ldmcsia S_2!, {DAT5, DAT6}
+        movpl   DAT0, #0
+        movcc   DAT1, #0
+        movcc   DAT2, #0
+        movpl   DAT4, #0
+        movcc   DAT5, #0
+        movcc   DAT6, #0
+        submi   N, N, #4
+        subcs   N, N, #8
+        cmp     DAT0, DAT4
+        cmpeq   DAT1, DAT5
+        cmpeq   DAT2, DAT6
+        bne     200f
+        tst     OFF, #16
+        beq     105f
+        memcmp_process_head 1
+        sub     N, N, #16
+        memcmp_process_tail
+105:
+.endm
+
+.macro memcmp_trailing_15bytes unaligned
+        movs    N, N, lsl #29
+ .if unaligned
+        ldrcs   DAT0, [S_1], #4
+        ldrcs   DAT1, [S_1], #4
+ .else
+        ldmcsia S_1!, {DAT0, DAT1}
+ .endif
+        ldrmi   DAT2, [S_1], #4
+        ldmcsia S_2!, {DAT4, DAT5}
+        ldrmi   DAT6, [S_2], #4
+        movcc   DAT0, #0
+        movcc   DAT1, #0
+        movpl   DAT2, #0
+        movcc   DAT4, #0
+        movcc   DAT5, #0
+        movpl   DAT6, #0
+        cmp     DAT0, DAT4
+        cmpeq   DAT1, DAT5
+        cmpeq   DAT2, DAT6
+        bne     200f
+        movs    N, N, lsl #2
+        ldrcsh  DAT0, [S_1], #2
+        ldrmib  DAT1, [S_1]
+        ldrcsh  DAT4, [S_2], #2
+        ldrmib  DAT5, [S_2]
+        movcc   DAT0, #0
+        movpl   DAT1, #0
+        movcc   DAT4, #0
+        movpl   DAT5, #0
+        cmp     DAT0, DAT4
+        cmpeq   DAT1, DAT5
+        bne     200f
+.endm
+
+.macro memcmp_long_inner_loop unaligned
+110:
+        memcmp_process_head unaligned
+        pld     [S_2, #prefetch_distance*32 + 16]
+        memcmp_process_tail
+        memcmp_process_head unaligned
+        pld     [S_1, OFF]
+        memcmp_process_tail
+        subs    N, N, #32
+        bhs     110b
+        /* Just before the final (prefetch_distance+1) 32-byte blocks,
+         * deal with final preloads */
+        preload_trailing 0, S_1, N, DAT0
+        preload_trailing 0, S_2, N, DAT0
+        add     N, N, #(prefetch_distance+2)*32 - 16
+120:
+        memcmp_process_head unaligned
+        memcmp_process_tail
+        subs    N, N, #16
+        bhs     120b
+        /* Trailing words and bytes */
+        tst     N, #15
+        beq     199f
+        memcmp_trailing_15bytes unaligned
+199:    /* Reached end without detecting a difference */
+        mov     a1, #0
+        setend  le
+        pop     {DAT1-DAT6, pc}
+.endm
+
+.macro memcmp_short_inner_loop unaligned
+        subs    N, N, #16    /* simplifies inner loop termination */
+        blo     122f
+120:
+        memcmp_process_head unaligned
+        memcmp_process_tail
+        subs    N, N, #16
+        bhs     120b
+122:    /* Trailing words and bytes */
+        tst     N, #15
+        beq     199f
+        memcmp_trailing_15bytes unaligned
+199:    /* Reached end without detecting a difference */
+        mov     a1, #0
+        setend  le
+        pop     {DAT1-DAT6, pc}
+.endm
+
+/*
+ * int memcmp(const void *s1, const void *s2, size_t n);
+ * On entry:
+ * a1 = pointer to buffer 1
+ * a2 = pointer to buffer 2
+ * a3 = number of bytes to compare (as unsigned chars)
+ * On exit:
+ * a1 = >0/=0/<0 if s1 >/=/< s2
+ */
+
+.set prefetch_distance, 2
+
+ENTRY(memcmp)
+        S_1     .req    a1
+        S_2     .req    a2
+        N       .req    a3
+        DAT0    .req    a4
+        DAT1    .req    v1
+
DAT2 .req v2 + DAT3 .req v3 + DAT4 .req v4 + DAT5 .req v5 + DAT6 .req v6 + DAT7 .req ip + OFF .req lr + + push {DAT1-DAT6, lr} + setend be /* lowest-addressed bytes are most significant */ + + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */ + cmp N, #(prefetch_distance+3)*32 - 1 + blo 170f + + /* Long case */ + /* Adjust N so that the decrement instruction can also test for + * inner loop termination. We want it to stop when there are + * (prefetch_distance+1) complete blocks to go. */ + sub N, N, #(prefetch_distance+2)*32 + preload_leading_step1 0, DAT0, S_1 + preload_leading_step1 0, DAT1, S_2 + tst S_2, #31 + beq 154f + rsb OFF, S_2, #0 /* no need to AND with 15 here */ + preload_leading_step2 0, DAT0, S_1, OFF, DAT2 + preload_leading_step2 0, DAT1, S_2, OFF, DAT2 + memcmp_leading_31bytes +154: /* Second source now cacheline (32-byte) aligned; we have at + * least one prefetch to go. */ + /* Prefetch offset is best selected such that it lies in the + * first 8 of each 32 bytes - but it's just as easy to aim for + * the first one */ + and OFF, S_1, #31 + rsb OFF, OFF, #32*prefetch_distance + tst S_1, #3 + bne 140f + memcmp_long_inner_loop 0 +140: memcmp_long_inner_loop 1 + +170: /* Short case */ + teq N, #0 + beq 199f + preload_all 0, 0, 0, S_1, N, DAT0, DAT1 + preload_all 0, 0, 0, S_2, N, DAT0, DAT1 + tst S_2, #3 + beq 174f +172: subs N, N, #1 + blo 199f + ldrb DAT0, [S_1], #1 + ldrb DAT4, [S_2], #1 + cmp DAT0, DAT4 + bne 200f + tst S_2, #3 + bne 172b +174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */ + tst S_1, #3 + bne 140f + memcmp_short_inner_loop 0 +140: memcmp_short_inner_loop 1 + +200: /* Difference found: determine sign. */ + movhi a1, #1 + movlo a1, #-1 + setend le + pop {DAT1-DAT6, pc} + + .unreq S_1 + .unreq S_2 + .unreq N + .unreq DAT0 + .unreq DAT1 + .unreq DAT2 + .unreq DAT3 + .unreq DAT4 + .unreq DAT5 + .unreq DAT6 + .unreq DAT7 + .unreq OFF +ENDPROC(memcmp) --- /dev/null +++ b/arch/arm/lib/memcpy_rpi.S @@ -0,0 +1,59 @@ +/* +Copyright (c) 2013, Raspberry Pi Foundation +Copyright (c) 2013, RISC OS Open Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <linux/linkage.h>
+#include "arm-mem.h"
+#include "memcpymove.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+        .text
+        .arch armv6
+        .object_arch armv4
+        .arm
+        .altmacro
+        .p2align 2
+
+/*
+ * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
+ * On entry:
+ * a1 = pointer to destination
+ * a2 = pointer to source
+ * a3 = number of bytes to copy
+ * On exit:
+ * a1 preserved
+ */
+
+.set prefetch_distance, 3
+
+ENTRY(memcpy)
+        memcpy 0
+ENDPROC(memcpy)
--- /dev/null
+++ b/arch/arm/lib/memcpymove.h
@@ -0,0 +1,506 @@
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8
+ .if words == 1
+  .if backwards
+        mov     r1, r0, lsl #32-align*8
+        ldr     r0, [S, #-4]!
+        orr     r1, r1, r0, lsr #align*8
+        str     r1, [D, #-4]!
+  .else
+        mov     r0, r1, lsr #align*8
+        ldr     r1, [S, #4]!
+        orr     r0, r0, r1, lsl #32-align*8
+        str     r0, [D], #4
+  .endif
+ .elseif words == 2
+  .if backwards
+        ldr     r1, [S, #-4]!
+        mov     r2, r0, lsl #32-align*8
+        ldr     r0, [S, #-4]!
+        orr     r2, r2, r1, lsr #align*8
+        mov     r1, r1, lsl #32-align*8
+        orr     r1, r1, r0, lsr #align*8
+        stmdb   D!, {r1, r2}
+  .else
+        ldr     r1, [S, #4]!
+        mov     r0, r2, lsr #align*8
+        ldr     r2, [S, #4]!
+ orr r0, r0, r1, lsl #32-align*8 + mov r1, r1, lsr #align*8 + orr r1, r1, r2, lsl #32-align*8 + stmia D!, {r0, r1} + .endif + .elseif words == 4 + .if backwards + ldmdb S!, {r2, r3} + mov r4, r0, lsl #32-align*8 + ldmdb S!, {r0, r1} + orr r4, r4, r3, lsr #align*8 + mov r3, r3, lsl #32-align*8 + orr r3, r3, r2, lsr #align*8 + mov r2, r2, lsl #32-align*8 + orr r2, r2, r1, lsr #align*8 + mov r1, r1, lsl #32-align*8 + orr r1, r1, r0, lsr #align*8 + stmdb D!, {r1, r2, r3, r4} + .else + ldmib S!, {r1, r2} + mov r0, r4, lsr #align*8 + ldmib S!, {r3, r4} + orr r0, r0, r1, lsl #32-align*8 + mov r1, r1, lsr #align*8 + orr r1, r1, r2, lsl #32-align*8 + mov r2, r2, lsr #align*8 + orr r2, r2, r3, lsl #32-align*8 + mov r3, r3, lsr #align*8 + orr r3, r3, r4, lsl #32-align*8 + stmia D!, {r0, r1, r2, r3} + .endif + .elseif words == 8 + .if backwards + ldmdb S!, {r4, r5, r6, r7} + mov r8, r0, lsl #32-align*8 + ldmdb S!, {r0, r1, r2, r3} + .if use_pld + pld [S, OFF] + .endif + orr r8, r8, r7, lsr #align*8 + mov r7, r7, lsl #32-align*8 + orr r7, r7, r6, lsr #align*8 + mov r6, r6, lsl #32-align*8 + orr r6, r6, r5, lsr #align*8 + mov r5, r5, lsl #32-align*8 + orr r5, r5, r4, lsr #align*8 + mov r4, r4, lsl #32-align*8 + orr r4, r4, r3, lsr #align*8 + mov r3, r3, lsl #32-align*8 + orr r3, r3, r2, lsr #align*8 + mov r2, r2, lsl #32-align*8 + orr r2, r2, r1, lsr #align*8 + mov r1, r1, lsl #32-align*8 + orr r1, r1, r0, lsr #align*8 + stmdb D!, {r5, r6, r7, r8} + stmdb D!, {r1, r2, r3, r4} + .else + ldmib S!, {r1, r2, r3, r4} + mov r0, r8, lsr #align*8 + ldmib S!, {r5, r6, r7, r8} + .if use_pld + pld [S, OFF] + .endif + orr r0, r0, r1, lsl #32-align*8 + mov r1, r1, lsr #align*8 + orr r1, r1, r2, lsl #32-align*8 + mov r2, r2, lsr #align*8 + orr r2, r2, r3, lsl #32-align*8 + mov r3, r3, lsr #align*8 + orr r3, r3, r4, lsl #32-align*8 + mov r4, r4, lsr #align*8 + orr r4, r4, r5, lsl #32-align*8 + mov r5, r5, lsr #align*8 + orr r5, r5, r6, lsl #32-align*8 + mov r6, r6, lsr #align*8 + orr r6, r6, r7, lsl #32-align*8 + mov r7, r7, lsr #align*8 + orr r7, r7, r8, lsl #32-align*8 + stmia D!, {r0, r1, r2, r3} + stmia D!, {r4, r5, r6, r7} + .endif + .endif +.endm + +.macro memcpy_leading_15bytes backwards, align + movs DAT1, DAT2, lsl #31 + sub N, N, DAT2 + .if backwards + ldrmib DAT0, [S, #-1]! + ldrcsh DAT1, [S, #-2]! + strmib DAT0, [D, #-1]! + strcsh DAT1, [D, #-2]! + .else + ldrmib DAT0, [S], #1 + ldrcsh DAT1, [S], #2 + strmib DAT0, [D], #1 + strcsh DAT1, [D], #2 + .endif + movs DAT1, DAT2, lsl #29 + .if backwards + ldrmi DAT0, [S, #-4]! + .if align == 0 + ldmcsdb S!, {DAT1, DAT2} + .else + ldrcs DAT2, [S, #-4]! + ldrcs DAT1, [S, #-4]! + .endif + strmi DAT0, [D, #-4]! + stmcsdb D!, {DAT1, DAT2} + .else + ldrmi DAT0, [S], #4 + .if align == 0 + ldmcsia S!, {DAT1, DAT2} + .else + ldrcs DAT1, [S], #4 + ldrcs DAT2, [S], #4 + .endif + strmi DAT0, [D], #4 + stmcsia D!, {DAT1, DAT2} + .endif +.endm + +.macro memcpy_trailing_15bytes backwards, align + movs N, N, lsl #29 + .if backwards + .if align == 0 + ldmcsdb S!, {DAT0, DAT1} + .else + ldrcs DAT1, [S, #-4]! + ldrcs DAT0, [S, #-4]! + .endif + ldrmi DAT2, [S, #-4]! + stmcsdb D!, {DAT0, DAT1} + strmi DAT2, [D, #-4]! + .else + .if align == 0 + ldmcsia S!, {DAT0, DAT1} + .else + ldrcs DAT0, [S], #4 + ldrcs DAT1, [S], #4 + .endif + ldrmi DAT2, [S], #4 + stmcsia D!, {DAT0, DAT1} + strmi DAT2, [D], #4 + .endif + movs N, N, lsl #2 + .if backwards + ldrcsh DAT0, [S, #-2]! + ldrmib DAT1, [S, #-1] + strcsh DAT0, [D, #-2]! 
+ strmib DAT1, [D, #-1] + .else + ldrcsh DAT0, [S], #2 + ldrmib DAT1, [S] + strcsh DAT0, [D], #2 + strmib DAT1, [D] + .endif +.endm + +.macro memcpy_long_inner_loop backwards, align + .if align != 0 + .if backwards + ldr DAT0, [S, #-align]! + .else + ldr LAST, [S, #-align]! + .endif + .endif +110: + .if align == 0 + .if backwards + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} + pld [S, OFF] + stmdb D!, {DAT4, DAT5, DAT6, LAST} + stmdb D!, {DAT0, DAT1, DAT2, DAT3} + .else + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} + pld [S, OFF] + stmia D!, {DAT0, DAT1, DAT2, DAT3} + stmia D!, {DAT4, DAT5, DAT6, LAST} + .endif + .else + unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST + .endif + subs N, N, #32 + bhs 110b + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ + preload_trailing backwards, S, N, OFF + add N, N, #(prefetch_distance+2)*32 - 32 +120: + .if align == 0 + .if backwards + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} + stmdb D!, {DAT4, DAT5, DAT6, LAST} + stmdb D!, {DAT0, DAT1, DAT2, DAT3} + .else + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} + stmia D!, {DAT0, DAT1, DAT2, DAT3} + stmia D!, {DAT4, DAT5, DAT6, LAST} + .endif + .else + unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST + .endif + subs N, N, #32 + bhs 120b + tst N, #16 + .if align == 0 + .if backwards + ldmnedb S!, {DAT0, DAT1, DAT2, LAST} + stmnedb D!, {DAT0, DAT1, DAT2, LAST} + .else + ldmneia S!, {DAT0, DAT1, DAT2, LAST} + stmneia D!, {DAT0, DAT1, DAT2, LAST} + .endif + .else + beq 130f + unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST +130: + .endif + /* Trailing words and bytes */ + tst N, #15 + beq 199f + .if align != 0 + add S, S, #align + .endif + memcpy_trailing_15bytes backwards, align +199: + pop {DAT3, DAT4, DAT5, DAT6, DAT7} + pop {D, DAT1, DAT2, pc} +.endm + +.macro memcpy_medium_inner_loop backwards, align +120: + .if backwards + .if align == 0 + ldmdb S!, {DAT0, DAT1, DAT2, LAST} + .else + ldr LAST, [S, #-4]! + ldr DAT2, [S, #-4]! + ldr DAT1, [S, #-4]! + ldr DAT0, [S, #-4]! + .endif + stmdb D!, {DAT0, DAT1, DAT2, LAST} + .else + .if align == 0 + ldmia S!, {DAT0, DAT1, DAT2, LAST} + .else + ldr DAT0, [S], #4 + ldr DAT1, [S], #4 + ldr DAT2, [S], #4 + ldr LAST, [S], #4 + .endif + stmia D!, {DAT0, DAT1, DAT2, LAST} + .endif + subs N, N, #16 + bhs 120b + /* Trailing words and bytes */ + tst N, #15 + beq 199f + memcpy_trailing_15bytes backwards, align +199: + pop {D, DAT1, DAT2, pc} +.endm + +.macro memcpy_short_inner_loop backwards, align + tst N, #16 + .if backwards + .if align == 0 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST} + .else + ldrne LAST, [S, #-4]! + ldrne DAT2, [S, #-4]! + ldrne DAT1, [S, #-4]! + ldrne DAT0, [S, #-4]! 
+ .endif + stmnedb D!, {DAT0, DAT1, DAT2, LAST} + .else + .if align == 0 + ldmneia S!, {DAT0, DAT1, DAT2, LAST} + .else + ldrne DAT0, [S], #4 + ldrne DAT1, [S], #4 + ldrne DAT2, [S], #4 + ldrne LAST, [S], #4 + .endif + stmneia D!, {DAT0, DAT1, DAT2, LAST} + .endif + memcpy_trailing_15bytes backwards, align +199: + pop {D, DAT1, DAT2, pc} +.endm + +.macro memcpy backwards + D .req a1 + S .req a2 + N .req a3 + DAT0 .req a4 + DAT1 .req v1 + DAT2 .req v2 + DAT3 .req v3 + DAT4 .req v4 + DAT5 .req v5 + DAT6 .req v6 + DAT7 .req sl + LAST .req ip + OFF .req lr + + .cfi_startproc + + push {D, DAT1, DAT2, lr} + + .cfi_def_cfa_offset 16 + .cfi_rel_offset D, 0 + .cfi_undefined S + .cfi_undefined N + .cfi_undefined DAT0 + .cfi_rel_offset DAT1, 4 + .cfi_rel_offset DAT2, 8 + .cfi_undefined LAST + .cfi_rel_offset lr, 12 + + .if backwards + add D, D, N + add S, S, N + .endif + + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */ + cmp N, #31 + blo 170f + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */ + cmp N, #(prefetch_distance+3)*32 - 1 + blo 160f + + /* Long case */ + push {DAT3, DAT4, DAT5, DAT6, DAT7} + + .cfi_def_cfa_offset 36 + .cfi_rel_offset D, 20 + .cfi_rel_offset DAT1, 24 + .cfi_rel_offset DAT2, 28 + .cfi_rel_offset DAT3, 0 + .cfi_rel_offset DAT4, 4 + .cfi_rel_offset DAT5, 8 + .cfi_rel_offset DAT6, 12 + .cfi_rel_offset DAT7, 16 + .cfi_rel_offset lr, 32 + + /* Adjust N so that the decrement instruction can also test for + * inner loop termination. We want it to stop when there are + * (prefetch_distance+1) complete blocks to go. */ + sub N, N, #(prefetch_distance+2)*32 + preload_leading_step1 backwards, DAT0, S + .if backwards + /* Bug in GAS: it accepts, but mis-assembles the instruction + * ands DAT2, D, #60, 2 + * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow) + */ + .word 0xE210513C + beq 154f + .else + ands DAT2, D, #15 + beq 154f + rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */ + .endif + preload_leading_step2 backwards, DAT0, S, DAT2, OFF + memcpy_leading_15bytes backwards, 1 +154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */ + /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */ + .if backwards + rsb OFF, S, #3 + and OFF, OFF, #28 + sub OFF, OFF, #32*(prefetch_distance+1) + .else + and OFF, S, #28 + rsb OFF, OFF, #32*prefetch_distance + .endif + movs DAT0, S, lsl #31 + bhi 157f + bcs 156f + bmi 155f + memcpy_long_inner_loop backwards, 0 +155: memcpy_long_inner_loop backwards, 1 +156: memcpy_long_inner_loop backwards, 2 +157: memcpy_long_inner_loop backwards, 3 + + .cfi_def_cfa_offset 16 + .cfi_rel_offset D, 0 + .cfi_rel_offset DAT1, 4 + .cfi_rel_offset DAT2, 8 + .cfi_same_value DAT3 + .cfi_same_value DAT4 + .cfi_same_value DAT5 + .cfi_same_value DAT6 + .cfi_same_value DAT7 + .cfi_rel_offset lr, 12 + +160: /* Medium case */ + preload_all backwards, 0, 0, S, N, DAT2, OFF + sub N, N, #16 /* simplifies inner loop termination */ + .if backwards + ands DAT2, D, #15 + beq 164f + .else + ands DAT2, D, #15 + beq 164f + rsb DAT2, DAT2, #16 + .endif + memcpy_leading_15bytes backwards, align +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ + tst S, #3 + bne 140f + memcpy_medium_inner_loop backwards, 0 +140: memcpy_medium_inner_loop backwards, 1 + +170: /* 
Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
+        teq     N, #0
+        beq     199f
+        preload_all backwards, 1, 0, S, N, DAT2, LAST
+        tst     D, #3
+        beq     174f
+172:    subs    N, N, #1
+        blo     199f
+ .if backwards
+        ldrb    DAT0, [S, #-1]!
+        strb    DAT0, [D, #-1]!
+ .else
+        ldrb    DAT0, [S], #1
+        strb    DAT0, [D], #1
+ .endif
+        tst     D, #3
+        bne     172b
+174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
+        tst     S, #3
+        bne     140f
+        memcpy_short_inner_loop backwards, 0
+140:    memcpy_short_inner_loop backwards, 1
+
+ .cfi_endproc
+
+ .unreq D
+ .unreq S
+ .unreq N
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq DAT4
+ .unreq DAT5
+ .unreq DAT6
+ .unreq DAT7
+ .unreq LAST
+ .unreq OFF
+.endm
--- /dev/null
+++ b/arch/arm/lib/memmove_rpi.S
@@ -0,0 +1,61 @@
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <linux/linkage.h>
+#include "arm-mem.h"
+#include "memcpymove.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+        .text
+        .arch armv6
+        .object_arch armv4
+        .arm
+        .altmacro
+        .p2align 2
+
+/*
+ * void *memmove(void *s1, const void *s2, size_t n);
+ * On entry:
+ * a1 = pointer to destination
+ * a2 = pointer to source
+ * a3 = number of bytes to copy
+ * On exit:
+ * a1 preserved
+ */
+
+.set prefetch_distance, 3
+
+ENTRY(memmove)
+        cmp     a2, a1
+        bpl     memcpy /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
+        memcpy  1
+ENDPROC(memmove)
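The single comparison at the top of memmove does all the overlap handling: when the source does not lie below the destination, a forward copy can never overwrite bytes it has yet to read, so memmove falls straight through to memcpy; otherwise the same macro body runs backwards. A small C sketch of that decision (illustration only; move_bytes is a hypothetical name, and the raw pointer comparison assumes a flat address space, just as the assembler does):

    #include <assert.h>
    #include <stddef.h>
    #include <string.h>

    static void *move_bytes(void *dst, const void *src, size_t n)
    {
        unsigned char *d = dst;
        const unsigned char *s = src;

        if (s >= d) {                 /* the "bpl memcpy" case */
            while (n--)
                *d++ = *s++;          /* copy forwards */
        } else {
            while (n--)
                d[n] = s[n];          /* copy backwards */
        }
        return dst;
    }

    int main(void)
    {
        char buf[8] = "abcdef";
        move_bytes(buf + 1, buf, 5);  /* overlapping, dst above src */
        assert(memcmp(buf, "aabcde", 6) == 0);
        return 0;
    }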
--- /dev/null
+++ b/arch/arm/lib/memset_rpi.S
@@ -0,0 +1,121 @@
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <linux/linkage.h>
+#include "arm-mem.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+        .text
+        .arch armv6
+        .object_arch armv4
+        .arm
+        .altmacro
+        .p2align 2
+
+/*
+ * void *memset(void *s, int c, size_t n);
+ * On entry:
+ * a1 = pointer to buffer to fill
+ * a2 = byte pattern to fill with (caller-narrowed)
+ * a3 = number of bytes to fill
+ * On exit:
+ * a1 preserved
+ */
+ENTRY(memset)
+        S       .req    a1
+        DAT0    .req    a2
+        N       .req    a3
+        DAT1    .req    a4
+        DAT2    .req    ip
+        DAT3    .req    lr
+
+        orr     DAT0, DAT0, lsl #8
+        push    {S, lr}
+        orr     DAT0, DAT0, lsl #16
+        mov     DAT1, DAT0
+
+        /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
+        cmp     N, #31
+        blo     170f
+
+161:    sub     N, N, #16    /* simplifies inner loop termination */
+        /* Leading words and bytes */
+        tst     S, #15
+        beq     164f
+        rsb     DAT3, S, #0  /* bits 0-3 = number of leading bytes until aligned */
+        movs    DAT2, DAT3, lsl #31
+        submi   N, N, #1
+        strmib  DAT0, [S], #1
+        subcs   N, N, #2
+        strcsh  DAT0, [S], #2
+        movs    DAT2, DAT3, lsl #29
+        submi   N, N, #4
+        strmi   DAT0, [S], #4
+        subcs   N, N, #8
+        stmcsia S!, {DAT0, DAT1}
+164:    /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
+        mov     DAT2, DAT0
+        mov     DAT3, DAT0
+        /* Now the inner loop of 16-byte stores */
+165:    stmia   S!, {DAT0, DAT1, DAT2, DAT3}
+        subs    N, N, #16
+        bhs     165b
+166:    /* Trailing words and bytes */
+        movs    N, N, lsl #29
+        stmcsia S!, {DAT0, DAT1}
+        strmi   DAT0, [S], #4
+        movs    N, N, lsl #2
+        strcsh  DAT0, [S], #2
+        strmib  DAT0, [S]
+199:    pop     {S, pc}
+
+170:    /* Short case */
+        mov     DAT2, DAT0
+        mov     DAT3, DAT0
+        tst     S, #3
+        beq     174f
+172:    subs    N, N, #1
+        blo     199b
+        strb    DAT0, [S], #1
+        tst     S, #3
+        bne     172b
+174:    tst     N, #16
+        stmneia S!, {DAT0, DAT1, DAT2, DAT3}
+        b       166b
+
+ .unreq S
+ .unreq DAT0
+ .unreq N
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ENDPROC(memset)
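The two ORRs at the top of memset ("orr DAT0, DAT0, lsl #8" then "orr DAT0, DAT0, lsl #16") replicate the fill byte across a 32-bit word, which is what lets the inner loop issue 16 bytes per STMIA. The equivalent widening step in C (illustration only; note the assembler relies on the caller having already narrowed the int argument to a byte):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t replicate_byte(uint8_t c)
    {
        uint32_t w = c;
        w |= w << 8;    /* 0x000000cc -> 0x0000cccc */
        w |= w << 16;   /* 0x0000cccc -> 0xcccccccc */
        return w;
    }

    int main(void)
    {
        assert(replicate_byte(0xAB) == 0xABABABABu);
        assert(replicate_byte(0x00) == 0u);
        return 0;
    }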
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -22,6 +22,14 @@
 #include <asm/current.h>
 #include <asm/page.h>
 
+#ifndef COPY_FROM_USER_THRESHOLD
+#define COPY_FROM_USER_THRESHOLD 64
+#endif
+
+#ifndef COPY_TO_USER_THRESHOLD
+#define COPY_TO_USER_THRESHOLD 64
+#endif
+
 static int
 pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
 {
@@ -85,7 +93,44 @@ pin_page_for_write(const void __user *_a
 	return 1;
 }
 
-static unsigned long noinline
+static int
+pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
+{
+	unsigned long addr = (unsigned long)_addr;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	pud_t *pud;
+	spinlock_t *ptl;
+
+	pgd = pgd_offset(current->mm, addr);
+	if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
+	{
+		return 0;
+	}
+	pud = pud_offset(pgd, addr);
+	if (unlikely(pud_none(*pud) || pud_bad(*pud)))
+	{
+		return 0;
+	}
+
+	pmd = pmd_offset(pud, addr);
+	if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
+		return 0;
+
+	pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
+	if (unlikely(!pte_present(*pte) || !pte_young(*pte))) {
+		pte_unmap_unlock(pte, ptl);
+		return 0;
+	}
+
+	*ptep = pte;
+	*ptlp = ptl;
+
+	return 1;
+}
+
+unsigned long noinline
 __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
 {
 	int atomic;
@@ -135,6 +180,54 @@ out:
 	return n;
 }
 
+unsigned long noinline
+__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
+{
+	int atomic;
+
+	if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+		memcpy(to, (const void *)from, n);
+		return 0;
+	}
+
+	/* the mmap semaphore is taken only if not in an atomic context */
+	atomic = in_atomic();
+
+	if (!atomic)
+		down_read(&current->mm->mmap_sem);
+	while (n) {
+		pte_t *pte;
+		spinlock_t *ptl;
+		int tocopy;
+
+		while (!pin_page_for_read(from, &pte, &ptl)) {
+			char temp;
+			if (!atomic)
+				up_read(&current->mm->mmap_sem);
+			if (__get_user(temp, (char __user *)from))
+				goto out;
+			if (!atomic)
+				down_read(&current->mm->mmap_sem);
+		}
+
+		tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1;
+		if (tocopy > n)
+			tocopy = n;
+
+		memcpy(to, (const void *)from, tocopy);
+		to += tocopy;
+		from += tocopy;
+		n -= tocopy;
+
+		pte_unmap_unlock(pte, ptl);
+	}
+	if (!atomic)
+		up_read(&current->mm->mmap_sem);
+
+out:
+	return n;
+}
+
 unsigned long
 __copy_to_user(void __user *to, const void *from, unsigned long n)
 {
@@ -145,10 +238,25 @@ __copy_to_user(void __user *to, const vo
 	 * With frame pointer disabled, tail call optimization kicks in
 	 * as well making this test almost invisible.
 	 */
-	if (n < 64)
+	if (n < COPY_TO_USER_THRESHOLD)
 		return __copy_to_user_std(to, from, n);
 	return __copy_to_user_memcpy(to, from, n);
 }
+
+unsigned long
+__copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+	/*
+	 * This test is stubbed out of the main function above to keep
+	 * the overhead for small copies low by avoiding a large
+	 * register dump on the stack just to reload them right away.
+	 * With frame pointer disabled, tail call optimization kicks in
+	 * as well making this test almost invisible.
+	 */
+	if (n < COPY_FROM_USER_THRESHOLD)
+		return __copy_from_user_std(to, from, n);
+	return __copy_from_user_memcpy(to, from, n);
+}
 
 static unsigned long noinline
 __clear_user_memset(void __user *addr, unsigned long n)
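Two tuning details in the hunks above are worth calling out. First, the dispatch thresholds: the Makefile change sets COPY_FROM_USER_THRESHOLD=1600 and COPY_TO_USER_THRESHOLD=672 for BCM2708, so copies below those sizes keep using the original LDM/STM routines and only larger ones pay for the page-pinning memcpy path. Second, inside both pinning loops each memcpy is bounded so that it never crosses the page whose PTE is currently locked: tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1 is exactly the number of bytes from the cursor to the end of its page. A standalone sketch of that arithmetic (illustration only; PAGE_SIZE hard-coded to 4 KiB):

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    /* Bytes from addr up to and including the last byte of its page. */
    static unsigned long bytes_to_page_end(uintptr_t addr)
    {
        return (~addr & ~PAGE_MASK) + 1;
    }

    int main(void)
    {
        assert(bytes_to_page_end(0x1000) == 4096); /* page start: whole page */
        assert(bytes_to_page_end(0x1fff) == 1);    /* last byte of the page */
        assert(bytes_to_page_end(0x1ff0) == 16);
        return 0;
    }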