Subject: [PATCH] AVR32-optimized string operations Add hand-optimized AVR32-specific string operations. Some of them need a bit more testing, though. --- libc/string/avr32/Makefile | 40 +++++++++++ libc/string/avr32/bcopy.S | 15 ++++ libc/string/avr32/bzero.S | 12 +++ libc/string/avr32/memchr.S | 62 +++++++++++++++++ libc/string/avr32/memcmp.S | 50 +++++++++++++ libc/string/avr32/memcpy.S | 110 ++++++++++++++++++++++++++++++ libc/string/avr32/memmove.S | 114 +++++++++++++++++++++++++++++++ libc/string/avr32/memset.S | 60 ++++++++++++++++ libc/string/avr32/strcat.S | 95 ++++++++++++++++++++++++++ libc/string/avr32/strcmp.S | 80 ++++++++++++++++++++++ libc/string/avr32/strcpy.S | 63 +++++++++++++++++ libc/string/avr32/stringtest.c | 144 ++++++++++++++++++++++++++++++++++++++++ libc/string/avr32/strlen.S | 52 ++++++++++++++ libc/string/avr32/strncpy.S | 77 +++++++++++++++++++++ libc/string/avr32/test_memcpy.c | 66 ++++++++++++++++++ 15 files changed, 1040 insertions(+) Index: uClibc-0.9.28-avr32/libc/string/avr32/bcopy.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/bcopy.S 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,15 @@ +/* + * Copyright (C) 2004 Atmel Norway + */ + + .text + .global bcopy + .type bcopy, @function + .align 1 +bcopy: + /* Swap the first two arguments */ + eor r11, r12 + eor r12, r11 + eor r11, r12 + rjmp __memmove + .size bcopy, . 
- bcopy Index: uClibc-0.9.28-avr32/libc/string/avr32/bzero.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/bzero.S 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,12 @@ +/* + * Copyright (C) 2004 Atmel Norway + */ + + .text + .global bzero + .type bzero, @function + .align 1 +bzero: + mov r10, r11 + mov r11, 0 + rjmp __memset Index: uClibc-0.9.28-avr32/libc/string/avr32/Makefile =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/Makefile 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,40 @@ +# Makefile for uClibc +# +# Copyright (C) 2000-2003 Erik Andersen +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU Library General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more +# details. 
+# +# You should have received a copy of the GNU Library General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +TOPDIR=../../../ +include $(TOPDIR)Rules.mak + +SSRC := bcopy.S bzero.S memcmp.S memcpy.S memmove.S +SSRC += memset.S strcmp.S strlen.S +# memchr.S, strcat.S, strcpy.S, strncpy.S is broken +SOBJS := $(patsubst %.S,%.o, $(SSRC)) +OBJS := $(SOBJS) + +OBJ_LIST:= ../../obj.string.$(TARGET_ARCH) + +all: $(OBJ_LIST) + +$(OBJ_LIST): $(OBJS) + echo $(addprefix string/$(TARGET_ARCH)/, $(OBJS)) > $@ + +$(SOBJS): %.o: %.S + $(CC) $(ASFLAGS) -c $< -o $@ + $(STRIPTOOL) -x -R .note -R .comment $@ + +clean: + $(RM) *.[oa] *~ core Index: uClibc-0.9.28-avr32/libc/string/avr32/memchr.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/memchr.S 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2004 Atmel Norway + */ + +#define str r12 +#define chr r11 +#define len r10 + + .text + .global memchr + .type memchr, @function +memchr: + or chr, chr, chr << 8 + or chr, chr, chr << 16 + + mov r9, str + andl r9, 3, COH + brne .Lunaligned_str + +1: sub len, 4 + brlt 2f + ld.w r8, str++ + psub.b r9, r8, r11 + tnbz r9 + brne 1b + + sub str, 4 + bfextu r9, r8, 24, 8 + cp.b r9, r11 + reteq str + sub str, -1 + bfextu r9, r8, 16, 8 + cp.b r9, r11 + reteq str + sub str, -1 + bfextu r9, r8, 8, 8 + cp.b r9, r11 + reteq str + sub str, -1 + retal str + +2: sub len, -4 + reteq 0 + +3: ld.ub r8, str++ + cp.w r8, 0 + reteq str + sub len, 1 + brne 3b + + retal 0 + +.Lunaligned_str: +1: sub len, 1 + retlt 0 + ld.ub r8, str++ + cp.b r8, r11 + reteq str + sub r9, 1 + brge 1b + + rjmp .Laligned_search Index: uClibc-0.9.28-avr32/libc/string/avr32/memcmp.S =================================================================== --- /dev/null 1970-01-01 
00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/memcmp.S 2006-10-20 10:42:09.000000000 +0200 @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2004 Atmel Norway. + */ + +#define s1 r12 +#define s2 r11 +#define len r10 + + .text + .global memcmp + .type memcmp, @function + .align 1 +memcmp: + sub len, 4 + brlt .Lless_than_4 + +1: ld.w r8, s1++ + ld.w r9, s2++ + cp.w r8, r9 + brne .Lfound_word + sub len, 4 + brge 1b + +.Lless_than_4: + sub len, -4 + reteq 0 + +1: ld.ub r8, s1++ + ld.ub r9, s2++ + sub r8, r9 + retne r8 + sub len, 1 + brgt 1b + + retal 0 + +.Lfound_word: + /* r8 != r9: return the signed difference of the first differing byte (MSB first) */ +2: bfextu r12, r8, 24, 8 + bfextu r11, r9, 24, 8 + lsl r8, 8 + lsl r9, 8 + sub r12, r11 + breq 2b + retal r12 + + .size memcmp, . - memcmp + + .weak bcmp + bcmp = memcmp Index: uClibc-0.9.28-avr32/libc/string/avr32/memcpy.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/memcpy.S 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2004 Atmel Norway + */ + +/* Don't use r12 as dst since we must return it unmodified */ +#define dst r9 +#define src r11 +#define len r10 + + .text + .global memcpy + .type memcpy, @function + + .global __memcpy + .hidden __memcpy + .type __memcpy, @function +memcpy: +__memcpy: + pref src[0] + mov dst, r12 + + /* If we have less than 32 bytes, don't do anything fancy */ + cp.w len, 32 + brge .Lmore_than_31 + + sub len, 1 + retlt r12 +1: ld.ub r8, src++ + st.b dst++, r8 + sub len, 1 + brge 1b + retal r12 + +.Lmore_than_31: + pushm r0-r7, lr + + /* Check alignment */ + mov r8, src + andl r8, 31, COH + brne .Lunaligned_src + mov r8, dst + andl r8, 3, COH + brne .Lunaligned_dst + +.Laligned_copy: + sub len, 32 + brlt .Lless_than_32 + +1: /* Copy 32 bytes at a time */ + ldm src, r0-r7 + sub src, -32 + stm dst, r0-r7 + sub dst, -32 + sub len, 32 + brge 1b + +.Lless_than_32: + /* Copy 16 more bytes if possible */ + sub
len, -16 + brlt .Lless_than_16 + ldm src, r0-r3 + sub src, -16 + sub len, 16 + stm dst, r0-r3 + sub dst, -16 + +.Lless_than_16: + /* Do the remaining as byte copies */ + neg len + add pc, pc, len << 2 + .rept 15 + ld.ub r0, src++ + st.b dst++, r0 + .endr + + popm r0-r7, pc + +.Lunaligned_src: + /* Make src cacheline-aligned. r8 = (src & 31) */ + rsub r8, r8, 32 + sub len, r8 +1: ld.ub r0, src++ + st.b dst++, r0 + sub r8, 1 + brne 1b + + /* If dst is word-aligned, we're ready to go */ + pref src[0] + mov r8, 3 + tst dst, r8 + breq .Laligned_copy + +.Lunaligned_dst: + /* src is aligned, but dst is not. Expect bad performance */ + sub len, 4 + brlt 2f +1: ld.w r0, src++ + st.w dst++, r0 + sub len, 4 + brge 1b + +2: neg len + add pc, pc, len << 2 + .rept 3 + ld.ub r0, src++ + st.b dst++, r0 + .endr + + popm r0-r7, pc + .size memcpy, . - memcpy Index: uClibc-0.9.28-avr32/libc/string/avr32/memmove.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/memmove.S 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2004 Atmel Norway + */ + +#define dst r12 +#define src r11 +#define len r10 + + .text + .global memmove + .type memmove, @function + + .global __memmove + .hidden __memmove + .type __memmove, @function +memmove: +__memmove: + cp.w src, dst + brge __memcpy + + add dst, len + add src, len + pref src[-1] + + /* + * The rest is basically the same as in memcpy.S except that + * the direction is reversed. 
+ */ + cp.w len, 32 + brge .Lmore_than_31 + + sub len, 1 + retlt r12 +1: ld.ub r8, --src + st.b --dst, r8 + sub len, 1 + brge 1b + retal r12 + +.Lmore_than_31: + pushm r0-r7, lr + + /* Check alignment */ + mov r8, src + andl r8, 31, COH + brne .Lunaligned_src + mov r8, r12 + andl r8, 3, COH + brne .Lunaligned_dst + +.Laligned_copy: + sub len, 32 + brlt .Lless_than_32 + +1: /* Copy 32 bytes at a time */ + sub src, 32 + ldm src, r0-r7 + sub dst, 32 + sub len, 32 + stm dst, r0-r7 + brge 1b + +.Lless_than_32: + /* Copy 16 more bytes if possible */ + sub len, -16 + brlt .Lless_than_16 + sub src, 16 + ldm src, r0-r3 + sub dst, 16 + sub len, 16 + stm dst, r0-r3 + +.Lless_than_16: + /* Do the remaining as byte copies */ + sub len, -16 + breq 2f +1: ld.ub r0, --src + st.b --dst, r0 + sub len, 1 + brne 1b + +2: popm r0-r7, pc + +.Lunaligned_src: + /* Make src cacheline-aligned. r8 = (src & 31) */ + sub len, r8 +1: ld.ub r0, --src + st.b --dst, r0 + sub r8, 1 + brne 1b + + /* If dst is word-aligned, we're ready to go */ + pref src[-4] + mov r8, 3 + tst dst, r8 + breq .Laligned_copy + +.Lunaligned_dst: + /* src is aligned, but dst is not. Expect bad performance */ + sub len, 4 + brlt 2f +1: ld.w r0, --src + st.w --dst, r0 + sub len, 4 + brge 1b + +2: neg len + add pc, pc, len << 2 + .rept 3 + ld.ub r0, --src + st.b --dst, r0 + .endr + + popm r0-r7, pc Index: uClibc-0.9.28-avr32/libc/string/avr32/memset.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/memset.S 2006-10-20 10:42:15.000000000 +0200 @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2004 Atmel Norway. 
+ */ + +#define s r12 +#define c r11 +#define n r10 + + .text + .global memset + .type memset, @function + + .global __memset + .hidden __memset + .type __memset, @function + + .align 1 +memset: +__memset: + cp.w n, 32 + mov r9, s + brge .Llarge_memset + + sub n, 1 + retlt s +1: st.b s++, c + sub n, 1 + brge 1b + + retal r9 + +.Llarge_memset: + mov r8, r11 + mov r11, 3 + bfins r8, r8, 8, 8 + bfins r8, r8, 16, 16 + tst s, r11 + breq 2f + +1: st.b s++, r8 + sub n, 1 + tst s, r11 + brne 1b + +2: mov r11, r9 + mov r9, r8 + sub n, 8 + +3: st.d s++, r8 + sub n, 8 + brge 3b + + /* If we are done, n == -8 and we'll skip all st.b insns below */ + neg n + lsl n, 1 + add pc, n + .rept 7 + st.b s++, r8 + .endr + retal r11 Index: uClibc-0.9.28-avr32/libc/string/avr32/strcat.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/strcat.S 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2004 Atmel Norway + */ + +#define s1 r9 +#define s2 r11 + + .text + .global strcat + .type strcat, @function + .align 1 +strcat: + mov s1, r12 + + /* Make sure s1 is word-aligned */ + mov r10, s1 + andl r10, 3, COH + breq 2f + + add pc, pc, r10 << 3 + sub r0, r0, 0 /* 4-byte nop */ + ld.ub r8, s1++ + sub r8, r8, 0 + breq 2f + ld.ub r8, s1++ + sub r8, r8, 0 + breq 3f + ld.ub r8, s1++ + sub r8, r8, 0 + breq 4f + + /* Find the end of the first string */ +5: ld.w r8, s1++ + tnbz r8 + brne 5b + + sub s1, 4 + + bfextu r10, r8, 24, 8 + cp.w r10, 0 + breq 1f + sub s1, -1 + bfextu r10, r8, 16, 8 + cp.w r10, 0 + breq 2f + sub s1, -1 + bfextu r10, r8, 8, 8 + cp.w r10, 0 + breq 3f + sub s1, -1 + rjmp 4f + + /* Now, append s2 */ +1: ld.ub r8, s2++ + st.b s1++, r8 + cp.w r8, 0 + reteq r12 +2: ld.ub r8, s2++ + st.b s1++, r8 + cp.w r8, 0 + reteq r12 +3: ld.ub r8, s2++ + st.b s1++, r8 + cp.w r8, 0 + reteq r12 +4: ld.ub r8, s2++ + st.b s1++, r8 + cp.w r8, 0 + reteq r12 + + /* Copy one 
word at a time */ + ld.w r8, s2++ + tnbz r8 + breq 2f +1: st.w r8, s2++ + ld.w r8, s2++ + tnbz r8 + brne 1b + + /* Copy the remaining bytes */ + bfextu r10, r8, 24, 8 + st.b s1++, r10 + cp.w r10, 0 + reteq r12 + bfextu r10, r8, 16, 8 + st.b s1++, r10 + cp.w r10, 0 + reteq r12 + bfextu r10, r8, 8, 8 + st.b s1++, r10 + cp.w r10, 0 + reteq r12 + st.b s1++, r8 + retal r12 + .size strcat, . - strcat Index: uClibc-0.9.28-avr32/libc/string/avr32/strcmp.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/strcmp.S 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2004 Atmel Norway. + */ + +#define s1 r12 +#define s2 r11 +#define len r10 + + .text + .global strcmp + .type strcmp, @function + .align 1 +strcmp: + mov r8, 3 + tst s1, r8 + brne .Lunaligned_s1 + tst s2, r8 + brne .Lunaligned_s2 + +1: ld.w r8, s1++ + ld.w r9, s2++ + cp.w r8, r9 + brne 2f + tnbz r8 + brne 1b + retal 0 + +2: bfextu r12, r8, 24, 8 + bfextu r11, r9, 24, 8 + sub r12, r11 + retne r12 + cp.w r11, 0 + reteq 0 + bfextu r12, r8, 16, 8 + bfextu r11, r9, 16, 8 + sub r12, r11 + retne r12 + cp.w r11, 0 + reteq 0 + bfextu r12, r8, 8, 8 + bfextu r11, r9, 8, 8 + sub r12, r11 + retne r12 + cp.w r11, 0 + reteq 0 + bfextu r12, r8, 0, 8 + bfextu r11, r9, 0, 8 + sub r12, r11 + retal r12 + +.Lunaligned_s1: +3: tst s1, r8 + breq 4f + ld.ub r10, s1++ + ld.ub r9, s2++ + sub r10, r9 + retne r10 + cp.w r9, 0 + brne 3b + retal r10 + +4: tst s2, r8 + breq 1b + +.Lunaligned_s2: + /* + * s1 and s2 can't both be aligned, and unaligned word loads + * can trigger spurious exceptions if we cross a page boundary. + * Do it the slow way... 
+ */ +1: ld.ub r8, s1++ + ld.ub r9, s2++ + sub r8, r9 + retne r8 + cp.w r9, 0 + brne 1b + retal 0 + + .weak strcoll + strcoll = strcmp Index: uClibc-0.9.28-avr32/libc/string/avr32/strcpy.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/strcpy.S 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2004 Atmel Norway + * + * To reduce the size, this one might simply call strncpy with len = -1. + */ + +#define dst r9 +#define src r11 + + .text + .global strcpy + .type strcpy, @function +strcpy: + mov dst, r12 + + pref src[0] + + /* + * Check alignment. If src is aligned but dst isn't, we can't + * do much about it... + */ + mov r8, src + andl r8, 3, COH + brne .Lunaligned_src + +.Laligned_copy: +1: ld.w r8, src++ + tnbz r8 + breq 2f + st.w dst++, r8 + rjmp 1b + +2: /* + * Ok, r8 now contains the terminating '\0'. Copy the + * remaining bytes individually. + */ + bfextu r10, r8, 24, 8 + st.b dst++, r10 + cp.w r10, 0 + reteq r12 + bfextu r10, r8, 16, 8 + st.b dst++, r10 + cp.w r10, 0 + reteq r12 + bfextu r10, r8, 8, 8 + st.b dst++, r10 + cp.w r10, 0 + reteq r12 + st.b dst++, r8 + retal r12 + +.Lunaligned_src: + /* Copy bytes until we're aligned */ + rsub r8, r8, 4 + add pc, pc, r8 << 3 + nop + nop + ld.ub r10, src++ + st.b dst++, r10 + cp.w r10, 0 + reteq r12 + + rjmp .Laligned_copy Index: uClibc-0.9.28-avr32/libc/string/avr32/stringtest.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/stringtest.c 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,144 @@ + +#include <stdio.h> +#include <string.h> +#include <time.h> +#include <sys/mman.h> + +#define BUF_SIZE (8 * 1024) + +static char *buf1; +static char *buf1_ref; +static char *buf2; + +extern void *optimized_memcpy(void *dest, void *src, size_t len); +extern void *optimized_memmove(void *dest, void *src, size_t len);
+extern char *optimized_strcpy(char *dest, char *src); +extern char *optimized_strncpy(char *dest, char *src, size_t len); + +void dump_mismatch(char *buf, char *ref, size_t len) +{ + int i, j; + + for (i = 0; i < len; i += 16) { + if (memcmp(buf + i, ref + i, 16) == 0) + continue; + + printf("%4x buf:", i); + for (j = i; j < (i + 16); j++) + printf(" %02x", buf[j]); + printf("\n ref:"); + for (j = i; j < (i + 16); j++) + printf(" %02x", ref[j]); + printf("\n"); + } +} + +static void test_memcpy(int src_offset, int dst_offset, int len) +{ + clock_t start, old, new; + int i; + + memset(buf1, 0x55, BUF_SIZE); + memset(buf1_ref, 0x55, BUF_SIZE); + memset(buf2, 0xaa, BUF_SIZE); + + printf("Testing memcpy with offsets %d => %d and len %d...", + src_offset, dst_offset, len); + + start = clock(); + for (i = 0; i < 8192; i++) + optimized_memcpy(buf1 + dst_offset, buf2 + src_offset, len); + new = clock() - start; + start = clock(); + for ( i = 0; i < 8192; i++) + memcpy(buf1_ref + dst_offset, buf2 + src_offset, len); + old = clock() - start; + + if (memcmp(buf1, buf1_ref, BUF_SIZE) == 0) + printf("OK\n"); + else { + printf("FAILED\n"); + dump_mismatch(buf1, buf1_ref, BUF_SIZE); + } + printf("CPU time used: %d vs. %d\n", new, old); +} + +static void test_memmove(int src_offset, int dst_offset, int len) +{ + clock_t start, old, new; + + memset(buf1, 0x55, BUF_SIZE); + memset(buf1_ref, 0x55, BUF_SIZE); + memset(buf2, 0xaa, BUF_SIZE); + + printf("Testing memmove with offsets %d => %d and len %d...", + src_offset, dst_offset, len); + + start = clock(); + optimized_memmove(buf1 + dst_offset, buf2 + src_offset, len); + new = clock() - start; + start = clock(); + memmove(buf1_ref + dst_offset, buf2 + src_offset, len); + old = clock() - start; + + if (memcmp(buf1, buf1_ref, BUF_SIZE) == 0) + printf("OK\n"); + else { + printf("FAILED\n"); + dump_mismatch(buf1, buf1_ref, BUF_SIZE); + } + printf("CPU time used: %d vs. 
%d\n", new, old); +} + +int main(int argc, char *argv[]) +{ + buf2 = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (buf2 == MAP_FAILED) { + perror("Failed to allocate memory for buf2"); + return 1; + } + buf1 = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (buf1 == MAP_FAILED) { + perror("Failed to allocate memory for buf1"); + return 1; + } + buf1_ref = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (buf1_ref == MAP_FAILED) { + perror("Failed to allocate memory for buf1_ref"); + return 1; + } + printf("\n === MEMCPY ===\n\n"); + + test_memcpy(0, 0, BUF_SIZE - 32); + test_memcpy(0, 0, 1); + test_memcpy(0, 0, 31); + test_memcpy(0, 0, 32); + test_memcpy(0, 0, 127); + test_memcpy(0, 0, 128); + test_memcpy(4, 4, BUF_SIZE - 32 - 4); + test_memcpy(1, 1, BUF_SIZE - 32 - 1); + test_memcpy(1, 1, 126); + test_memcpy(0, 3, 128); + test_memcpy(1, 4, 128); + test_memcpy(0, 0, 0); + + printf("\n === MEMMOVE ===\n\n"); + + test_memmove(0, 0, BUF_SIZE - 32); + test_memmove(0, 0, 1); + test_memmove(0, 0, 31); + test_memmove(0, 0, 32); + test_memmove(0, 0, BUF_SIZE - 33); + test_memmove(0, 0, 128); + test_memmove(4, 4, BUF_SIZE - 32 - 4); + test_memmove(1, 1, BUF_SIZE - 32 - 1); + test_memmove(1, 1, BUF_SIZE - 130); + test_memmove(0, 3, BUF_SIZE - 128); + test_memmove(1, 4, BUF_SIZE - 128); + test_memmove(0, 0, 0); + + return 0; +} Index: uClibc-0.9.28-avr32/libc/string/avr32/strlen.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/strlen.S 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2004 Atmel Norway + */ + +#define str r12 + + .text + .global strlen + .type strlen, @function +strlen: + mov r11, r12 + + mov r9, str + andl r9, 3, COH + brne .Lunaligned_str + +1: ld.w r8, str++ + tnbz r8 + brne 1b + + sub r12, r11 + 
bfextu r9, r8, 24, 8 + cp.w r9, 0 + subeq r12, 4 + reteq r12 + bfextu r9, r8, 16, 8 + cp.w r9, 0 + subeq r12, 3 + reteq r12 + bfextu r9, r8, 8, 8 + cp.w r9, 0 + subeq r12, 2 + reteq r12 + sub r12, 1 + retal r12 + +.Lunaligned_str: + add pc, pc, r9 << 3 + sub r0, r0, 0 /* 4-byte nop */ + ld.ub r8, str++ + sub r8, r8, 0 + breq 1f + ld.ub r8, str++ + sub r8, r8, 0 + breq 1f + ld.ub r8, str++ + sub r8, r8, 0 + brne 1b + +1: sub r12, 1 + sub r12, r11 + retal r12 Index: uClibc-0.9.28-avr32/libc/string/avr32/strncpy.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/strncpy.S 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2004 Atmel Norway + */ + +#define dst r9 +#define src r11 + + .text + .global strncpy + .type strncpy, @function +strncpy: + mov dst, r12 + + pref src[0] + mov dst, r12 + + /* + * Check alignment. If src is aligned but dst isn't, we can't + * do much about it... + */ + mov r8, src + andl r8, 3, COH + brne .Lunaligned_src + +.Laligned_copy: + sub r10, 4 + brlt 3f +1: ld.w r8, src++ + tnbz r8 + breq 2f + st.w dst++, r8 + sub r10, 4 + brge 1b /* r10 >= 0 means at least 4 more bytes may be copied */ + +3: sub r10, -4 + reteq r12 + + /* This is safe as long as src is word-aligned and r10 > 0 */ + ld.w r8, src++ + +2: /* + * Ok, r8 now contains the terminating '\0'. Copy the + * remaining bytes individually.
+ */ + bfextu r11, r8, 24, 8 + st.b dst++, r11 + cp.w r11, 0 + reteq r12 + sub r10, 1 + reteq r12 + bfextu r11, r8, 16, 8 + st.b dst++, r11 + cp.w r11, 0 + reteq r12 + sub r10, 1 + reteq r12 + bfextu r11, r8, 8, 8 + st.b dst++, r11 + cp.w r11, 0 + reteq r12 + sub r10, 1 + reteq r12 + st.b dst++, r8 + retal r12 + +.Lunaligned_src: + /* Copy bytes until we're aligned */ + min r8, r8, r10 + sub r10, r8 + sub r8, 1 + retlt r12 +1: ld.ub r10, src++ + st.b dst++, r10 + sub r8, 1 + brge 1b + + rjmp .Laligned_copy Index: uClibc-0.9.28-avr32/libc/string/avr32/test_memcpy.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ uClibc-0.9.28-avr32/libc/string/avr32/test_memcpy.c 2006-10-19 15:05:52.000000000 +0200 @@ -0,0 +1,66 @@ + +#include <stdio.h> +#include <string.h> + +#define BUF_SIZE 32768 + +static char buf1[BUF_SIZE] __attribute__((aligned(32))); +static char buf1_ref[BUF_SIZE] __attribute__((aligned(32))); +static char buf2[BUF_SIZE] __attribute__((aligned(32))); + +extern void *new_memcpy(void *dest, void *src, size_t len); + +void dump_mismatch(char *buf, char *ref, size_t len) +{ + int i, j; + + for (i = 0; i < len; i += 16) { + if (memcmp(buf + i, ref + i, 16) == 0) + continue; + + printf("%4x buf:", i); + for (j = i; j < (i + 16); j++) + printf(" %02x", (unsigned char)buf[j]); + printf("\n ref:"); + for (j = i; j < (i + 16); j++) + printf(" %02x", (unsigned char)ref[j]); + printf("\n"); + } +} + +void test(int src_offset, int dst_offset, int len) +{ + memset(buf1, 0x55, sizeof(buf1)); + memset(buf1_ref, 0x55, sizeof(buf1_ref)); + memset(buf2, 0xaa, sizeof(buf2)); + + printf("Testing with offsets %d => %d and len %d...", + src_offset, dst_offset, len); + + new_memcpy(buf1 + dst_offset, buf2 + src_offset, len); + memcpy(buf1_ref + dst_offset, buf2 + src_offset, len); + + if (memcmp(buf1, buf1_ref, sizeof(buf1)) == 0) + printf("OK\n"); + else { + printf("FAILED\n"); + dump_mismatch(buf1, buf1_ref, sizeof(buf1)); + } +} + +int main(int argc,
char *argv[]) +{ + test(0, 0, BUF_SIZE); + test(0, 0, 1); + test(0, 0, 31); + test(0, 0, 32); + test(0, 0, 127); + test(0, 0, 128); + test(4, 4, BUF_SIZE - 4); + test(1, 1, BUF_SIZE - 1); + test(1, 1, 126); + test(0, 3, 128); + test(1, 4, 128); + + return 0; +}