diff options
Diffstat (limited to 'toolchain/musl/patches/000-update-to-git-2016-01-22.patch')
-rw-r--r-- | toolchain/musl/patches/000-update-to-git-2016-01-22.patch | 7636 |
1 files changed, 7636 insertions, 0 deletions
diff --git a/toolchain/musl/patches/000-update-to-git-2016-01-22.patch b/toolchain/musl/patches/000-update-to-git-2016-01-22.patch new file mode 100644 index 0000000000..f5fc159ad1 --- /dev/null +++ b/toolchain/musl/patches/000-update-to-git-2016-01-22.patch @@ -0,0 +1,7636 @@ +--- a/.gitignore ++++ b/.gitignore +@@ -5,9 +5,6 @@ + *.so.1 + arch/*/bits/alltypes.h + config.mak +-include/bits +-tools/musl-gcc +-tools/musl-clang +-tools/ld.musl-clang + lib/musl-gcc.specs + src/internal/version.h ++/obj/ +--- a/Makefile ++++ b/Makefile +@@ -8,6 +8,7 @@ + # Do not make changes here. + # + ++srcdir = . + exec_prefix = /usr/local + bindir = $(exec_prefix)/bin + +@@ -16,31 +17,38 @@ includedir = $(prefix)/include + libdir = $(prefix)/lib + syslibdir = /lib + +-SRCS = $(sort $(wildcard src/*/*.c arch/$(ARCH)/src/*.c)) +-OBJS = $(SRCS:.c=.o) ++BASE_SRCS = $(sort $(wildcard $(srcdir)/src/*/*.c $(srcdir)/arch/$(ARCH)/src/*.[csS])) ++BASE_OBJS = $(patsubst $(srcdir)/%,%.o,$(basename $(BASE_SRCS))) ++ARCH_SRCS = $(wildcard $(srcdir)/src/*/$(ARCH)/*.[csS]) ++ARCH_OBJS = $(patsubst $(srcdir)/%,%.o,$(basename $(ARCH_SRCS))) ++REPLACED_OBJS = $(sort $(subst /$(ARCH)/,/,$(ARCH_OBJS))) ++OBJS = $(addprefix obj/, $(filter-out $(REPLACED_OBJS), $(sort $(BASE_OBJS) $(ARCH_OBJS)))) + LOBJS = $(OBJS:.o=.lo) +-GENH = include/bits/alltypes.h +-GENH_INT = src/internal/version.h +-IMPH = src/internal/stdio_impl.h src/internal/pthread_impl.h src/internal/libc.h ++GENH = obj/include/bits/alltypes.h ++GENH_INT = obj/src/internal/version.h ++IMPH = $(addprefix $(srcdir)/, src/internal/stdio_impl.h src/internal/pthread_impl.h src/internal/libc.h) + +-LDFLAGS = ++LDFLAGS = ++LDFLAGS_AUTO = + LIBCC = -lgcc + CPPFLAGS = +-CFLAGS = -Os -pipe ++CFLAGS = ++CFLAGS_AUTO = -Os -pipe + CFLAGS_C99FSE = -std=c99 -ffreestanding -nostdinc + + CFLAGS_ALL = $(CFLAGS_C99FSE) +-CFLAGS_ALL += -D_XOPEN_SOURCE=700 -I./arch/$(ARCH) -I./src/internal -I./include +-CFLAGS_ALL += $(CPPFLAGS) $(CFLAGS) +-CFLAGS_ALL_STATIC = 
$(CFLAGS_ALL) +-CFLAGS_ALL_SHARED = $(CFLAGS_ALL) -fPIC -DSHARED ++CFLAGS_ALL += -D_XOPEN_SOURCE=700 -I$(srcdir)/arch/$(ARCH) -Iobj/src/internal -I$(srcdir)/src/internal -Iobj/include -I$(srcdir)/include ++CFLAGS_ALL += $(CPPFLAGS) $(CFLAGS_AUTO) $(CFLAGS) ++ ++LDFLAGS_ALL = $(LDFLAGS_AUTO) $(LDFLAGS) + + AR = $(CROSS_COMPILE)ar + RANLIB = $(CROSS_COMPILE)ranlib +-INSTALL = ./tools/install.sh ++INSTALL = $(srcdir)/tools/install.sh + +-ARCH_INCLUDES = $(wildcard arch/$(ARCH)/bits/*.h) +-ALL_INCLUDES = $(sort $(wildcard include/*.h include/*/*.h) $(GENH) $(ARCH_INCLUDES:arch/$(ARCH)/%=include/%)) ++ARCH_INCLUDES = $(wildcard $(srcdir)/arch/$(ARCH)/bits/*.h) ++INCLUDES = $(wildcard $(srcdir)/include/*.h $(srcdir)/include/*/*.h) ++ALL_INCLUDES = $(sort $(INCLUDES:$(srcdir)/%=%) $(GENH:obj/%=%) $(ARCH_INCLUDES:$(srcdir)/arch/$(ARCH)/%=include/%)) + + EMPTY_LIB_NAMES = m rt pthread crypt util xnet resolv dl + EMPTY_LIBS = $(EMPTY_LIB_NAMES:%=lib/lib%.a) +@@ -49,7 +57,7 @@ STATIC_LIBS = lib/libc.a + SHARED_LIBS = lib/libc.so + TOOL_LIBS = lib/musl-gcc.specs + ALL_LIBS = $(CRT_LIBS) $(STATIC_LIBS) $(SHARED_LIBS) $(EMPTY_LIBS) $(TOOL_LIBS) +-ALL_TOOLS = tools/musl-gcc ++ALL_TOOLS = obj/musl-gcc + + WRAPCC_GCC = gcc + WRAPCC_CLANG = clang +@@ -58,95 +66,93 @@ LDSO_PATHNAME = $(syslibdir)/ld-musl-$(A + + -include config.mak + ++ifeq ($(ARCH),) ++$(error Please set ARCH in config.mak before running make.) 
++endif ++ + all: $(ALL_LIBS) $(ALL_TOOLS) + ++OBJ_DIRS = $(sort $(patsubst %/,%,$(dir $(ALL_LIBS) $(ALL_TOOLS) $(OBJS) $(GENH) $(GENH_INT))) $(addprefix obj/, crt crt/$(ARCH) include)) ++ ++$(ALL_LIBS) $(ALL_TOOLS) $(CRT_LIBS:lib/%=obj/crt/%) $(OBJS) $(LOBJS) $(GENH) $(GENH_INT): | $(OBJ_DIRS) ++ ++$(OBJ_DIRS): ++ mkdir -p $@ ++ + install: install-libs install-headers install-tools + + clean: +- rm -f crt/*.o +- rm -f $(OBJS) +- rm -f $(LOBJS) +- rm -f $(ALL_LIBS) lib/*.[ao] lib/*.so +- rm -f $(ALL_TOOLS) +- rm -f $(GENH) $(GENH_INT) +- rm -f include/bits ++ rm -rf obj lib + + distclean: clean + rm -f config.mak + +-include/bits: +- @test "$(ARCH)" || { echo "Please set ARCH in config.mak before running make." ; exit 1 ; } +- ln -sf ../arch/$(ARCH)/bits $@ ++obj/include/bits/alltypes.h: $(srcdir)/arch/$(ARCH)/bits/alltypes.h.in $(srcdir)/include/alltypes.h.in $(srcdir)/tools/mkalltypes.sed ++ sed -f $(srcdir)/tools/mkalltypes.sed $(srcdir)/arch/$(ARCH)/bits/alltypes.h.in $(srcdir)/include/alltypes.h.in > $@ + +-include/bits/alltypes.h.in: include/bits ++obj/src/internal/version.h: $(wildcard $(srcdir)/VERSION $(srcdir)/.git) ++ printf '#define VERSION "%s"\n' "$$(cd $(srcdir); sh tools/version.sh)" > $@ + +-include/bits/alltypes.h: include/bits/alltypes.h.in include/alltypes.h.in tools/mkalltypes.sed +- sed -f tools/mkalltypes.sed include/bits/alltypes.h.in include/alltypes.h.in > $@ ++obj/src/internal/version.o obj/src/internal/version.lo: obj/src/internal/version.h + +-src/internal/version.h: $(wildcard VERSION .git) +- printf '#define VERSION "%s"\n' "$$(sh tools/version.sh)" > $@ ++obj/crt/rcrt1.o obj/src/ldso/dlstart.lo obj/src/ldso/dynlink.lo: $(srcdir)/src/internal/dynlink.h $(srcdir)/arch/$(ARCH)/reloc.h + +-src/internal/version.lo: src/internal/version.h ++obj/crt/crt1.o obj/crt/scrt1.o obj/crt/rcrt1.o obj/src/ldso/dlstart.lo: $(srcdir)/arch/$(ARCH)/crt_arch.h + +-crt/rcrt1.o src/ldso/dlstart.lo src/ldso/dynlink.lo: src/internal/dynlink.h 
arch/$(ARCH)/reloc.h ++obj/crt/rcrt1.o: $(srcdir)/src/ldso/dlstart.c + +-crt/crt1.o crt/Scrt1.o crt/rcrt1.o src/ldso/dlstart.lo: $(wildcard arch/$(ARCH)/crt_arch.h) ++obj/crt/Scrt1.o obj/crt/rcrt1.o: CFLAGS_ALL += -fPIC + +-crt/rcrt1.o: src/ldso/dlstart.c ++obj/crt/$(ARCH)/crti.o: $(srcdir)/crt/$(ARCH)/crti.s + +-crt/Scrt1.o crt/rcrt1.o: CFLAGS += -fPIC ++obj/crt/$(ARCH)/crtn.o: $(srcdir)/crt/$(ARCH)/crtn.s + +-OPTIMIZE_SRCS = $(wildcard $(OPTIMIZE_GLOBS:%=src/%)) +-$(OPTIMIZE_SRCS:%.c=%.o) $(OPTIMIZE_SRCS:%.c=%.lo): CFLAGS += -O3 ++OPTIMIZE_SRCS = $(wildcard $(OPTIMIZE_GLOBS:%=$(srcdir)/src/%)) ++$(OPTIMIZE_SRCS:$(srcdir)/%.c=obj/%.o) $(OPTIMIZE_SRCS:$(srcdir)/%.c=obj/%.lo): CFLAGS += -O3 + + MEMOPS_SRCS = src/string/memcpy.c src/string/memmove.c src/string/memcmp.c src/string/memset.c +-$(MEMOPS_SRCS:%.c=%.o) $(MEMOPS_SRCS:%.c=%.lo): CFLAGS += $(CFLAGS_MEMOPS) ++$(MEMOPS_SRCS:%.c=obj/%.o) $(MEMOPS_SRCS:%.c=obj/%.lo): CFLAGS_ALL += $(CFLAGS_MEMOPS) + + NOSSP_SRCS = $(wildcard crt/*.c) \ + src/env/__libc_start_main.c src/env/__init_tls.c \ + src/thread/__set_thread_area.c src/env/__stack_chk_fail.c \ + src/string/memset.c src/string/memcpy.c \ + src/ldso/dlstart.c src/ldso/dynlink.c +-$(NOSSP_SRCS:%.c=%.o) $(NOSSP_SRCS:%.c=%.lo): CFLAGS += $(CFLAGS_NOSSP) ++$(NOSSP_SRCS:%.c=obj/%.o) $(NOSSP_SRCS:%.c=obj/%.lo): CFLAGS_ALL += $(CFLAGS_NOSSP) ++ ++$(CRT_LIBS:lib/%=obj/crt/%): CFLAGS_ALL += -DCRT + +-$(CRT_LIBS:lib/%=crt/%): CFLAGS += -DCRT ++$(LOBJS): CFLAGS_ALL += -fPIC -DSHARED + +-# This incantation ensures that changes to any subarch asm files will +-# force the corresponding object file to be rebuilt, even if the implicit +-# rule below goes indirectly through a .sub file. 
+-define mkasmdep +-$(dir $(patsubst %/,%,$(dir $(1))))$(notdir $(1:.s=.o)): $(1) +-endef +-$(foreach s,$(wildcard src/*/$(ARCH)*/*.s),$(eval $(call mkasmdep,$(s)))) ++CC_CMD = $(CC) $(CFLAGS_ALL) -c -o $@ $< + + # Choose invocation of assembler to be used +-# $(1) is input file, $(2) is output file, $(3) is assembler flags + ifeq ($(ADD_CFI),yes) +- AS_CMD = LC_ALL=C awk -f tools/add-cfi.common.awk -f tools/add-cfi.$(ARCH).awk $< | $(CC) -x assembler -c -o $@ - ++ AS_CMD = LC_ALL=C awk -f $(srcdir)/tools/add-cfi.common.awk -f $(srcdir)/tools/add-cfi.$(ARCH).awk $< | $(CC) $(CFLAGS_ALL) -x assembler -c -o $@ - + else +- AS_CMD = $(CC) -c -o $@ $< ++ AS_CMD = $(CC_CMD) + endif + +-%.o: $(ARCH)$(ASMSUBARCH)/%.sub +- $(CC) $(CFLAGS_ALL_STATIC) -c -o $@ $(dir $<)$(shell cat $<) ++obj/%.o: $(srcdir)/%.s ++ $(AS_CMD) + +-%.o: $(ARCH)/%.s +- $(AS_CMD) $(CFLAGS_ALL_STATIC) ++obj/%.o: $(srcdir)/%.S ++ $(CC_CMD) + +-%.o: %.c $(GENH) $(IMPH) +- $(CC) $(CFLAGS_ALL_STATIC) -c -o $@ $< ++obj/%.o: $(srcdir)/%.c $(GENH) $(IMPH) ++ $(CC_CMD) + +-%.lo: $(ARCH)$(ASMSUBARCH)/%.sub +- $(CC) $(CFLAGS_ALL_SHARED) -c -o $@ $(dir $<)$(shell cat $<) ++obj/%.lo: $(srcdir)/%.s ++ $(AS_CMD) + +-%.lo: $(ARCH)/%.s +- $(AS_CMD) $(CFLAGS_ALL_SHARED) ++obj/%.lo: $(srcdir)/%.S ++ $(CC_CMD) + +-%.lo: %.c $(GENH) $(IMPH) +- $(CC) $(CFLAGS_ALL_SHARED) -c -o $@ $< ++obj/%.lo: $(srcdir)/%.c $(GENH) $(IMPH) ++ $(CC_CMD) + + lib/libc.so: $(LOBJS) +- $(CC) $(CFLAGS_ALL_SHARED) $(LDFLAGS) -nostdlib -shared \ ++ $(CC) $(CFLAGS_ALL) $(LDFLAGS_ALL) -nostdlib -shared \ + -Wl,-e,_dlstart -Wl,-Bsymbolic-functions \ + -o $@ $(LOBJS) $(LIBCC) + +@@ -159,21 +165,27 @@ $(EMPTY_LIBS): + rm -f $@ + $(AR) rc $@ + +-lib/%.o: crt/%.o ++lib/%.o: obj/crt/%.o + cp $< $@ + +-lib/musl-gcc.specs: tools/musl-gcc.specs.sh config.mak ++lib/crti.o: obj/crt/$(ARCH)/crti.o ++ cp $< $@ ++ ++lib/crtn.o: obj/crt/$(ARCH)/crtn.o ++ cp $< $@ ++ ++lib/musl-gcc.specs: $(srcdir)/tools/musl-gcc.specs.sh config.mak + sh $< "$(includedir)" 
"$(libdir)" "$(LDSO_PATHNAME)" > $@ + +-tools/musl-gcc: config.mak ++obj/musl-gcc: config.mak + printf '#!/bin/sh\nexec "$${REALGCC:-$(WRAPCC_GCC)}" "$$@" -specs "%s/musl-gcc.specs"\n' "$(libdir)" > $@ + chmod +x $@ + +-tools/%-clang: tools/%-clang.in config.mak ++obj/%-clang: $(srcdir)/tools/%-clang.in config.mak + sed -e 's!@CC@!$(WRAPCC_CLANG)!g' -e 's!@PREFIX@!$(prefix)!g' -e 's!@INCDIR@!$(includedir)!g' -e 's!@LIBDIR@!$(libdir)!g' -e 's!@LDSO@!$(LDSO_PATHNAME)!g' $< > $@ + chmod +x $@ + +-$(DESTDIR)$(bindir)/%: tools/% ++$(DESTDIR)$(bindir)/%: obj/% + $(INSTALL) -D $< $@ + + $(DESTDIR)$(libdir)/%.so: lib/%.so +@@ -182,10 +194,13 @@ $(DESTDIR)$(libdir)/%.so: lib/%.so + $(DESTDIR)$(libdir)/%: lib/% + $(INSTALL) -D -m 644 $< $@ + +-$(DESTDIR)$(includedir)/bits/%: arch/$(ARCH)/bits/% ++$(DESTDIR)$(includedir)/bits/%: $(srcdir)/arch/$(ARCH)/bits/% ++ $(INSTALL) -D -m 644 $< $@ ++ ++$(DESTDIR)$(includedir)/bits/%: obj/include/bits/% + $(INSTALL) -D -m 644 $< $@ + +-$(DESTDIR)$(includedir)/%: include/% ++$(DESTDIR)$(includedir)/%: $(srcdir)/include/% + $(INSTALL) -D -m 644 $< $@ + + $(DESTDIR)$(LDSO_PATHNAME): $(DESTDIR)$(libdir)/libc.so +@@ -195,12 +210,12 @@ install-libs: $(ALL_LIBS:lib/%=$(DESTDIR + + install-headers: $(ALL_INCLUDES:include/%=$(DESTDIR)$(includedir)/%) + +-install-tools: $(ALL_TOOLS:tools/%=$(DESTDIR)$(bindir)/%) ++install-tools: $(ALL_TOOLS:obj/%=$(DESTDIR)$(bindir)/%) + + musl-git-%.tar.gz: .git +- git archive --format=tar.gz --prefix=$(patsubst %.tar.gz,%,$@)/ -o $@ $(patsubst musl-git-%.tar.gz,%,$@) ++ git --git-dir=$(srcdir)/.git archive --format=tar.gz --prefix=$(patsubst %.tar.gz,%,$@)/ -o $@ $(patsubst musl-git-%.tar.gz,%,$@) + + musl-%.tar.gz: .git +- git archive --format=tar.gz --prefix=$(patsubst %.tar.gz,%,$@)/ -o $@ v$(patsubst musl-%.tar.gz,%,$@) ++ git --git-dir=$(srcdir)/.git archive --format=tar.gz --prefix=$(patsubst %.tar.gz,%,$@)/ -o $@ v$(patsubst musl-%.tar.gz,%,$@) + + .PHONY: all clean install install-libs install-headers 
install-tools +--- a/arch/aarch64/atomic.h ++++ /dev/null +@@ -1,206 +0,0 @@ +-#ifndef _INTERNAL_ATOMIC_H +-#define _INTERNAL_ATOMIC_H +- +-#include <stdint.h> +- +-static inline int a_ctz_64(uint64_t x) +-{ +- __asm__( +- " rbit %0, %1\n" +- " clz %0, %0\n" +- : "=r"(x) : "r"(x)); +- return x; +-} +- +-static inline int a_ctz_l(unsigned long x) +-{ +- return a_ctz_64(x); +-} +- +-static inline void a_barrier() +-{ +- __asm__ __volatile__("dmb ish"); +-} +- +-static inline void *a_cas_p(volatile void *p, void *t, void *s) +-{ +- void *old; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldxr %0,%3\n" +- " cmp %0,%1\n" +- " b.ne 1f\n" +- " stxr %w0,%2,%3\n" +- " cbnz %w0,1b\n" +- " mov %0,%1\n" +- "1: dmb ish\n" +- : "=&r"(old) +- : "r"(t), "r"(s), "Q"(*(long*)p) +- : "memory", "cc"); +- return old; +-} +- +-static inline int a_cas(volatile int *p, int t, int s) +-{ +- int old; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldxr %w0,%3\n" +- " cmp %w0,%w1\n" +- " b.ne 1f\n" +- " stxr %w0,%w2,%3\n" +- " cbnz %w0,1b\n" +- " mov %w0,%w1\n" +- "1: dmb ish\n" +- : "=&r"(old) +- : "r"(t), "r"(s), "Q"(*p) +- : "memory", "cc"); +- return old; +-} +- +-static inline int a_swap(volatile int *x, int v) +-{ +- int old, tmp; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldxr %w0,%3\n" +- " stxr %w1,%w2,%3\n" +- " cbnz %w1,1b\n" +- " dmb ish\n" +- : "=&r"(old), "=&r"(tmp) +- : "r"(v), "Q"(*x) +- : "memory", "cc" ); +- return old; +-} +- +-static inline int a_fetch_add(volatile int *x, int v) +-{ +- int old, tmp; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldxr %w0,%3\n" +- " add %w0,%w0,%w2\n" +- " stxr %w1,%w0,%3\n" +- " cbnz %w1,1b\n" +- " dmb ish\n" +- : "=&r"(old), "=&r"(tmp) +- : "r"(v), "Q"(*x) +- : "memory", "cc" ); +- return old-v; +-} +- +-static inline void a_inc(volatile int *x) +-{ +- int tmp, tmp2; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldxr %w0,%2\n" +- " add %w0,%w0,#1\n" +- " stxr %w1,%w0,%2\n" +- " cbnz %w1,1b\n" +- " dmb ish\n" +- : "=&r"(tmp), 
"=&r"(tmp2) +- : "Q"(*x) +- : "memory", "cc" ); +-} +- +-static inline void a_dec(volatile int *x) +-{ +- int tmp, tmp2; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldxr %w0,%2\n" +- " sub %w0,%w0,#1\n" +- " stxr %w1,%w0,%2\n" +- " cbnz %w1,1b\n" +- " dmb ish\n" +- : "=&r"(tmp), "=&r"(tmp2) +- : "Q"(*x) +- : "memory", "cc" ); +-} +- +-static inline void a_and_64(volatile uint64_t *p, uint64_t v) +-{ +- int tmp, tmp2; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldxr %0,%3\n" +- " and %0,%0,%2\n" +- " stxr %w1,%0,%3\n" +- " cbnz %w1,1b\n" +- " dmb ish\n" +- : "=&r"(tmp), "=&r"(tmp2) +- : "r"(v), "Q"(*p) +- : "memory", "cc" ); +-} +- +-static inline void a_and(volatile int *p, int v) +-{ +- int tmp, tmp2; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldxr %w0,%3\n" +- " and %w0,%w0,%w2\n" +- " stxr %w1,%w0,%3\n" +- " cbnz %w1,1b\n" +- " dmb ish\n" +- : "=&r"(tmp), "=&r"(tmp2) +- : "r"(v), "Q"(*p) +- : "memory", "cc" ); +-} +- +-static inline void a_or_64(volatile uint64_t *p, uint64_t v) +-{ +- int tmp, tmp2; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldxr %0,%3\n" +- " orr %0,%0,%2\n" +- " stxr %w1,%0,%3\n" +- " cbnz %w1,1b\n" +- " dmb ish\n" +- : "=&r"(tmp), "=&r"(tmp2) +- : "r"(v), "Q"(*p) +- : "memory", "cc" ); +-} +- +-static inline void a_or_l(volatile void *p, long v) +-{ +- return a_or_64(p, v); +-} +- +-static inline void a_or(volatile int *p, int v) +-{ +- int tmp, tmp2; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldxr %w0,%3\n" +- " orr %w0,%w0,%w2\n" +- " stxr %w1,%w0,%3\n" +- " cbnz %w1,1b\n" +- " dmb ish\n" +- : "=&r"(tmp), "=&r"(tmp2) +- : "r"(v), "Q"(*p) +- : "memory", "cc" ); +-} +- +-static inline void a_store(volatile int *p, int x) +-{ +- __asm__ __volatile__( +- " dmb ish\n" +- " str %w1,%0\n" +- " dmb ish\n" +- : "=m"(*p) +- : "r"(x) +- : "memory", "cc" ); +-} +- +-#define a_spin a_barrier +- +-static inline void a_crash() +-{ +- *(volatile char *)0=0; +-} +- +- +-#endif +--- /dev/null ++++ b/arch/aarch64/atomic_arch.h +@@ -0,0 
+1,53 @@ ++#define a_ll a_ll ++static inline int a_ll(volatile int *p) ++{ ++ int v; ++ __asm__ __volatile__ ("ldxr %0, %1" : "=r"(v) : "Q"(*p)); ++ return v; ++} ++ ++#define a_sc a_sc ++static inline int a_sc(volatile int *p, int v) ++{ ++ int r; ++ __asm__ __volatile__ ("stxr %w0,%1,%2" : "=&r"(r) : "r"(v), "Q"(*p) : "memory"); ++ return !r; ++} ++ ++#define a_barrier a_barrier ++static inline void a_barrier() ++{ ++ __asm__ __volatile__ ("dmb ish" : : : "memory"); ++} ++ ++#define a_pre_llsc a_barrier ++#define a_post_llsc a_barrier ++ ++#define a_cas_p a_cas_p ++static inline void *a_cas_p(volatile void *p, void *t, void *s) ++{ ++ void *old; ++ __asm__ __volatile__( ++ " dmb ish\n" ++ "1: ldxr %0,%3\n" ++ " cmp %0,%1\n" ++ " b.ne 1f\n" ++ " stxr %w0,%2,%3\n" ++ " cbnz %w0,1b\n" ++ " mov %0,%1\n" ++ "1: dmb ish\n" ++ : "=&r"(old) ++ : "r"(t), "r"(s), "Q"(*(void *volatile *)p) ++ : "memory", "cc"); ++ return old; ++} ++ ++#define a_ctz_64 a_ctz_64 ++static inline int a_ctz_64(uint64_t x) ++{ ++ __asm__( ++ " rbit %0, %1\n" ++ " clz %0, %0\n" ++ : "=r"(x) : "r"(x)); ++ return x; ++} +--- a/arch/aarch64/pthread_arch.h ++++ b/arch/aarch64/pthread_arch.h +@@ -8,4 +8,4 @@ static inline struct pthread *__pthread_ + #define TLS_ABOVE_TP + #define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) - 16) + +-#define CANCEL_REG_IP 33 ++#define MC_PC pc +--- a/arch/arm/atomic.h ++++ /dev/null +@@ -1,261 +0,0 @@ +-#ifndef _INTERNAL_ATOMIC_H +-#define _INTERNAL_ATOMIC_H +- +-#include <stdint.h> +- +-static inline int a_ctz_l(unsigned long x) +-{ +- static const char debruijn32[32] = { +- 0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13, +- 31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14 +- }; +- return debruijn32[(x&-x)*0x076be629 >> 27]; +-} +- +-static inline int a_ctz_64(uint64_t x) +-{ +- uint32_t y = x; +- if (!y) { +- y = x>>32; +- return 32 + a_ctz_l(y); +- } +- return a_ctz_l(y); +-} +- +-#if __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH >= 7 +- 
+-static inline void a_barrier() +-{ +- __asm__ __volatile__("dmb ish"); +-} +- +-static inline int a_cas(volatile int *p, int t, int s) +-{ +- int old; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldrex %0,%3\n" +- " cmp %0,%1\n" +- " bne 1f\n" +- " strex %0,%2,%3\n" +- " cmp %0, #0\n" +- " bne 1b\n" +- " mov %0, %1\n" +- "1: dmb ish\n" +- : "=&r"(old) +- : "r"(t), "r"(s), "Q"(*p) +- : "memory", "cc" ); +- return old; +-} +- +-static inline int a_swap(volatile int *x, int v) +-{ +- int old, tmp; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldrex %0,%3\n" +- " strex %1,%2,%3\n" +- " cmp %1, #0\n" +- " bne 1b\n" +- " dmb ish\n" +- : "=&r"(old), "=&r"(tmp) +- : "r"(v), "Q"(*x) +- : "memory", "cc" ); +- return old; +-} +- +-static inline int a_fetch_add(volatile int *x, int v) +-{ +- int old, tmp; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldrex %0,%3\n" +- " add %0,%0,%2\n" +- " strex %1,%0,%3\n" +- " cmp %1, #0\n" +- " bne 1b\n" +- " dmb ish\n" +- : "=&r"(old), "=&r"(tmp) +- : "r"(v), "Q"(*x) +- : "memory", "cc" ); +- return old-v; +-} +- +-static inline void a_inc(volatile int *x) +-{ +- int tmp, tmp2; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldrex %0,%2\n" +- " add %0,%0,#1\n" +- " strex %1,%0,%2\n" +- " cmp %1, #0\n" +- " bne 1b\n" +- " dmb ish\n" +- : "=&r"(tmp), "=&r"(tmp2) +- : "Q"(*x) +- : "memory", "cc" ); +-} +- +-static inline void a_dec(volatile int *x) +-{ +- int tmp, tmp2; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldrex %0,%2\n" +- " sub %0,%0,#1\n" +- " strex %1,%0,%2\n" +- " cmp %1, #0\n" +- " bne 1b\n" +- " dmb ish\n" +- : "=&r"(tmp), "=&r"(tmp2) +- : "Q"(*x) +- : "memory", "cc" ); +-} +- +-static inline void a_and(volatile int *x, int v) +-{ +- int tmp, tmp2; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldrex %0,%3\n" +- " and %0,%0,%2\n" +- " strex %1,%0,%3\n" +- " cmp %1, #0\n" +- " bne 1b\n" +- " dmb ish\n" +- : "=&r"(tmp), "=&r"(tmp2) +- : "r"(v), "Q"(*x) +- : "memory", "cc" ); +-} +- +-static inline void a_or(volatile 
int *x, int v) +-{ +- int tmp, tmp2; +- __asm__ __volatile__( +- " dmb ish\n" +- "1: ldrex %0,%3\n" +- " orr %0,%0,%2\n" +- " strex %1,%0,%3\n" +- " cmp %1, #0\n" +- " bne 1b\n" +- " dmb ish\n" +- : "=&r"(tmp), "=&r"(tmp2) +- : "r"(v), "Q"(*x) +- : "memory", "cc" ); +-} +- +-static inline void a_store(volatile int *p, int x) +-{ +- __asm__ __volatile__( +- " dmb ish\n" +- " str %1,%0\n" +- " dmb ish\n" +- : "=m"(*p) +- : "r"(x) +- : "memory", "cc" ); +-} +- +-#else +- +-int __a_cas(int, int, volatile int *) __attribute__((__visibility__("hidden"))); +-#define __k_cas __a_cas +- +-static inline void a_barrier() +-{ +- __asm__ __volatile__("bl __a_barrier" +- : : : "memory", "cc", "ip", "lr" ); +-} +- +-static inline int a_cas(volatile int *p, int t, int s) +-{ +- int old; +- for (;;) { +- if (!__k_cas(t, s, p)) +- return t; +- if ((old=*p) != t) +- return old; +- } +-} +- +-static inline int a_swap(volatile int *x, int v) +-{ +- int old; +- do old = *x; +- while (__k_cas(old, v, x)); +- return old; +-} +- +-static inline int a_fetch_add(volatile int *x, int v) +-{ +- int old; +- do old = *x; +- while (__k_cas(old, old+v, x)); +- return old; +-} +- +-static inline void a_inc(volatile int *x) +-{ +- a_fetch_add(x, 1); +-} +- +-static inline void a_dec(volatile int *x) +-{ +- a_fetch_add(x, -1); +-} +- +-static inline void a_store(volatile int *p, int x) +-{ +- a_barrier(); +- *p = x; +- a_barrier(); +-} +- +-static inline void a_and(volatile int *p, int v) +-{ +- int old; +- do old = *p; +- while (__k_cas(old, old&v, p)); +-} +- +-static inline void a_or(volatile int *p, int v) +-{ +- int old; +- do old = *p; +- while (__k_cas(old, old|v, p)); +-} +- +-#endif +- +-static inline void *a_cas_p(volatile void *p, void *t, void *s) +-{ +- return (void *)a_cas(p, (int)t, (int)s); +-} +- +-#define a_spin a_barrier +- +-static inline void a_crash() +-{ +- *(volatile char *)0=0; +-} +- +-static inline void a_or_l(volatile void *p, long v) +-{ +- a_or(p, v); +-} +- +-static 
inline void a_and_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- a_and((int *)p, u.r[0]); +- a_and((int *)p+1, u.r[1]); +-} +- +-static inline void a_or_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- a_or((int *)p, u.r[0]); +- a_or((int *)p+1, u.r[1]); +-} +- +-#endif +--- /dev/null ++++ b/arch/arm/atomic_arch.h +@@ -0,0 +1,64 @@ ++__attribute__((__visibility__("hidden"))) ++extern const void *__arm_atomics[3]; /* gettp, cas, barrier */ ++ ++#if ((__ARM_ARCH_6__ || __ARM_ARCH_6K__ || __ARM_ARCH_6ZK__) && !__thumb__) \ ++ || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH >= 7 ++ ++#define a_ll a_ll ++static inline int a_ll(volatile int *p) ++{ ++ int v; ++ __asm__ __volatile__ ("ldrex %0, %1" : "=r"(v) : "Q"(*p)); ++ return v; ++} ++ ++#define a_sc a_sc ++static inline int a_sc(volatile int *p, int v) ++{ ++ int r; ++ __asm__ __volatile__ ("strex %0,%1,%2" : "=&r"(r) : "r"(v), "Q"(*p) : "memory"); ++ return !r; ++} ++ ++#if __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH >= 7 ++ ++#define a_barrier a_barrier ++static inline void a_barrier() ++{ ++ __asm__ __volatile__ ("dmb ish" : : : "memory"); ++} ++ ++#endif ++ ++#define a_pre_llsc a_barrier ++#define a_post_llsc a_barrier ++ ++#else ++ ++#define a_cas a_cas ++static inline int a_cas(volatile int *p, int t, int s) ++{ ++ for (;;) { ++ register int r0 __asm__("r0") = t; ++ register int r1 __asm__("r1") = s; ++ register volatile int *r2 __asm__("r2") = p; ++ int old; ++ __asm__ __volatile__ ( ++ "bl __a_cas" ++ : "+r"(r0) : "r"(r1), "r"(r2) ++ : "memory", "r3", "lr", "ip", "cc" ); ++ if (!r0) return t; ++ if ((old=*p)!=t) return old; ++ } ++} ++ ++#endif ++ ++#ifndef a_barrier ++#define a_barrier a_barrier ++static inline void a_barrier() ++{ ++ __asm__ __volatile__("bl __a_barrier" ++ : : : "memory", "cc", "ip", "lr" ); ++} ++#endif +--- a/arch/arm/pthread_arch.h ++++ b/arch/arm/pthread_arch.h +@@ -27,4 +27,4 @@ 
static inline pthread_t __pthread_self() + #define TLS_ABOVE_TP + #define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) - 8) + +-#define CANCEL_REG_IP 18 ++#define MC_PC arm_pc +--- a/arch/arm/reloc.h ++++ b/arch/arm/reloc.h +@@ -6,10 +6,10 @@ + #define ENDIAN_SUFFIX "" + #endif + +-#if __SOFTFP__ +-#define FP_SUFFIX "" +-#else ++#if __ARM_PCS_VFP + #define FP_SUFFIX "hf" ++#else ++#define FP_SUFFIX "" + #endif + + #define LDSO_ARCH "arm" ENDIAN_SUFFIX FP_SUFFIX +@@ -28,10 +28,5 @@ + #define REL_TPOFF R_ARM_TLS_TPOFF32 + //#define REL_TLSDESC R_ARM_TLS_DESC + +-#ifdef __thumb__ + #define CRTJMP(pc,sp) __asm__ __volatile__( \ + "mov sp,%1 ; bx %0" : : "r"(pc), "r"(sp) : "memory" ) +-#else +-#define CRTJMP(pc,sp) __asm__ __volatile__( \ +- "mov sp,%1 ; tst %0,#1 ; moveq pc,%0 ; bx %0" : : "r"(pc), "r"(sp) : "memory" ) +-#endif +--- a/arch/arm/src/__aeabi_atexit.c ++++ /dev/null +@@ -1,6 +0,0 @@ +-int __cxa_atexit(void (*func)(void *), void *arg, void *dso); +- +-int __aeabi_atexit (void *obj, void (*func) (void *), void *d) +-{ +- return __cxa_atexit (func, obj, d); +-} +--- a/arch/arm/src/__aeabi_memclr.c ++++ /dev/null +@@ -1,9 +0,0 @@ +-#include <string.h> +-#include "libc.h" +- +-void __aeabi_memclr(void *dest, size_t n) +-{ +- memset(dest, 0, n); +-} +-weak_alias(__aeabi_memclr, __aeabi_memclr4); +-weak_alias(__aeabi_memclr, __aeabi_memclr8); +--- a/arch/arm/src/__aeabi_memcpy.c ++++ /dev/null +@@ -1,9 +0,0 @@ +-#include <string.h> +-#include "libc.h" +- +-void __aeabi_memcpy(void *restrict dest, const void *restrict src, size_t n) +-{ +- memcpy(dest, src, n); +-} +-weak_alias(__aeabi_memcpy, __aeabi_memcpy4); +-weak_alias(__aeabi_memcpy, __aeabi_memcpy8); +--- a/arch/arm/src/__aeabi_memmove.c ++++ /dev/null +@@ -1,9 +0,0 @@ +-#include <string.h> +-#include "libc.h" +- +-void __aeabi_memmove(void *dest, const void *src, size_t n) +-{ +- memmove(dest, src, n); +-} +-weak_alias(__aeabi_memmove, __aeabi_memmove4); +-weak_alias(__aeabi_memmove, __aeabi_memmove8); 
+--- a/arch/arm/src/__aeabi_memset.c ++++ /dev/null +@@ -1,9 +0,0 @@ +-#include <string.h> +-#include "libc.h" +- +-void __aeabi_memset(void *dest, size_t n, int c) +-{ +- memset(dest, c, n); +-} +-weak_alias(__aeabi_memset, __aeabi_memset4); +-weak_alias(__aeabi_memset, __aeabi_memset8); +--- a/arch/arm/src/__set_thread_area.c ++++ /dev/null +@@ -1,49 +0,0 @@ +-#include <stdint.h> +-#include <elf.h> +-#include "pthread_impl.h" +-#include "libc.h" +- +-#define HWCAP_TLS (1 << 15) +- +-extern const unsigned char __attribute__((__visibility__("hidden"))) +- __a_barrier_dummy[], __a_barrier_oldkuser[], +- __a_barrier_v6[], __a_barrier_v7[], +- __a_cas_dummy[], __a_cas_v6[], __a_cas_v7[], +- __a_gettp_dummy[]; +- +-#define __a_barrier_kuser 0xffff0fa0 +-#define __a_cas_kuser 0xffff0fc0 +-#define __a_gettp_kuser 0xffff0fe0 +- +-extern uintptr_t __attribute__((__visibility__("hidden"))) +- __a_barrier_ptr, __a_cas_ptr, __a_gettp_ptr; +- +-#define SET(op,ver) (__a_##op##_ptr = \ +- (uintptr_t)__a_##op##_##ver - (uintptr_t)__a_##op##_dummy) +- +-int __set_thread_area(void *p) +-{ +-#if !__ARM_ARCH_7A__ && !__ARM_ARCH_7R__ && __ARM_ARCH < 7 +- if (__hwcap & HWCAP_TLS) { +- size_t *aux; +- SET(cas, v7); +- SET(barrier, v7); +- for (aux=libc.auxv; *aux; aux+=2) { +- if (*aux != AT_PLATFORM) continue; +- const char *s = (void *)aux[1]; +- if (s[0]!='v' || s[1]!='6' || s[2]-'0'<10u) break; +- SET(cas, v6); +- SET(barrier, v6); +- break; +- } +- } else { +- int ver = *(int *)0xffff0ffc; +- SET(gettp, kuser); +- SET(cas, kuser); +- SET(barrier, kuser); +- if (ver < 2) a_crash(); +- if (ver < 3) SET(barrier, oldkuser); +- } +-#endif +- return __syscall(0xf0005, p); +-} +--- a/arch/arm/src/arm/atomics.s ++++ /dev/null +@@ -1,116 +0,0 @@ +-.text +- +-.global __a_barrier +-.hidden __a_barrier +-.type __a_barrier,%function +-__a_barrier: +- ldr ip,1f +- ldr ip,[pc,ip] +- add pc,pc,ip +-1: .word __a_barrier_ptr-1b +-.global __a_barrier_dummy +-.hidden __a_barrier_dummy 
+-__a_barrier_dummy: +- tst lr,#1 +- moveq pc,lr +- bx lr +-.global __a_barrier_oldkuser +-.hidden __a_barrier_oldkuser +-__a_barrier_oldkuser: +- push {r0,r1,r2,r3,ip,lr} +- mov r1,r0 +- mov r2,sp +- ldr ip,=0xffff0fc0 +- mov lr,pc +- mov pc,ip +- pop {r0,r1,r2,r3,ip,lr} +- tst lr,#1 +- moveq pc,lr +- bx lr +-.global __a_barrier_v6 +-.hidden __a_barrier_v6 +-__a_barrier_v6: +- mcr p15,0,r0,c7,c10,5 +- bx lr +-.global __a_barrier_v7 +-.hidden __a_barrier_v7 +-__a_barrier_v7: +- .word 0xf57ff05b /* dmb ish */ +- bx lr +- +-.global __a_cas +-.hidden __a_cas +-.type __a_cas,%function +-__a_cas: +- ldr ip,1f +- ldr ip,[pc,ip] +- add pc,pc,ip +-1: .word __a_cas_ptr-1b +-.global __a_cas_dummy +-.hidden __a_cas_dummy +-__a_cas_dummy: +- mov r3,r0 +- ldr r0,[r2] +- subs r0,r3,r0 +- streq r1,[r2] +- tst lr,#1 +- moveq pc,lr +- bx lr +-.global __a_cas_v6 +-.hidden __a_cas_v6 +-__a_cas_v6: +- mov r3,r0 +- mcr p15,0,r0,c7,c10,5 +-1: .word 0xe1920f9f /* ldrex r0,[r2] */ +- subs r0,r3,r0 +- .word 0x01820f91 /* strexeq r0,r1,[r2] */ +- teqeq r0,#1 +- beq 1b +- mcr p15,0,r0,c7,c10,5 +- bx lr +-.global __a_cas_v7 +-.hidden __a_cas_v7 +-__a_cas_v7: +- mov r3,r0 +- .word 0xf57ff05b /* dmb ish */ +-1: .word 0xe1920f9f /* ldrex r0,[r2] */ +- subs r0,r3,r0 +- .word 0x01820f91 /* strexeq r0,r1,[r2] */ +- teqeq r0,#1 +- beq 1b +- .word 0xf57ff05b /* dmb ish */ +- bx lr +- +-.global __aeabi_read_tp +-.type __aeabi_read_tp,%function +-__aeabi_read_tp: +- +-.global __a_gettp +-.hidden __a_gettp +-.type __a_gettp,%function +-__a_gettp: +- ldr r0,1f +- ldr r0,[pc,r0] +- add pc,pc,r0 +-1: .word __a_gettp_ptr-1b +-.global __a_gettp_dummy +-.hidden __a_gettp_dummy +-__a_gettp_dummy: +- mrc p15,0,r0,c13,c0,3 +- bx lr +- +-.data +-.global __a_barrier_ptr +-.hidden __a_barrier_ptr +-__a_barrier_ptr: +- .word 0 +- +-.global __a_cas_ptr +-.hidden __a_cas_ptr +-__a_cas_ptr: +- .word 0 +- +-.global __a_gettp_ptr +-.hidden __a_gettp_ptr +-__a_gettp_ptr: +- .word 0 +--- a/arch/arm/src/find_exidx.c ++++ 
/dev/null +@@ -1,42 +0,0 @@ +-#define _GNU_SOURCE +-#include <link.h> +-#include <stdint.h> +- +-struct find_exidx_data { +- uintptr_t pc, exidx_start; +- int exidx_len; +-}; +- +-static int find_exidx(struct dl_phdr_info *info, size_t size, void *ptr) +-{ +- struct find_exidx_data *data = ptr; +- const ElfW(Phdr) *phdr = info->dlpi_phdr; +- uintptr_t addr, exidx_start = 0; +- int i, match = 0, exidx_len = 0; +- +- for (i = info->dlpi_phnum; i > 0; i--, phdr++) { +- addr = info->dlpi_addr + phdr->p_vaddr; +- switch (phdr->p_type) { +- case PT_LOAD: +- match |= data->pc >= addr && data->pc < addr + phdr->p_memsz; +- break; +- case PT_ARM_EXIDX: +- exidx_start = addr; +- exidx_len = phdr->p_memsz; +- break; +- } +- } +- data->exidx_start = exidx_start; +- data->exidx_len = exidx_len; +- return match; +-} +- +-uintptr_t __gnu_Unwind_Find_exidx(uintptr_t pc, int *pcount) +-{ +- struct find_exidx_data data; +- data.pc = pc; +- if (dl_iterate_phdr(find_exidx, &data) <= 0) +- return 0; +- *pcount = data.exidx_len / 8; +- return data.exidx_start; +-} +--- a/arch/i386/atomic.h ++++ /dev/null +@@ -1,110 +0,0 @@ +-#ifndef _INTERNAL_ATOMIC_H +-#define _INTERNAL_ATOMIC_H +- +-#include <stdint.h> +- +-static inline int a_ctz_64(uint64_t x) +-{ +- int r; +- __asm__( "bsf %1,%0 ; jnz 1f ; bsf %2,%0 ; addl $32,%0\n1:" +- : "=&r"(r) : "r"((unsigned)x), "r"((unsigned)(x>>32)) ); +- return r; +-} +- +-static inline int a_ctz_l(unsigned long x) +-{ +- long r; +- __asm__( "bsf %1,%0" : "=r"(r) : "r"(x) ); +- return r; +-} +- +-static inline void a_and_64(volatile uint64_t *p, uint64_t v) +-{ +- __asm__( "lock ; andl %1, (%0) ; lock ; andl %2, 4(%0)" +- : : "r"((long *)p), "r"((unsigned)v), "r"((unsigned)(v>>32)) : "memory" ); +-} +- +-static inline void a_or_64(volatile uint64_t *p, uint64_t v) +-{ +- __asm__( "lock ; orl %1, (%0) ; lock ; orl %2, 4(%0)" +- : : "r"((long *)p), "r"((unsigned)v), "r"((unsigned)(v>>32)) : "memory" ); +-} +- +-static inline void a_or_l(volatile void *p, 
long v) +-{ +- __asm__( "lock ; orl %1, %0" +- : "=m"(*(long *)p) : "r"(v) : "memory" ); +-} +- +-static inline void *a_cas_p(volatile void *p, void *t, void *s) +-{ +- __asm__( "lock ; cmpxchg %3, %1" +- : "=a"(t), "=m"(*(long *)p) : "a"(t), "r"(s) : "memory" ); +- return t; +-} +- +-static inline int a_cas(volatile int *p, int t, int s) +-{ +- __asm__( "lock ; cmpxchg %3, %1" +- : "=a"(t), "=m"(*p) : "a"(t), "r"(s) : "memory" ); +- return t; +-} +- +-static inline void a_or(volatile int *p, int v) +-{ +- __asm__( "lock ; orl %1, %0" +- : "=m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline void a_and(volatile int *p, int v) +-{ +- __asm__( "lock ; andl %1, %0" +- : "=m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline int a_swap(volatile int *x, int v) +-{ +- __asm__( "xchg %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" ); +- return v; +-} +- +-#define a_xchg a_swap +- +-static inline int a_fetch_add(volatile int *x, int v) +-{ +- __asm__( "lock ; xadd %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" ); +- return v; +-} +- +-static inline void a_inc(volatile int *x) +-{ +- __asm__( "lock ; incl %0" : "=m"(*x) : "m"(*x) : "memory" ); +-} +- +-static inline void a_dec(volatile int *x) +-{ +- __asm__( "lock ; decl %0" : "=m"(*x) : "m"(*x) : "memory" ); +-} +- +-static inline void a_store(volatile int *p, int x) +-{ +- __asm__( "movl %1, %0 ; lock ; orl $0,(%%esp)" : "=m"(*p) : "r"(x) : "memory" ); +-} +- +-static inline void a_spin() +-{ +- __asm__ __volatile__( "pause" : : : "memory" ); +-} +- +-static inline void a_barrier() +-{ +- __asm__ __volatile__( "" : : : "memory" ); +-} +- +-static inline void a_crash() +-{ +- __asm__ __volatile__( "hlt" : : : "memory" ); +-} +- +- +-#endif +--- /dev/null ++++ b/arch/i386/atomic_arch.h +@@ -0,0 +1,109 @@ ++#define a_ctz_64 a_ctz_64 ++static inline int a_ctz_64(uint64_t x) ++{ ++ int r; ++ __asm__( "bsf %1,%0 ; jnz 1f ; bsf %2,%0 ; addl $32,%0\n1:" ++ : "=&r"(r) : "r"((unsigned)x), "r"((unsigned)(x>>32)) ); ++ return r; 
++} ++ ++#define a_ctz_l a_ctz_l ++static inline int a_ctz_l(unsigned long x) ++{ ++ long r; ++ __asm__( "bsf %1,%0" : "=r"(r) : "r"(x) ); ++ return r; ++} ++ ++#define a_and_64 a_and_64 ++static inline void a_and_64(volatile uint64_t *p, uint64_t v) ++{ ++ __asm__( "lock ; andl %1, (%0) ; lock ; andl %2, 4(%0)" ++ : : "r"((long *)p), "r"((unsigned)v), "r"((unsigned)(v>>32)) : "memory" ); ++} ++ ++#define a_or_64 a_or_64 ++static inline void a_or_64(volatile uint64_t *p, uint64_t v) ++{ ++ __asm__( "lock ; orl %1, (%0) ; lock ; orl %2, 4(%0)" ++ : : "r"((long *)p), "r"((unsigned)v), "r"((unsigned)(v>>32)) : "memory" ); ++} ++ ++#define a_or_l a_or_l ++static inline void a_or_l(volatile void *p, long v) ++{ ++ __asm__( "lock ; orl %1, %0" ++ : "=m"(*(long *)p) : "r"(v) : "memory" ); ++} ++ ++#define a_cas a_cas ++static inline int a_cas(volatile int *p, int t, int s) ++{ ++ __asm__( "lock ; cmpxchg %3, %1" ++ : "=a"(t), "=m"(*p) : "a"(t), "r"(s) : "memory" ); ++ return t; ++} ++ ++#define a_or a_or ++static inline void a_or(volatile int *p, int v) ++{ ++ __asm__( "lock ; orl %1, %0" ++ : "=m"(*p) : "r"(v) : "memory" ); ++} ++ ++#define a_and a_and ++static inline void a_and(volatile int *p, int v) ++{ ++ __asm__( "lock ; andl %1, %0" ++ : "=m"(*p) : "r"(v) : "memory" ); ++} ++ ++#define a_swap a_swap ++static inline int a_swap(volatile int *x, int v) ++{ ++ __asm__( "xchg %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" ); ++ return v; ++} ++ ++#define a_fetch_add a_fetch_add ++static inline int a_fetch_add(volatile int *x, int v) ++{ ++ __asm__( "lock ; xadd %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" ); ++ return v; ++} ++ ++#define a_inc a_inc ++static inline void a_inc(volatile int *x) ++{ ++ __asm__( "lock ; incl %0" : "=m"(*x) : "m"(*x) : "memory" ); ++} ++ ++#define a_dec a_dec ++static inline void a_dec(volatile int *x) ++{ ++ __asm__( "lock ; decl %0" : "=m"(*x) : "m"(*x) : "memory" ); ++} ++ ++#define a_store a_store ++static inline void 
a_store(volatile int *p, int x) ++{ ++ __asm__( "movl %1, %0 ; lock ; orl $0,(%%esp)" : "=m"(*p) : "r"(x) : "memory" ); ++} ++ ++#define a_spin a_spin ++static inline void a_spin() ++{ ++ __asm__ __volatile__( "pause" : : : "memory" ); ++} ++ ++#define a_barrier a_barrier ++static inline void a_barrier() ++{ ++ __asm__ __volatile__( "" : : : "memory" ); ++} ++ ++#define a_crash a_crash ++static inline void a_crash() ++{ ++ __asm__ __volatile__( "hlt" : : : "memory" ); ++} +--- a/arch/i386/bits/alltypes.h.in ++++ b/arch/i386/bits/alltypes.h.in +@@ -26,10 +26,12 @@ TYPEDEF long double float_t; + TYPEDEF long double double_t; + #endif + +-#ifdef __cplusplus +-TYPEDEF struct { alignas(8) long long __ll; long double __ld; } max_align_t; +-#else ++#if !defined(__cplusplus) + TYPEDEF struct { _Alignas(8) long long __ll; long double __ld; } max_align_t; ++#elif defined(__GNUC__) ++TYPEDEF struct { __attribute__((__aligned__(8))) long long __ll; long double __ld; } max_align_t; ++#else ++TYPEDEF struct { alignas(8) long long __ll; long double __ld; } max_align_t; + #endif + + TYPEDEF long time_t; +--- a/arch/i386/pthread_arch.h ++++ b/arch/i386/pthread_arch.h +@@ -7,4 +7,4 @@ static inline struct pthread *__pthread_ + + #define TP_ADJ(p) (p) + +-#define CANCEL_REG_IP 14 ++#define MC_PC gregs[REG_EIP] +--- a/arch/microblaze/atomic.h ++++ /dev/null +@@ -1,143 +0,0 @@ +-#ifndef _INTERNAL_ATOMIC_H +-#define _INTERNAL_ATOMIC_H +- +-#include <stdint.h> +- +-static inline int a_ctz_l(unsigned long x) +-{ +- static const char debruijn32[32] = { +- 0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13, +- 31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14 +- }; +- return debruijn32[(x&-x)*0x076be629 >> 27]; +-} +- +-static inline int a_ctz_64(uint64_t x) +-{ +- uint32_t y = x; +- if (!y) { +- y = x>>32; +- return 32 + a_ctz_l(y); +- } +- return a_ctz_l(y); +-} +- +-static inline int a_cas(volatile int *p, int t, int s) +-{ +- register int old, tmp; +- __asm__ __volatile__ 
( +- " addi %0, r0, 0\n" +- "1: lwx %0, %2, r0\n" +- " rsubk %1, %0, %3\n" +- " bnei %1, 1f\n" +- " swx %4, %2, r0\n" +- " addic %1, r0, 0\n" +- " bnei %1, 1b\n" +- "1: " +- : "=&r"(old), "=&r"(tmp) +- : "r"(p), "r"(t), "r"(s) +- : "cc", "memory" ); +- return old; +-} +- +-static inline void *a_cas_p(volatile void *p, void *t, void *s) +-{ +- return (void *)a_cas(p, (int)t, (int)s); +-} +- +-static inline int a_swap(volatile int *x, int v) +-{ +- register int old, tmp; +- __asm__ __volatile__ ( +- " addi %0, r0, 0\n" +- "1: lwx %0, %2, r0\n" +- " swx %3, %2, r0\n" +- " addic %1, r0, 0\n" +- " bnei %1, 1b\n" +- "1: " +- : "=&r"(old), "=&r"(tmp) +- : "r"(x), "r"(v) +- : "cc", "memory" ); +- return old; +-} +- +-static inline int a_fetch_add(volatile int *x, int v) +-{ +- register int new, tmp; +- __asm__ __volatile__ ( +- " addi %0, r0, 0\n" +- "1: lwx %0, %2, r0\n" +- " addk %0, %0, %3\n" +- " swx %0, %2, r0\n" +- " addic %1, r0, 0\n" +- " bnei %1, 1b\n" +- "1: " +- : "=&r"(new), "=&r"(tmp) +- : "r"(x), "r"(v) +- : "cc", "memory" ); +- return new-v; +-} +- +-static inline void a_inc(volatile int *x) +-{ +- a_fetch_add(x, 1); +-} +- +-static inline void a_dec(volatile int *x) +-{ +- a_fetch_add(x, -1); +-} +- +-static inline void a_store(volatile int *p, int x) +-{ +- __asm__ __volatile__ ( +- "swi %1, %0" +- : "=m"(*p) : "r"(x) : "memory" ); +-} +- +-#define a_spin a_barrier +- +-static inline void a_barrier() +-{ +- a_cas(&(int){0}, 0, 0); +-} +- +-static inline void a_crash() +-{ +- *(volatile char *)0=0; +-} +- +-static inline void a_and(volatile int *p, int v) +-{ +- int old; +- do old = *p; +- while (a_cas(p, old, old&v) != old); +-} +- +-static inline void a_or(volatile int *p, int v) +-{ +- int old; +- do old = *p; +- while (a_cas(p, old, old|v) != old); +-} +- +-static inline void a_or_l(volatile void *p, long v) +-{ +- a_or(p, v); +-} +- +-static inline void a_and_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- 
a_and((int *)p, u.r[0]); +- a_and((int *)p+1, u.r[1]); +-} +- +-static inline void a_or_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- a_or((int *)p, u.r[0]); +- a_or((int *)p+1, u.r[1]); +-} +- +-#endif +--- /dev/null ++++ b/arch/microblaze/atomic_arch.h +@@ -0,0 +1,53 @@ ++#define a_cas a_cas ++static inline int a_cas(volatile int *p, int t, int s) ++{ ++ register int old, tmp; ++ __asm__ __volatile__ ( ++ " addi %0, r0, 0\n" ++ "1: lwx %0, %2, r0\n" ++ " rsubk %1, %0, %3\n" ++ " bnei %1, 1f\n" ++ " swx %4, %2, r0\n" ++ " addic %1, r0, 0\n" ++ " bnei %1, 1b\n" ++ "1: " ++ : "=&r"(old), "=&r"(tmp) ++ : "r"(p), "r"(t), "r"(s) ++ : "cc", "memory" ); ++ return old; ++} ++ ++#define a_swap a_swap ++static inline int a_swap(volatile int *x, int v) ++{ ++ register int old, tmp; ++ __asm__ __volatile__ ( ++ " addi %0, r0, 0\n" ++ "1: lwx %0, %2, r0\n" ++ " swx %3, %2, r0\n" ++ " addic %1, r0, 0\n" ++ " bnei %1, 1b\n" ++ "1: " ++ : "=&r"(old), "=&r"(tmp) ++ : "r"(x), "r"(v) ++ : "cc", "memory" ); ++ return old; ++} ++ ++#define a_fetch_add a_fetch_add ++static inline int a_fetch_add(volatile int *x, int v) ++{ ++ register int new, tmp; ++ __asm__ __volatile__ ( ++ " addi %0, r0, 0\n" ++ "1: lwx %0, %2, r0\n" ++ " addk %0, %0, %3\n" ++ " swx %0, %2, r0\n" ++ " addic %1, r0, 0\n" ++ " bnei %1, 1b\n" ++ "1: " ++ : "=&r"(new), "=&r"(tmp) ++ : "r"(x), "r"(v) ++ : "cc", "memory" ); ++ return new-v; ++} +--- a/arch/microblaze/pthread_arch.h ++++ b/arch/microblaze/pthread_arch.h +@@ -7,4 +7,4 @@ static inline struct pthread *__pthread_ + + #define TP_ADJ(p) (p) + +-#define CANCEL_REG_IP 32 ++#define MC_PC regs.pc +--- a/arch/mips/atomic.h ++++ /dev/null +@@ -1,205 +0,0 @@ +-#ifndef _INTERNAL_ATOMIC_H +-#define _INTERNAL_ATOMIC_H +- +-#include <stdint.h> +- +-static inline int a_ctz_l(unsigned long x) +-{ +- static const char debruijn32[32] = { +- 0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13, +- 31, 22, 28, 18, 26, 10, 7, 
12, 21, 17, 9, 6, 16, 5, 15, 14 +- }; +- return debruijn32[(x&-x)*0x076be629 >> 27]; +-} +- +-static inline int a_ctz_64(uint64_t x) +-{ +- uint32_t y = x; +- if (!y) { +- y = x>>32; +- return 32 + a_ctz_l(y); +- } +- return a_ctz_l(y); +-} +- +-static inline int a_cas(volatile int *p, int t, int s) +-{ +- int dummy; +- __asm__ __volatile__( +- ".set push\n" +- ".set mips2\n" +- ".set noreorder\n" +- " sync\n" +- "1: ll %0, %2\n" +- " bne %0, %3, 1f\n" +- " addu %1, %4, $0\n" +- " sc %1, %2\n" +- " beq %1, $0, 1b\n" +- " nop\n" +- " sync\n" +- "1: \n" +- ".set pop\n" +- : "=&r"(t), "=&r"(dummy), "+m"(*p) : "r"(t), "r"(s) : "memory" ); +- return t; +-} +- +-static inline void *a_cas_p(volatile void *p, void *t, void *s) +-{ +- return (void *)a_cas(p, (int)t, (int)s); +-} +- +-static inline int a_swap(volatile int *x, int v) +-{ +- int old, dummy; +- __asm__ __volatile__( +- ".set push\n" +- ".set mips2\n" +- ".set noreorder\n" +- " sync\n" +- "1: ll %0, %2\n" +- " addu %1, %3, $0\n" +- " sc %1, %2\n" +- " beq %1, $0, 1b\n" +- " nop\n" +- " sync\n" +- ".set pop\n" +- : "=&r"(old), "=&r"(dummy), "+m"(*x) : "r"(v) : "memory" ); +- return old; +-} +- +-static inline int a_fetch_add(volatile int *x, int v) +-{ +- int old, dummy; +- __asm__ __volatile__( +- ".set push\n" +- ".set mips2\n" +- ".set noreorder\n" +- " sync\n" +- "1: ll %0, %2\n" +- " addu %1, %0, %3\n" +- " sc %1, %2\n" +- " beq %1, $0, 1b\n" +- " nop\n" +- " sync\n" +- ".set pop\n" +- : "=&r"(old), "=&r"(dummy), "+m"(*x) : "r"(v) : "memory" ); +- return old; +-} +- +-static inline void a_inc(volatile int *x) +-{ +- int dummy; +- __asm__ __volatile__( +- ".set push\n" +- ".set mips2\n" +- ".set noreorder\n" +- " sync\n" +- "1: ll %0, %1\n" +- " addu %0, %0, 1\n" +- " sc %0, %1\n" +- " beq %0, $0, 1b\n" +- " nop\n" +- " sync\n" +- ".set pop\n" +- : "=&r"(dummy), "+m"(*x) : : "memory" ); +-} +- +-static inline void a_dec(volatile int *x) +-{ +- int dummy; +- __asm__ __volatile__( +- ".set push\n" +- ".set 
mips2\n" +- ".set noreorder\n" +- " sync\n" +- "1: ll %0, %1\n" +- " subu %0, %0, 1\n" +- " sc %0, %1\n" +- " beq %0, $0, 1b\n" +- " nop\n" +- " sync\n" +- ".set pop\n" +- : "=&r"(dummy), "+m"(*x) : : "memory" ); +-} +- +-static inline void a_store(volatile int *p, int x) +-{ +- __asm__ __volatile__( +- ".set push\n" +- ".set mips2\n" +- ".set noreorder\n" +- " sync\n" +- " sw %1, %0\n" +- " sync\n" +- ".set pop\n" +- : "+m"(*p) : "r"(x) : "memory" ); +-} +- +-#define a_spin a_barrier +- +-static inline void a_barrier() +-{ +- a_cas(&(int){0}, 0, 0); +-} +- +-static inline void a_crash() +-{ +- *(volatile char *)0=0; +-} +- +-static inline void a_and(volatile int *p, int v) +-{ +- int dummy; +- __asm__ __volatile__( +- ".set push\n" +- ".set mips2\n" +- ".set noreorder\n" +- " sync\n" +- "1: ll %0, %1\n" +- " and %0, %0, %2\n" +- " sc %0, %1\n" +- " beq %0, $0, 1b\n" +- " nop\n" +- " sync\n" +- ".set pop\n" +- : "=&r"(dummy), "+m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline void a_or(volatile int *p, int v) +-{ +- int dummy; +- __asm__ __volatile__( +- ".set push\n" +- ".set mips2\n" +- ".set noreorder\n" +- " sync\n" +- "1: ll %0, %1\n" +- " or %0, %0, %2\n" +- " sc %0, %1\n" +- " beq %0, $0, 1b\n" +- " nop\n" +- " sync\n" +- ".set pop\n" +- : "=&r"(dummy), "+m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline void a_or_l(volatile void *p, long v) +-{ +- a_or(p, v); +-} +- +-static inline void a_and_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- a_and((int *)p, u.r[0]); +- a_and((int *)p+1, u.r[1]); +-} +- +-static inline void a_or_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- a_or((int *)p, u.r[0]); +- a_or((int *)p+1, u.r[1]); +-} +- +-#endif +--- /dev/null ++++ b/arch/mips/atomic_arch.h +@@ -0,0 +1,39 @@ ++#define a_ll a_ll ++static inline int a_ll(volatile int *p) ++{ ++ int v; ++ __asm__ __volatile__ ( ++ ".set push ; .set mips2\n\t" ++ "ll %0, %1" ++ 
"\n\t.set pop" ++ : "=r"(v) : "m"(*p)); ++ return v; ++} ++ ++#define a_sc a_sc ++static inline int a_sc(volatile int *p, int v) ++{ ++ int r; ++ __asm__ __volatile__ ( ++ ".set push ; .set mips2\n\t" ++ "sc %0, %1" ++ "\n\t.set pop" ++ : "=r"(r), "=m"(*p) : "0"(v) : "memory"); ++ return r; ++} ++ ++#define a_barrier a_barrier ++static inline void a_barrier() ++{ ++ /* mips2 sync, but using too many directives causes ++ * gcc not to inline it, so encode with .long instead. */ ++ __asm__ __volatile__ (".long 0xf" : : : "memory"); ++#if 0 ++ __asm__ __volatile__ ( ++ ".set push ; .set mips2 ; sync ; .set pop" ++ : : : "memory"); ++#endif ++} ++ ++#define a_pre_llsc a_barrier ++#define a_post_llsc a_barrier +--- a/arch/mips/crt_arch.h ++++ b/arch/mips/crt_arch.h +@@ -4,13 +4,16 @@ __asm__( + ".text \n" + ".global _" START "\n" + ".global " START "\n" ++".global " START "_data\n" + ".type _" START ", @function\n" + ".type " START ", @function\n" ++".type " START "_data, @function\n" + "_" START ":\n" + "" START ":\n" + " bal 1f \n" + " move $fp, $0 \n" +-"2: .gpword 2b \n" ++"" START "_data: \n" ++" .gpword " START "_data \n" + " .gpword " START "_c \n" + ".weak _DYNAMIC \n" + ".hidden _DYNAMIC \n" +--- a/arch/mips/pthread_arch.h ++++ b/arch/mips/pthread_arch.h +@@ -16,4 +16,4 @@ static inline struct pthread *__pthread_ + + #define DTP_OFFSET 0x8000 + +-#define CANCEL_REG_IP (3-(union {int __i; char __b;}){1}.__b) ++#define MC_PC pc +--- a/arch/mips/syscall_arch.h ++++ b/arch/mips/syscall_arch.h +@@ -3,9 +3,7 @@ + ((union { long long ll; long l[2]; }){ .ll = x }).l[1] + #define __SYSCALL_LL_O(x) 0, __SYSCALL_LL_E((x)) + +-#ifdef SHARED + __attribute__((visibility("hidden"))) +-#endif + long (__syscall)(long, ...); + + #define SYSCALL_RLIM_INFINITY (-1UL/2) +--- a/arch/or1k/atomic.h ++++ /dev/null +@@ -1,120 +0,0 @@ +-#ifndef _INTERNAL_ATOMIC_H +-#define _INTERNAL_ATOMIC_H +- +-#include <stdint.h> +- +-static inline int a_ctz_l(unsigned long x) +-{ +- static const char 
debruijn32[32] = { +- 0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13, +- 31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14 +- }; +- return debruijn32[(x&-x)*0x076be629 >> 27]; +-} +- +-static inline int a_ctz_64(uint64_t x) +-{ +- uint32_t y = x; +- if (!y) { +- y = x>>32; +- return 32 + a_ctz_l(y); +- } +- return a_ctz_l(y); +-} +- +-static inline int a_cas(volatile int *p, int t, int s) +-{ +- __asm__("1: l.lwa %0, %1\n" +- " l.sfeq %0, %2\n" +- " l.bnf 1f\n" +- " l.nop\n" +- " l.swa %1, %3\n" +- " l.bnf 1b\n" +- " l.nop\n" +- "1: \n" +- : "=&r"(t), "+m"(*p) : "r"(t), "r"(s) : "cc", "memory" ); +- return t; +-} +- +-static inline void *a_cas_p(volatile void *p, void *t, void *s) +-{ +- return (void *)a_cas(p, (int)t, (int)s); +-} +- +-static inline int a_swap(volatile int *x, int v) +-{ +- int old; +- do old = *x; +- while (a_cas(x, old, v) != old); +- return old; +-} +- +-static inline int a_fetch_add(volatile int *x, int v) +-{ +- int old; +- do old = *x; +- while (a_cas(x, old, old+v) != old); +- return old; +-} +- +-static inline void a_inc(volatile int *x) +-{ +- a_fetch_add(x, 1); +-} +- +-static inline void a_dec(volatile int *x) +-{ +- a_fetch_add(x, -1); +-} +- +-static inline void a_store(volatile int *p, int x) +-{ +- a_swap(p, x); +-} +- +-#define a_spin a_barrier +- +-static inline void a_barrier() +-{ +- a_cas(&(int){0}, 0, 0); +-} +- +-static inline void a_crash() +-{ +- *(volatile char *)0=0; +-} +- +-static inline void a_and(volatile int *p, int v) +-{ +- int old; +- do old = *p; +- while (a_cas(p, old, old&v) != old); +-} +- +-static inline void a_or(volatile int *p, int v) +-{ +- int old; +- do old = *p; +- while (a_cas(p, old, old|v) != old); +-} +- +-static inline void a_or_l(volatile void *p, long v) +-{ +- a_or(p, v); +-} +- +-static inline void a_and_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- a_and((int *)p, u.r[0]); +- a_and((int *)p+1, u.r[1]); +-} +- +-static inline 
void a_or_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- a_or((int *)p, u.r[0]); +- a_or((int *)p+1, u.r[1]); +-} +- +-#endif +--- /dev/null ++++ b/arch/or1k/atomic_arch.h +@@ -0,0 +1,14 @@ ++#define a_cas a_cas ++static inline int a_cas(volatile int *p, int t, int s) ++{ ++ __asm__("1: l.lwa %0, %1\n" ++ " l.sfeq %0, %2\n" ++ " l.bnf 1f\n" ++ " l.nop\n" ++ " l.swa %1, %3\n" ++ " l.bnf 1b\n" ++ " l.nop\n" ++ "1: \n" ++ : "=&r"(t), "+m"(*p) : "r"(t), "r"(s) : "cc", "memory" ); ++ return t; ++} +--- a/arch/or1k/pthread_arch.h ++++ b/arch/or1k/pthread_arch.h +@@ -14,5 +14,4 @@ static inline struct pthread *__pthread_ + #define TLS_ABOVE_TP + #define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread)) + +-/* word-offset to 'pc' in mcontext_t */ +-#define CANCEL_REG_IP 32 ++#define MC_PC regs.pc +--- a/arch/powerpc/atomic.h ++++ /dev/null +@@ -1,126 +0,0 @@ +-#ifndef _INTERNAL_ATOMIC_H +-#define _INTERNAL_ATOMIC_H +- +-#include <stdint.h> +-#include <endian.h> +- +-static inline int a_ctz_l(unsigned long x) +-{ +- static const char debruijn32[32] = { +- 0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13, +- 31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14 +- }; +- return debruijn32[(x&-x)*0x076be629 >> 27]; +-} +- +-static inline int a_ctz_64(uint64_t x) +-{ +- uint32_t y = x; +- if (!y) { +- y = x>>32; +- return 32 + a_ctz_l(y); +- } +- return a_ctz_l(y); +-} +- +-static inline int a_cas(volatile int *p, int t, int s) +-{ +- __asm__("\n" +- " sync\n" +- "1: lwarx %0, 0, %4\n" +- " cmpw %0, %2\n" +- " bne 1f\n" +- " stwcx. 
%3, 0, %4\n" +- " bne- 1b\n" +- " isync\n" +- "1: \n" +- : "=&r"(t), "+m"(*p) : "r"(t), "r"(s), "r"(p) : "cc", "memory" ); +- return t; +-} +- +-static inline void *a_cas_p(volatile void *p, void *t, void *s) +-{ +- return (void *)a_cas(p, (int)t, (int)s); +-} +- +-static inline int a_swap(volatile int *x, int v) +-{ +- int old; +- do old = *x; +- while (a_cas(x, old, v) != old); +- return old; +-} +- +-static inline int a_fetch_add(volatile int *x, int v) +-{ +- int old; +- do old = *x; +- while (a_cas(x, old, old+v) != old); +- return old; +-} +- +-static inline void a_inc(volatile int *x) +-{ +- a_fetch_add(x, 1); +-} +- +-static inline void a_dec(volatile int *x) +-{ +- a_fetch_add(x, -1); +-} +- +-static inline void a_store(volatile int *p, int x) +-{ +- __asm__ __volatile__ ("\n" +- " sync\n" +- " stw %1, %0\n" +- " isync\n" +- : "=m"(*p) : "r"(x) : "memory" ); +-} +- +-#define a_spin a_barrier +- +-static inline void a_barrier() +-{ +- a_cas(&(int){0}, 0, 0); +-} +- +-static inline void a_crash() +-{ +- *(volatile char *)0=0; +-} +- +-static inline void a_and(volatile int *p, int v) +-{ +- int old; +- do old = *p; +- while (a_cas(p, old, old&v) != old); +-} +- +-static inline void a_or(volatile int *p, int v) +-{ +- int old; +- do old = *p; +- while (a_cas(p, old, old|v) != old); +-} +- +-static inline void a_or_l(volatile void *p, long v) +-{ +- a_or(p, v); +-} +- +-static inline void a_and_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- a_and((int *)p, u.r[0]); +- a_and((int *)p+1, u.r[1]); +-} +- +-static inline void a_or_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- a_or((int *)p, u.r[0]); +- a_or((int *)p+1, u.r[1]); +-} +- +-#endif +--- /dev/null ++++ b/arch/powerpc/atomic_arch.h +@@ -0,0 +1,15 @@ ++#define a_cas a_cas ++static inline int a_cas(volatile int *p, int t, int s) ++{ ++ __asm__("\n" ++ " sync\n" ++ "1: lwarx %0, 0, %4\n" ++ " cmpw %0, %2\n" ++ " 
bne 1f\n" ++ " stwcx. %3, 0, %4\n" ++ " bne- 1b\n" ++ " isync\n" ++ "1: \n" ++ : "=&r"(t), "+m"(*p) : "r"(t), "r"(s), "r"(p) : "cc", "memory" ); ++ return t; ++} +--- a/arch/powerpc/pthread_arch.h ++++ b/arch/powerpc/pthread_arch.h +@@ -15,9 +15,8 @@ static inline struct pthread *__pthread_ + + #define DTP_OFFSET 0x8000 + +-// offset of the PC register in mcontext_t, divided by the system wordsize + // the kernel calls the ip "nip", it's the first saved value after the 32 + // GPRs. +-#define CANCEL_REG_IP 32 ++#define MC_PC gregs[32] + + #define CANARY canary_at_end +--- a/arch/sh/atomic.h ++++ /dev/null +@@ -1,168 +0,0 @@ +-#ifndef _INTERNAL_ATOMIC_H +-#define _INTERNAL_ATOMIC_H +- +-#include <stdint.h> +- +-static inline int a_ctz_l(unsigned long x) +-{ +- static const char debruijn32[32] = { +- 0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13, +- 31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14 +- }; +- return debruijn32[(x&-x)*0x076be629 >> 27]; +-} +- +-static inline int a_ctz_64(uint64_t x) +-{ +- uint32_t y = x; +- if (!y) { +- y = x>>32; +- return 32 + a_ctz_l(y); +- } +- return a_ctz_l(y); +-} +- +-#define LLSC_CLOBBERS "r0", "t", "memory" +-#define LLSC_START(mem) "synco\n" \ +- "0: movli.l @" mem ", r0\n" +-#define LLSC_END(mem) \ +- "1: movco.l r0, @" mem "\n" \ +- " bf 0b\n" \ +- " synco\n" +- +-static inline int __sh_cas_llsc(volatile int *p, int t, int s) +-{ +- int old; +- __asm__ __volatile__( +- LLSC_START("%1") +- " mov r0, %0\n" +- " cmp/eq %0, %2\n" +- " bf 1f\n" +- " mov %3, r0\n" +- LLSC_END("%1") +- : "=&r"(old) : "r"(p), "r"(t), "r"(s) : LLSC_CLOBBERS); +- return old; +-} +- +-static inline int __sh_swap_llsc(volatile int *x, int v) +-{ +- int old; +- __asm__ __volatile__( +- LLSC_START("%1") +- " mov r0, %0\n" +- " mov %2, r0\n" +- LLSC_END("%1") +- : "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS); +- return old; +-} +- +-static inline int __sh_fetch_add_llsc(volatile int *x, int v) +-{ +- int old; +- __asm__ __volatile__( 
+- LLSC_START("%1") +- " mov r0, %0\n" +- " add %2, r0\n" +- LLSC_END("%1") +- : "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS); +- return old; +-} +- +-static inline void __sh_store_llsc(volatile int *p, int x) +-{ +- __asm__ __volatile__( +- " synco\n" +- " mov.l %1, @%0\n" +- " synco\n" +- : : "r"(p), "r"(x) : "memory"); +-} +- +-static inline void __sh_and_llsc(volatile int *x, int v) +-{ +- __asm__ __volatile__( +- LLSC_START("%0") +- " and %1, r0\n" +- LLSC_END("%0") +- : : "r"(x), "r"(v) : LLSC_CLOBBERS); +-} +- +-static inline void __sh_or_llsc(volatile int *x, int v) +-{ +- __asm__ __volatile__( +- LLSC_START("%0") +- " or %1, r0\n" +- LLSC_END("%0") +- : : "r"(x), "r"(v) : LLSC_CLOBBERS); +-} +- +-#ifdef __SH4A__ +-#define a_cas(p,t,s) __sh_cas_llsc(p,t,s) +-#define a_swap(x,v) __sh_swap_llsc(x,v) +-#define a_fetch_add(x,v) __sh_fetch_add_llsc(x, v) +-#define a_store(x,v) __sh_store_llsc(x, v) +-#define a_and(x,v) __sh_and_llsc(x, v) +-#define a_or(x,v) __sh_or_llsc(x, v) +-#else +- +-int __sh_cas(volatile int *, int, int); +-int __sh_swap(volatile int *, int); +-int __sh_fetch_add(volatile int *, int); +-void __sh_store(volatile int *, int); +-void __sh_and(volatile int *, int); +-void __sh_or(volatile int *, int); +- +-#define a_cas(p,t,s) __sh_cas(p,t,s) +-#define a_swap(x,v) __sh_swap(x,v) +-#define a_fetch_add(x,v) __sh_fetch_add(x, v) +-#define a_store(x,v) __sh_store(x, v) +-#define a_and(x,v) __sh_and(x, v) +-#define a_or(x,v) __sh_or(x, v) +-#endif +- +-static inline void *a_cas_p(volatile void *p, void *t, void *s) +-{ +- return (void *)a_cas(p, (int)t, (int)s); +-} +- +-static inline void a_inc(volatile int *x) +-{ +- a_fetch_add(x, 1); +-} +- +-static inline void a_dec(volatile int *x) +-{ +- a_fetch_add(x, -1); +-} +- +-#define a_spin a_barrier +- +-static inline void a_barrier() +-{ +- a_cas(&(int){0}, 0, 0); +-} +- +-static inline void a_crash() +-{ +- *(volatile char *)0=0; +-} +- +-static inline void a_or_l(volatile void *p, long v) +-{ +- 
a_or(p, v); +-} +- +-static inline void a_and_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- a_and((int *)p, u.r[0]); +- a_and((int *)p+1, u.r[1]); +-} +- +-static inline void a_or_64(volatile uint64_t *p, uint64_t v) +-{ +- union { uint64_t v; uint32_t r[2]; } u = { v }; +- a_or((int *)p, u.r[0]); +- a_or((int *)p+1, u.r[1]); +-} +- +-#endif +--- /dev/null ++++ b/arch/sh/atomic_arch.h +@@ -0,0 +1,46 @@ ++#if defined(__SH4A__) ++ ++#define a_ll a_ll ++static inline int a_ll(volatile int *p) ++{ ++ int v; ++ __asm__ __volatile__ ("movli.l @%1, %0" : "=z"(v) : "r"(p), "m"(*p)); ++ return v; ++} ++ ++#define a_sc a_sc ++static inline int a_sc(volatile int *p, int v) ++{ ++ int r; ++ __asm__ __volatile__ ( ++ "movco.l %2, @%3 ; movt %0" ++ : "=r"(r), "=m"(*p) : "z"(v), "r"(p) : "memory", "cc"); ++ return r; ++} ++ ++#define a_barrier a_barrier ++static inline void a_barrier() ++{ ++ __asm__ __volatile__ ("synco" : : : "memory"); /* no operands; "memory" is a clobber */ ++} ++ ++#define a_pre_llsc a_barrier ++#define a_post_llsc a_barrier ++ ++#else ++ ++#define a_cas a_cas ++__attribute__((__visibility__("hidden"))) extern const void *__sh_cas_ptr; ++static inline int a_cas(volatile int *p, int t, int s) ++{ ++ register int r1 __asm__("r1"); ++ register int r2 __asm__("r2") = t; ++ register int r3 __asm__("r3") = s; ++ __asm__ __volatile__ ( ++ "jsr @%4 ; nop" ++ : "=r"(r1), "+r"(r3) : "z"(p), "r"(r2), "r"(__sh_cas_ptr) ++ : "memory", "pr", "cc"); ++ return r3; ++} ++ ++#endif +--- a/arch/sh/crt_arch.h ++++ b/arch/sh/crt_arch.h +@@ -22,7 +22,8 @@ START ": \n" + " mov.l 1f, r5 \n" + " mov.l 1f+4, r6 \n" + " add r0, r5 \n" +-" bsr __fdpic_fixup \n" ++" mov.l 4f, r1 \n" ++"5: bsrf r1 \n" + " add r0, r6 \n" + " mov r0, r12 \n" + #endif +@@ -31,11 +32,16 @@ START ": \n" + " mov.l r9, @-r15 \n" + " mov.l r8, @-r15 \n" + " mov #-16, r0 \n" +-" bsr " START "_c \n" ++" mov.l 2f, r1 \n" ++"3: bsrf r1 \n" + " and r0, r15 \n" + ".align 2 \n" + "1: .long __ROFIXUP_LIST__@PCREL 
\n" + " .long __ROFIXUP_END__@PCREL + 4 \n" ++"2: .long " START "_c@PCREL - (3b+4-.) \n" ++#ifndef SHARED ++"4: .long __fdpic_fixup@PCREL - (5b+4-.) \n" ++#endif + ); + + #ifndef SHARED +@@ -53,13 +59,14 @@ START ": \n" + " add r0, r5 \n" + " mov r15, r4 \n" + " mov #-16, r0 \n" +-" and r0, r15 \n" +-" bsr " START "_c \n" +-" nop \n" ++" mov.l 2f, r1 \n" ++"3: bsrf r1 \n" ++" and r0, r15 \n" + ".align 2 \n" + ".weak _DYNAMIC \n" + ".hidden _DYNAMIC \n" + "1: .long _DYNAMIC-. \n" ++"2: .long " START "_c@PCREL - (3b+4-.) \n" + ); + + #endif +--- a/arch/sh/pthread_arch.h ++++ b/arch/sh/pthread_arch.h +@@ -8,4 +8,4 @@ static inline struct pthread *__pthread_ + #define TLS_ABOVE_TP + #define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) - 8) + +-#define CANCEL_REG_IP 17 ++#define MC_PC sc_pc +--- a/arch/sh/reloc.h ++++ b/arch/sh/reloc.h +@@ -32,6 +32,8 @@ + #define REL_DTPOFF R_SH_TLS_DTPOFF32 + #define REL_TPOFF R_SH_TLS_TPOFF32 + ++#define DL_NOMMU_SUPPORT 1 ++ + #if __SH_FDPIC__ + #define REL_FUNCDESC R_SH_FUNCDESC + #define REL_FUNCDESC_VAL R_SH_FUNCDESC_VALUE +--- a/arch/sh/src/__set_thread_area.c ++++ /dev/null +@@ -1,34 +0,0 @@ +-#include "pthread_impl.h" +-#include "libc.h" +-#include "sh_atomic.h" +-#include <elf.h> +- +-/* Also perform sh-specific init */ +- +-#define CPU_HAS_LLSC 0x0040 +- +-__attribute__((__visibility__("hidden"))) unsigned __sh_atomic_model, __sh_nommu; +- +-int __set_thread_area(void *p) +-{ +- size_t *aux; +- __asm__ __volatile__ ( "ldc %0, gbr" : : "r"(p) : "memory" ); +-#ifndef __SH4A__ +- if (__hwcap & CPU_HAS_LLSC) { +- __sh_atomic_model = SH_A_LLSC; +- return 0; +- } +-#if !defined(__SH3__) && !defined(__SH4__) +- for (aux=libc.auxv; *aux; aux+=2) { +- if (*aux != AT_PLATFORM) continue; +- const char *s = (void *)aux[1]; +- if (s[0]!='s' || s[1]!='h' || s[2]!='2' || s[3]-'0'<10u) break; +- __sh_atomic_model = SH_A_IMASK; +- __sh_nommu = 1; +- return 0; +- } +-#endif +- /* __sh_atomic_model = SH_A_GUSA; */ /* 0, default */ +-#endif 
+- return 0; +-} +--- a/arch/sh/src/atomic.c ++++ /dev/null +@@ -1,158 +0,0 @@ +-#ifndef __SH4A__ +- +-#include "sh_atomic.h" +-#include "atomic.h" +-#include "libc.h" +- +-static inline unsigned mask() +-{ +- unsigned sr; +- __asm__ __volatile__ ( "\n" +- " stc sr,r0 \n" +- " mov r0,%0 \n" +- " or #0xf0,r0 \n" +- " ldc r0,sr \n" +- : "=&r"(sr) : : "memory", "r0" ); +- return sr; +-} +- +-static inline void unmask(unsigned sr) +-{ +- __asm__ __volatile__ ( "ldc %0,sr" : : "r"(sr) : "memory" ); +-} +- +-/* gusa is a hack in the kernel which lets you create a sequence of instructions +- * which will be restarted if the process is preempted in the middle of the +- * sequence. It will do for implementing atomics on non-smp systems. ABI is: +- * r0 = address of first instruction after the atomic sequence +- * r1 = original stack pointer +- * r15 = -1 * length of atomic sequence in bytes +- */ +-#define GUSA_CLOBBERS "r0", "r1", "memory" +-#define GUSA_START(mem,old,nop) \ +- " .align 2\n" \ +- " mova 1f, r0\n" \ +- nop \ +- " mov r15, r1\n" \ +- " mov #(0f-1f), r15\n" \ +- "0: mov.l @" mem ", " old "\n" +-/* the target of mova must be 4 byte aligned, so we may need a nop */ +-#define GUSA_START_ODD(mem,old) GUSA_START(mem,old,"") +-#define GUSA_START_EVEN(mem,old) GUSA_START(mem,old,"\tnop\n") +-#define GUSA_END(mem,new) \ +- " mov.l " new ", @" mem "\n" \ +- "1: mov r1, r15\n" +- +-int __sh_cas(volatile int *p, int t, int s) +-{ +- if (__sh_atomic_model == SH_A_LLSC) return __sh_cas_llsc(p, t, s); +- +- if (__sh_atomic_model == SH_A_IMASK) { +- unsigned sr = mask(); +- int old = *p; +- if (old==t) *p = s; +- unmask(sr); +- return old; +- } +- +- int old; +- __asm__ __volatile__( +- GUSA_START_EVEN("%1", "%0") +- " cmp/eq %0, %2\n" +- " bf 1f\n" +- GUSA_END("%1", "%3") +- : "=&r"(old) : "r"(p), "r"(t), "r"(s) : GUSA_CLOBBERS, "t"); +- return old; +-} +- +-int __sh_swap(volatile int *x, int v) +-{ +- if (__sh_atomic_model == SH_A_LLSC) return __sh_swap_llsc(x, v); +- +- 
if (__sh_atomic_model == SH_A_IMASK) { +- unsigned sr = mask(); +- int old = *x; +- *x = v; +- unmask(sr); +- return old; +- } +- +- int old; +- __asm__ __volatile__( +- GUSA_START_EVEN("%1", "%0") +- GUSA_END("%1", "%2") +- : "=&r"(old) : "r"(x), "r"(v) : GUSA_CLOBBERS); +- return old; +-} +- +-int __sh_fetch_add(volatile int *x, int v) +-{ +- if (__sh_atomic_model == SH_A_LLSC) return __sh_fetch_add_llsc(x, v); +- +- if (__sh_atomic_model == SH_A_IMASK) { +- unsigned sr = mask(); +- int old = *x; +- *x = old + v; +- unmask(sr); +- return old; +- } +- +- int old, dummy; +- __asm__ __volatile__( +- GUSA_START_EVEN("%2", "%0") +- " mov %0, %1\n" +- " add %3, %1\n" +- GUSA_END("%2", "%1") +- : "=&r"(old), "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS); +- return old; +-} +- +-void __sh_store(volatile int *p, int x) +-{ +- if (__sh_atomic_model == SH_A_LLSC) return __sh_store_llsc(p, x); +- __asm__ __volatile__( +- " mov.l %1, @%0\n" +- : : "r"(p), "r"(x) : "memory"); +-} +- +-void __sh_and(volatile int *x, int v) +-{ +- if (__sh_atomic_model == SH_A_LLSC) return __sh_and_llsc(x, v); +- +- if (__sh_atomic_model == SH_A_IMASK) { +- unsigned sr = mask(); +- int old = *x; +- *x = old & v; +- unmask(sr); +- return; +- } +- +- int dummy; +- __asm__ __volatile__( +- GUSA_START_ODD("%1", "%0") +- " and %2, %0\n" +- GUSA_END("%1", "%0") +- : "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS); +-} +- +-void __sh_or(volatile int *x, int v) +-{ +- if (__sh_atomic_model == SH_A_LLSC) return __sh_or_llsc(x, v); +- +- if (__sh_atomic_model == SH_A_IMASK) { +- unsigned sr = mask(); +- int old = *x; +- *x = old | v; +- unmask(sr); +- return; +- } +- +- int dummy; +- __asm__ __volatile__( +- GUSA_START_ODD("%1", "%0") +- " or %2, %0\n" +- GUSA_END("%1", "%0") +- : "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS); +-} +- +-#endif +--- a/arch/sh/src/sh_atomic.h ++++ /dev/null +@@ -1,15 +0,0 @@ +-#ifndef _SH_ATOMIC_H +-#define _SH_ATOMIC_H +- +-#define SH_A_GUSA 0 +-#define SH_A_LLSC 1 
+-#define SH_A_CAS 2 +-#if !defined(__SH3__) && !defined(__SH4__) +-#define SH_A_IMASK 3 +-#else +-#define SH_A_IMASK -1LL /* unmatchable by unsigned int */ +-#endif +- +-extern __attribute__((__visibility__("hidden"))) unsigned __sh_atomic_model; +- +-#endif +--- a/arch/x32/atomic.h ++++ /dev/null +@@ -1,105 +0,0 @@ +-#ifndef _INTERNAL_ATOMIC_H +-#define _INTERNAL_ATOMIC_H +- +-#include <stdint.h> +- +-static inline int a_ctz_64(uint64_t x) +-{ +- __asm__( "bsf %1,%0" : "=r"(x) : "r"(x) ); +- return x; +-} +- +-static inline int a_ctz_l(unsigned long x) +-{ +- __asm__( "bsf %1,%0" : "=r"(x) : "r"(x) ); +- return x; +-} +- +-static inline void a_and_64(volatile uint64_t *p, uint64_t v) +-{ +- __asm__( "lock ; and %1, %0" +- : "=m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline void a_or_64(volatile uint64_t *p, uint64_t v) +-{ +- __asm__( "lock ; or %1, %0" +- : "=m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline void a_or_l(volatile void *p, long v) +-{ +- __asm__( "lock ; or %1, %0" +- : "=m"(*(long *)p) : "r"(v) : "memory" ); +-} +- +-static inline void *a_cas_p(volatile void *p, void *t, void *s) +-{ +- __asm__( "lock ; cmpxchg %3, %1" +- : "=a"(t), "=m"(*(long *)p) : "a"(t), "r"(s) : "memory" ); +- return t; +-} +- +-static inline int a_cas(volatile int *p, int t, int s) +-{ +- __asm__( "lock ; cmpxchg %3, %1" +- : "=a"(t), "=m"(*p) : "a"(t), "r"(s) : "memory" ); +- return t; +-} +- +-static inline void a_or(volatile int *p, int v) +-{ +- __asm__( "lock ; or %1, %0" +- : "=m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline void a_and(volatile int *p, int v) +-{ +- __asm__( "lock ; and %1, %0" +- : "=m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline int a_swap(volatile int *x, int v) +-{ +- __asm__( "xchg %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" ); +- return v; +-} +- +-static inline int a_fetch_add(volatile int *x, int v) +-{ +- __asm__( "lock ; xadd %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" ); +- return v; +-} +- +-static inline void 
a_inc(volatile int *x) +-{ +- __asm__( "lock ; incl %0" : "=m"(*x) : "m"(*x) : "memory" ); +-} +- +-static inline void a_dec(volatile int *x) +-{ +- __asm__( "lock ; decl %0" : "=m"(*x) : "m"(*x) : "memory" ); +-} +- +-static inline void a_store(volatile int *p, int x) +-{ +- __asm__( "mov %1, %0 ; lock ; orl $0,(%%rsp)" : "=m"(*p) : "r"(x) : "memory" ); +-} +- +-static inline void a_spin() +-{ +- __asm__ __volatile__( "pause" : : : "memory" ); +-} +- +-static inline void a_barrier() +-{ +- __asm__ __volatile__( "" : : : "memory" ); +-} +- +-static inline void a_crash() +-{ +- __asm__ __volatile__( "hlt" : : : "memory" ); +-} +- +- +-#endif +--- /dev/null ++++ b/arch/x32/atomic_arch.h +@@ -0,0 +1,106 @@ ++#define a_ctz_64 a_ctz_64 ++static inline int a_ctz_64(uint64_t x) ++{ ++ __asm__( "bsf %1,%0" : "=r"(x) : "r"(x) ); ++ return x; ++} ++ ++#define a_ctz_l a_ctz_l ++static inline int a_ctz_l(unsigned long x) ++{ ++ __asm__( "bsf %1,%0" : "=r"(x) : "r"(x) ); ++ return x; ++} ++ ++#define a_and_64 a_and_64 ++static inline void a_and_64(volatile uint64_t *p, uint64_t v) ++{ ++ __asm__( "lock ; and %1, %0" ++ : "=m"(*p) : "r"(v) : "memory" ); ++} ++ ++#define a_or_64 a_or_64 ++static inline void a_or_64(volatile uint64_t *p, uint64_t v) ++{ ++ __asm__( "lock ; or %1, %0" ++ : "=m"(*p) : "r"(v) : "memory" ); ++} ++ ++#define a_or_l a_or_l ++static inline void a_or_l(volatile void *p, long v) ++{ ++ __asm__( "lock ; or %1, %0" ++ : "=m"(*(long *)p) : "r"(v) : "memory" ); ++} ++ ++#define a_cas a_cas ++static inline int a_cas(volatile int *p, int t, int s) ++{ ++ __asm__( "lock ; cmpxchg %3, %1" ++ : "=a"(t), "=m"(*p) : "a"(t), "r"(s) : "memory" ); ++ return t; ++} ++ ++#define a_or a_or ++static inline void a_or(volatile int *p, int v) ++{ ++ __asm__( "lock ; or %1, %0" ++ : "=m"(*p) : "r"(v) : "memory" ); ++} ++ ++#define a_and a_and ++static inline void a_and(volatile int *p, int v) ++{ ++ __asm__( "lock ; and %1, %0" ++ : "=m"(*p) : "r"(v) : "memory" ); ++} ++ 
++#define a_swap a_swap ++static inline int a_swap(volatile int *x, int v) ++{ ++ __asm__( "xchg %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" ); ++ return v; ++} ++ ++#define a_fetch_add a_fetch_add ++static inline int a_fetch_add(volatile int *x, int v) ++{ ++ __asm__( "lock ; xadd %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" ); ++ return v; ++} ++ ++#define a_inc a_inc ++static inline void a_inc(volatile int *x) ++{ ++ __asm__( "lock ; incl %0" : "=m"(*x) : "m"(*x) : "memory" ); ++} ++ ++#define a_dec a_dec ++static inline void a_dec(volatile int *x) ++{ ++ __asm__( "lock ; decl %0" : "=m"(*x) : "m"(*x) : "memory" ); ++} ++ ++#define a_store a_store ++static inline void a_store(volatile int *p, int x) ++{ ++ __asm__( "mov %1, %0 ; lock ; orl $0,(%%rsp)" : "=m"(*p) : "r"(x) : "memory" ); ++} ++ ++#define a_spin a_spin ++static inline void a_spin() ++{ ++ __asm__ __volatile__( "pause" : : : "memory" ); ++} ++ ++#define a_barrier a_barrier ++static inline void a_barrier() ++{ ++ __asm__ __volatile__( "" : : : "memory" ); ++} ++ ++#define a_crash a_crash ++static inline void a_crash() ++{ ++ __asm__ __volatile__( "hlt" : : : "memory" ); ++} +--- a/arch/x32/pthread_arch.h ++++ b/arch/x32/pthread_arch.h +@@ -7,6 +7,6 @@ static inline struct pthread *__pthread_ + + #define TP_ADJ(p) (p) + +-#define CANCEL_REG_IP 32 ++#define MC_PC gregs[REG_RIP] + + #define CANARY canary2 +--- a/arch/x32/src/syscall_cp_fixup.c ++++ b/arch/x32/src/syscall_cp_fixup.c +@@ -1,8 +1,6 @@ + #include <sys/syscall.h> + +-#ifdef SHARED + __attribute__((__visibility__("hidden"))) +-#endif + long __syscall_cp_internal(volatile void*, long long, long long, long long, long long, + long long, long long, long long); + +@@ -14,9 +12,7 @@ struct __timespec_kernel { long long tv_ + ts->tv_nsec = __tsc(X)->tv_nsec; \ + (X) = (unsigned long)ts; } } while(0) + +-#ifdef SHARED + __attribute__((__visibility__("hidden"))) +-#endif + long __syscall_cp_asm (volatile void * foo, long long n, long long a1, 
long long a2, long long a3, + long long a4, long long a5, long long a6) + { +--- a/arch/x86_64/atomic.h ++++ /dev/null +@@ -1,105 +0,0 @@ +-#ifndef _INTERNAL_ATOMIC_H +-#define _INTERNAL_ATOMIC_H +- +-#include <stdint.h> +- +-static inline int a_ctz_64(uint64_t x) +-{ +- __asm__( "bsf %1,%0" : "=r"(x) : "r"(x) ); +- return x; +-} +- +-static inline int a_ctz_l(unsigned long x) +-{ +- __asm__( "bsf %1,%0" : "=r"(x) : "r"(x) ); +- return x; +-} +- +-static inline void a_and_64(volatile uint64_t *p, uint64_t v) +-{ +- __asm__( "lock ; and %1, %0" +- : "=m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline void a_or_64(volatile uint64_t *p, uint64_t v) +-{ +- __asm__( "lock ; or %1, %0" +- : "=m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline void a_or_l(volatile void *p, long v) +-{ +- __asm__( "lock ; or %1, %0" +- : "=m"(*(long *)p) : "r"(v) : "memory" ); +-} +- +-static inline void *a_cas_p(volatile void *p, void *t, void *s) +-{ +- __asm__( "lock ; cmpxchg %3, %1" +- : "=a"(t), "=m"(*(long *)p) : "a"(t), "r"(s) : "memory" ); +- return t; +-} +- +-static inline int a_cas(volatile int *p, int t, int s) +-{ +- __asm__( "lock ; cmpxchg %3, %1" +- : "=a"(t), "=m"(*p) : "a"(t), "r"(s) : "memory" ); +- return t; +-} +- +-static inline void a_or(volatile int *p, int v) +-{ +- __asm__( "lock ; or %1, %0" +- : "=m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline void a_and(volatile int *p, int v) +-{ +- __asm__( "lock ; and %1, %0" +- : "=m"(*p) : "r"(v) : "memory" ); +-} +- +-static inline int a_swap(volatile int *x, int v) +-{ +- __asm__( "xchg %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" ); +- return v; +-} +- +-static inline int a_fetch_add(volatile int *x, int v) +-{ +- __asm__( "lock ; xadd %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" ); +- return v; +-} +- +-static inline void a_inc(volatile int *x) +-{ +- __asm__( "lock ; incl %0" : "=m"(*x) : "m"(*x) : "memory" ); +-} +- +-static inline void a_dec(volatile int *x) +-{ +- __asm__( "lock ; decl %0" : 
"=m"(*x) : "m"(*x) : "memory" ); +-} +- +-static inline void a_store(volatile int *p, int x) +-{ +- __asm__( "mov %1, %0 ; lock ; orl $0,(%%rsp)" : "=m"(*p) : "r"(x) : "memory" ); +-} +- +-static inline void a_spin() +-{ +- __asm__ __volatile__( "pause" : : : "memory" ); +-} +- +-static inline void a_barrier() +-{ +- __asm__ __volatile__( "" : : : "memory" ); +-} +- +-static inline void a_crash() +-{ +- __asm__ __volatile__( "hlt" : : : "memory" ); +-} +- +- +-#endif +--- /dev/null ++++ b/arch/x86_64/atomic_arch.h +@@ -0,0 +1,107 @@ ++#define a_ctz_64 a_ctz_64 ++static inline int a_ctz_64(uint64_t x) ++{ ++ __asm__( "bsf %1,%0" : "=r"(x) : "r"(x) ); ++ return x; ++} ++ ++#define a_and_64 a_and_64 ++static inline void a_and_64(volatile uint64_t *p, uint64_t v) ++{ ++ __asm__( "lock ; and %1, %0" ++ : "=m"(*p) : "r"(v) : "memory" ); ++} ++ ++#define a_or_64 a_or_64 ++static inline void a_or_64(volatile uint64_t *p, uint64_t v) ++{ ++ __asm__( "lock ; or %1, %0" ++ : "=m"(*p) : "r"(v) : "memory" ); ++} ++ ++#define a_or_l a_or_l ++static inline void a_or_l(volatile void *p, long v) ++{ ++ __asm__( "lock ; or %1, %0" ++ : "=m"(*(long *)p) : "r"(v) : "memory" ); ++} ++ ++#define a_cas_p a_cas_p ++static inline void *a_cas_p(volatile void *p, void *t, void *s) ++{ ++ __asm__( "lock ; cmpxchg %3, %1" ++ : "=a"(t), "=m"(*(long *)p) : "a"(t), "r"(s) : "memory" ); ++ return t; ++} ++ ++#define a_cas a_cas ++static inline int a_cas(volatile int *p, int t, int s) ++{ ++ __asm__( "lock ; cmpxchg %3, %1" ++ : "=a"(t), "=m"(*p) : "a"(t), "r"(s) : "memory" ); ++ return t; ++} ++ ++#define a_or a_or ++static inline void a_or(volatile int *p, int v) ++{ ++ __asm__( "lock ; or %1, %0" ++ : "=m"(*p) : "r"(v) : "memory" ); ++} ++ ++#define a_and a_and ++static inline void a_and(volatile int *p, int v) ++{ ++ __asm__( "lock ; and %1, %0" ++ : "=m"(*p) : "r"(v) : "memory" ); ++} ++ ++#define a_swap a_swap ++static inline int a_swap(volatile int *x, int v) ++{ ++ __asm__( "xchg %0, %1" : 
"=r"(v), "=m"(*x) : "0"(v) : "memory" ); ++ return v; ++} ++ ++#define a_fetch_add a_fetch_add ++static inline int a_fetch_add(volatile int *x, int v) ++{ ++ __asm__( "lock ; xadd %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" ); ++ return v; ++} ++ ++#define a_inc a_inc ++static inline void a_inc(volatile int *x) ++{ ++ __asm__( "lock ; incl %0" : "=m"(*x) : "m"(*x) : "memory" ); ++} ++ ++#define a_dec a_dec ++static inline void a_dec(volatile int *x) ++{ ++ __asm__( "lock ; decl %0" : "=m"(*x) : "m"(*x) : "memory" ); ++} ++ ++#define a_store a_store ++static inline void a_store(volatile int *p, int x) ++{ ++ __asm__( "mov %1, %0 ; lock ; orl $0,(%%rsp)" : "=m"(*p) : "r"(x) : "memory" ); ++} ++ ++#define a_spin a_spin ++static inline void a_spin() ++{ ++ __asm__ __volatile__( "pause" : : : "memory" ); ++} ++ ++#define a_barrier a_barrier ++static inline void a_barrier() ++{ ++ __asm__ __volatile__( "" : : : "memory" ); ++} ++ ++#define a_crash a_crash ++static inline void a_crash() ++{ ++ __asm__ __volatile__( "hlt" : : : "memory" ); ++} +--- a/arch/x86_64/pthread_arch.h ++++ b/arch/x86_64/pthread_arch.h +@@ -7,4 +7,4 @@ static inline struct pthread *__pthread_ + + #define TP_ADJ(p) (p) + +-#define CANCEL_REG_IP 16 ++#define MC_PC gregs[REG_RIP] +--- a/configure ++++ b/configure +@@ -9,6 +9,9 @@ VAR=VALUE. See below for descriptions o + + Defaults for the options are specified in brackets. 
+ ++Configuration: ++ --srcdir=DIR source directory [detected] ++ + Installation directories: + --prefix=PREFIX main installation prefix [/usr/local/musl] + --exec-prefix=EPREFIX installation prefix for executable files [PREFIX] +@@ -117,6 +120,7 @@ CFLAGS_TRY= + LDFLAGS_AUTO= + LDFLAGS_TRY= + OPTIMIZE_GLOBS= ++srcdir= + prefix=/usr/local/musl + exec_prefix='$(prefix)' + bindir='$(exec_prefix)/bin' +@@ -139,6 +143,7 @@ clang_wrapper=no + for arg ; do + case "$arg" in + --help) usage ;; ++--srcdir=*) srcdir=${arg#*=} ;; + --prefix=*) prefix=${arg#*=} ;; + --exec-prefix=*) exec_prefix=${arg#*=} ;; + --bindir=*) bindir=${arg#*=} ;; +@@ -179,11 +184,23 @@ LIBCC=*) LIBCC=${arg#*=} ;; + esac + done + +-for i in prefix exec_prefix bindir libdir includedir syslibdir ; do ++for i in srcdir prefix exec_prefix bindir libdir includedir syslibdir ; do + stripdir $i + done + + # ++# Get the source dir for out-of-tree builds ++# ++if test -z "$srcdir" ; then ++srcdir="${0%/configure}" ++stripdir srcdir ++fi ++abs_builddir="$(pwd)" || fail "$0: cannot determine working directory" ++abs_srcdir="$(cd $srcdir && pwd)" || fail "$0: invalid source directory $srcdir" ++test "$abs_srcdir" = "$abs_builddir" && srcdir=. ++test "$srcdir" != "." -a -f Makefile -a ! 
-h Makefile && fail "$0: Makefile already exists in the working directory" ++ ++# + # Get a temp filename we can use + # + i=0 +@@ -263,11 +280,11 @@ fi + fi + + if test "$gcc_wrapper" = yes ; then +-tools="$tools tools/musl-gcc" ++tools="$tools obj/musl-gcc" + tool_libs="$tool_libs lib/musl-gcc.specs" + fi + if test "$clang_wrapper" = yes ; then +-tools="$tools tools/musl-clang tools/ld.musl-clang" ++tools="$tools obj/musl-clang obj/ld.musl-clang" + fi + + # +@@ -321,7 +338,7 @@ __attribute__((__may_alias__)) + #endif + x; + EOF +-if $CC $CFLAGS_C99FSE -I./arch/$ARCH -I./include $CPPFLAGS $CFLAGS \ ++if $CC $CFLAGS_C99FSE -I$srcdir/arch/$ARCH -I$srcdir/include $CPPFLAGS $CFLAGS \ + -c -o /dev/null "$tmpc" >/dev/null 2>&1 ; then + printf "no\n" + else +@@ -330,6 +347,13 @@ CFLAGS_C99FSE="$CFLAGS_C99FSE -D__may_al + fi + + # ++# The GNU toolchain defaults to assuming unmarked files need an ++# executable stack, potentially exposing vulnerabilities in programs ++# linked with such object files. Fix this. ++# ++tryflag CFLAGS_C99FSE -Wa,--noexecstack ++ ++# + # Check for options to disable stack protector, which needs to be + # disabled for a few early-bootstrap translation units. If not found, + # this is not an error; we assume the toolchain does not do ssp. +@@ -430,11 +454,15 @@ tryflag CFLAGS_AUTO -fno-unwind-tables + tryflag CFLAGS_AUTO -fno-asynchronous-unwind-tables + + # +-# The GNU toolchain defaults to assuming unmarked files need an +-# executable stack, potentially exposing vulnerabilities in programs +-# linked with such object files. Fix this. ++# Attempt to put each function and each data object in its own ++# section. This both allows additional size optimizations at link ++# time and works around a dangerous class of compiler/assembler bugs ++# whereby relative address expressions are constant-folded by the ++# assembler even when one or more of the symbols involved is ++# replaceable. See gas pr 18561 and gcc pr 66609, 68178, etc. 
+ # +-tryflag CFLAGS_AUTO -Wa,--noexecstack ++tryflag CFLAGS_AUTO -ffunction-sections ++tryflag CFLAGS_AUTO -fdata-sections + + # + # On x86, make sure we don't have incompatible instruction set +@@ -489,7 +517,7 @@ int foo(void) { } + int bar(void) { fp = foo; return foo(); } + EOF + if $CC $CFLAGS_C99FSE $CPPFLAGS $CFLAGS \ +- -DSHARED -fPIC -I./src/internal -include vis.h \ ++ -DSHARED -fPIC -I$srcdir/src/internal -include vis.h \ + -nostdlib -shared -Wl,-Bsymbolic-functions \ + -o /dev/null "$tmpc" >/dev/null 2>&1 ; then + visibility=yes +@@ -504,6 +532,16 @@ CFLAGS_AUTO="$CFLAGS_AUTO -include vis.h + CFLAGS_AUTO="${CFLAGS_AUTO# }" + fi + ++# Reduce space lost to padding for alignment purposes by sorting data ++# objects according to their alignment reqirements. This approximates ++# optimal packing. ++tryldflag LDFLAGS_AUTO -Wl,--sort-section,alignment ++tryldflag LDFLAGS_AUTO -Wl,--sort-common ++ ++# When linking shared library, drop dummy weak definitions that were ++# replaced by strong definitions from other translation units. ++tryldflag LDFLAGS_AUTO -Wl,--gc-sections ++ + # Some patched GCC builds have these defaults messed up... + tryldflag LDFLAGS_AUTO -Wl,--hash-style=both + +@@ -513,6 +551,11 @@ tryldflag LDFLAGS_AUTO -Wl,--hash-style= + # runtime library; implementation error is also a possibility. + tryldflag LDFLAGS_AUTO -Wl,--no-undefined + ++# Avoid exporting symbols from compiler runtime libraries. They ++# should be hidden anyway, but some toolchains including old gcc ++# versions built without shared library support and pcc are broken. 
++tryldflag LDFLAGS_AUTO -Wl,--exclude-libs=ALL ++ + test "$shared" = "no" || { + # Disable dynamic linking if ld is broken and can't do -Bsymbolic-functions + LDFLAGS_DUMMY= +@@ -599,7 +642,7 @@ echo '#include <float.h>' > "$tmpc" + echo '#if LDBL_MANT_DIG == 53' >> "$tmpc" + echo 'typedef char ldcheck[9-(int)sizeof(long double)];' >> "$tmpc" + echo '#endif' >> "$tmpc" +-if $CC $CFLAGS_C99FSE -I./arch/$ARCH -I./include $CPPFLAGS $CFLAGS \ ++if $CC $CFLAGS_C99FSE -I$srcdir/arch/$ARCH -I$srcdir/include $CPPFLAGS $CFLAGS \ + -c -o /dev/null "$tmpc" >/dev/null 2>&1 ; then + printf "yes\n" + else +@@ -622,6 +665,7 @@ cat << EOF + ARCH = $ARCH + SUBARCH = $SUBARCH + ASMSUBARCH = $ASMSUBARCH ++srcdir = $srcdir + prefix = $prefix + exec_prefix = $exec_prefix + bindir = $bindir +@@ -629,12 +673,14 @@ libdir = $libdir + includedir = $includedir + syslibdir = $syslibdir + CC = $CC +-CFLAGS = $CFLAGS_AUTO $CFLAGS ++CFLAGS = $CFLAGS ++CFLAGS_AUTO = $CFLAGS_AUTO + CFLAGS_C99FSE = $CFLAGS_C99FSE + CFLAGS_MEMOPS = $CFLAGS_MEMOPS + CFLAGS_NOSSP = $CFLAGS_NOSSP + CPPFLAGS = $CPPFLAGS +-LDFLAGS = $LDFLAGS_AUTO $LDFLAGS ++LDFLAGS = $LDFLAGS ++LDFLAGS_AUTO = $LDFLAGS_AUTO + CROSS_COMPILE = $CROSS_COMPILE + LIBCC = $LIBCC + OPTIMIZE_GLOBS = $OPTIMIZE_GLOBS +@@ -648,4 +694,6 @@ test "x$cc_family" = xgcc && echo 'WRAPC + test "x$cc_family" = xclang && echo 'WRAPCC_CLANG = $(CC)' + exec 1>&3 3>&- + ++test "$srcdir" = "." || ln -sf $srcdir/Makefile . 
++ + printf "done\n" +--- a/crt/arm/crti.s ++++ b/crt/arm/crti.s +@@ -1,3 +1,5 @@ ++.syntax unified ++ + .section .init + .global _init + .type _init,%function +--- a/crt/arm/crtn.s ++++ b/crt/arm/crtn.s +@@ -1,11 +1,9 @@ ++.syntax unified ++ + .section .init + pop {r0,lr} +- tst lr,#1 +- moveq pc,lr + bx lr + + .section .fini + pop {r0,lr} +- tst lr,#1 +- moveq pc,lr + bx lr +--- a/include/complex.h ++++ b/include/complex.h +@@ -116,7 +116,7 @@ long double creall(long double complex); + + #if __STDC_VERSION__ >= 201112L + #if defined(_Imaginary_I) +-#define __CMPLX(x, y, t) ((t)(x) + _Imaginary_I*(t)(y))) ++#define __CMPLX(x, y, t) ((t)(x) + _Imaginary_I*(t)(y)) + #elif defined(__clang__) + #define __CMPLX(x, y, t) (+(_Complex t){ (t)(x), (t)(y) }) + #else +--- a/include/netinet/tcp.h ++++ b/include/netinet/tcp.h +@@ -41,7 +41,20 @@ + #define TCP_CLOSING 11 + + #if defined(_GNU_SOURCE) || defined(_BSD_SOURCE) ++#define TCPOPT_EOL 0 ++#define TCPOPT_NOP 1 ++#define TCPOPT_MAXSEG 2 ++#define TCPOPT_WINDOW 3 ++#define TCPOPT_SACK_PERMITTED 4 ++#define TCPOPT_SACK 5 ++#define TCPOPT_TIMESTAMP 8 ++#define TCPOLEN_SACK_PERMITTED 2 ++#define TCPOLEN_WINDOW 3 ++#define TCPOLEN_MAXSEG 4 ++#define TCPOLEN_TIMESTAMP 10 ++ + #define SOL_TCP 6 ++ + #include <sys/types.h> + #include <sys/socket.h> + #include <stdint.h> +--- a/src/env/__init_tls.c ++++ b/src/env/__init_tls.c +@@ -8,9 +8,6 @@ + #include "atomic.h" + #include "syscall.h" + +-#ifndef SHARED +-static +-#endif + int __init_tp(void *p) + { + pthread_t td = p; +@@ -24,8 +21,6 @@ int __init_tp(void *p) + return 0; + } + +-#ifndef SHARED +- + static struct builtin_tls { + char c; + struct pthread pt; +@@ -33,33 +28,40 @@ static struct builtin_tls { + } builtin_tls[1]; + #define MIN_TLS_ALIGN offsetof(struct builtin_tls, pt) + +-struct tls_image { +- void *image; +- size_t len, size, align; +-} __static_tls; +- +-#define T __static_tls ++static struct tls_module main_tls; + + void *__copy_tls(unsigned char *mem) + { + 
pthread_t td; +- if (!T.image) return mem; +- void **dtv = (void *)mem; +- dtv[0] = (void *)1; ++ struct tls_module *p; ++ size_t i; ++ void **dtv; ++ + #ifdef TLS_ABOVE_TP +- mem += sizeof(void *) * 2; +- mem += -((uintptr_t)mem + sizeof(struct pthread)) & (T.align-1); ++ dtv = (void **)(mem + libc.tls_size) - (libc.tls_cnt + 1); ++ ++ mem += -((uintptr_t)mem + sizeof(struct pthread)) & (libc.tls_align-1); + td = (pthread_t)mem; + mem += sizeof(struct pthread); ++ ++ for (i=1, p=libc.tls_head; p; i++, p=p->next) { ++ dtv[i] = mem + p->offset; ++ memcpy(dtv[i], p->image, p->len); ++ } + #else ++ dtv = (void **)mem; ++ + mem += libc.tls_size - sizeof(struct pthread); +- mem -= (uintptr_t)mem & (T.align-1); ++ mem -= (uintptr_t)mem & (libc.tls_align-1); + td = (pthread_t)mem; +- mem -= T.size; ++ ++ for (i=1, p=libc.tls_head; p; i++, p=p->next) { ++ dtv[i] = mem - p->offset; ++ memcpy(dtv[i], p->image, p->len); ++ } + #endif ++ dtv[0] = (void *)libc.tls_cnt; + td->dtv = td->dtv_copy = dtv; +- dtv[1] = mem; +- memcpy(mem, T.image, T.len); + return td; + } + +@@ -69,7 +71,7 @@ typedef Elf32_Phdr Phdr; + typedef Elf64_Phdr Phdr; + #endif + +-void __init_tls(size_t *aux) ++static void static_init_tls(size_t *aux) + { + unsigned char *p; + size_t n; +@@ -86,16 +88,24 @@ void __init_tls(size_t *aux) + } + + if (tls_phdr) { +- T.image = (void *)(base + tls_phdr->p_vaddr); +- T.len = tls_phdr->p_filesz; +- T.size = tls_phdr->p_memsz; +- T.align = tls_phdr->p_align; ++ main_tls.image = (void *)(base + tls_phdr->p_vaddr); ++ main_tls.len = tls_phdr->p_filesz; ++ main_tls.size = tls_phdr->p_memsz; ++ main_tls.align = tls_phdr->p_align; ++ libc.tls_cnt = 1; ++ libc.tls_head = &main_tls; + } + +- T.size += (-T.size - (uintptr_t)T.image) & (T.align-1); +- if (T.align < MIN_TLS_ALIGN) T.align = MIN_TLS_ALIGN; ++ main_tls.size += (-main_tls.size - (uintptr_t)main_tls.image) ++ & (main_tls.align-1); ++ if (main_tls.align < MIN_TLS_ALIGN) main_tls.align = MIN_TLS_ALIGN; ++#ifndef 
TLS_ABOVE_TP ++ main_tls.offset = main_tls.size; ++#endif + +- libc.tls_size = 2*sizeof(void *)+T.size+T.align+sizeof(struct pthread) ++ libc.tls_align = main_tls.align; ++ libc.tls_size = 2*sizeof(void *) + sizeof(struct pthread) ++ + main_tls.size + main_tls.align + + MIN_TLS_ALIGN-1 & -MIN_TLS_ALIGN; + + if (libc.tls_size > sizeof builtin_tls) { +@@ -117,6 +127,5 @@ void __init_tls(size_t *aux) + if (__init_tp(__copy_tls(mem)) < 0) + a_crash(); + } +-#else +-void __init_tls(size_t *auxv) { } +-#endif ++ ++weak_alias(static_init_tls, __init_tls); +--- a/src/env/__libc_start_main.c ++++ b/src/env/__libc_start_main.c +@@ -8,21 +8,17 @@ + + void __init_tls(size_t *); + +-#ifndef SHARED +-static void dummy() {} ++static void dummy(void) {} + weak_alias(dummy, _init); +-extern void (*const __init_array_start)() __attribute__((weak)); +-extern void (*const __init_array_end)() __attribute__((weak)); +-#endif ++ ++__attribute__((__weak__, __visibility__("hidden"))) ++extern void (*const __init_array_start)(void), (*const __init_array_end)(void); + + static void dummy1(void *p) {} + weak_alias(dummy1, __init_ssp); + + #define AUX_CNT 38 + +-#ifndef SHARED +-static +-#endif + void __init_libc(char **envp, char *pn) + { + size_t i, *auxv, aux[AUX_CNT] = { 0 }; +@@ -57,20 +53,22 @@ void __init_libc(char **envp, char *pn) + libc.secure = 1; + } + +-int __libc_start_main(int (*main)(int,char **,char **), int argc, char **argv) ++static void libc_start_init(void) + { +- char **envp = argv+argc+1; +- +-#ifndef SHARED +- __init_libc(envp, argv[0]); + _init(); + uintptr_t a = (uintptr_t)&__init_array_start; + for (; a<(uintptr_t)&__init_array_end; a+=sizeof(void(*)())) + (*(void (**)())a)(); +-#else +- void __libc_start_init(void); ++} ++ ++weak_alias(libc_start_init, __libc_start_init); ++ ++int __libc_start_main(int (*main)(int,char **,char **), int argc, char **argv) ++{ ++ char **envp = argv+argc+1; ++ ++ __init_libc(envp, argv[0]); + __libc_start_init(); +-#endif + + /* Pass 
control to the application */ + exit(main(argc, argv, envp)); +--- a/src/env/__reset_tls.c ++++ b/src/env/__reset_tls.c +@@ -1,21 +1,16 @@ +-#ifndef SHARED +- + #include <string.h> + #include "pthread_impl.h" +- +-extern struct tls_image { +- void *image; +- size_t len, size, align; +-} __static_tls; +- +-#define T __static_tls ++#include "libc.h" + + void __reset_tls() + { +- if (!T.size) return; + pthread_t self = __pthread_self(); +- memcpy(self->dtv[1], T.image, T.len); +- memset((char *)self->dtv[1]+T.len, 0, T.size-T.len); ++ struct tls_module *p; ++ size_t i, n = (size_t)self->dtv[0]; ++ if (n) for (p=libc.tls_head, i=1; i<=n; i++, p=p->next) { ++ if (!self->dtv[i]) continue; ++ memcpy(self->dtv[i], p->image, p->len); ++ memset((char *)self->dtv[i]+p->len, 0, ++ p->size - p->len); ++ } + } +- +-#endif +--- a/src/env/__stack_chk_fail.c ++++ b/src/env/__stack_chk_fail.c +@@ -17,16 +17,7 @@ void __stack_chk_fail(void) + a_crash(); + } + +-#ifdef SHARED +- + __attribute__((__visibility__("hidden"))) +-void __stack_chk_fail_local(void) +-{ +- a_crash(); +-} +- +-#else ++void __stack_chk_fail_local(void); + + weak_alias(__stack_chk_fail, __stack_chk_fail_local); +- +-#endif +--- /dev/null ++++ b/src/exit/arm/__aeabi_atexit.c +@@ -0,0 +1,6 @@ ++int __cxa_atexit(void (*func)(void *), void *arg, void *dso); ++ ++int __aeabi_atexit (void *obj, void (*func) (void *), void *d) ++{ ++ return __cxa_atexit (func, obj, d); ++} +--- a/src/exit/exit.c ++++ b/src/exit/exit.c +@@ -10,25 +10,25 @@ static void dummy() + * as a consequence of linking either __toread.c or __towrite.c. 
*/ + weak_alias(dummy, __funcs_on_exit); + weak_alias(dummy, __stdio_exit); +- +-#ifndef SHARED + weak_alias(dummy, _fini); +-extern void (*const __fini_array_start)() __attribute__((weak)); +-extern void (*const __fini_array_end)() __attribute__((weak)); +-#endif + +-_Noreturn void exit(int code) +-{ +- __funcs_on_exit(); ++__attribute__((__weak__, __visibility__("hidden"))) ++extern void (*const __fini_array_start)(void), (*const __fini_array_end)(void); + +-#ifndef SHARED ++static void libc_exit_fini(void) ++{ + uintptr_t a = (uintptr_t)&__fini_array_end; + for (; a>(uintptr_t)&__fini_array_start; a-=sizeof(void(*)())) + (*(void (**)())(a-sizeof(void(*)())))(); + _fini(); +-#endif ++} + +- __stdio_exit(); ++weak_alias(libc_exit_fini, __libc_exit_fini); + ++_Noreturn void exit(int code) ++{ ++ __funcs_on_exit(); ++ __libc_exit_fini(); ++ __stdio_exit(); + _Exit(code); + } +--- /dev/null ++++ b/src/fenv/arm/fenv-hf.S +@@ -0,0 +1,69 @@ ++#if __ARM_PCS_VFP ++ ++.syntax unified ++.fpu vfp ++ ++.global fegetround ++.type fegetround,%function ++fegetround: ++ fmrx r0, fpscr ++ and r0, r0, #0xc00000 ++ bx lr ++ ++.global __fesetround ++.type __fesetround,%function ++__fesetround: ++ fmrx r3, fpscr ++ bic r3, r3, #0xc00000 ++ orr r3, r3, r0 ++ fmxr fpscr, r3 ++ mov r0, #0 ++ bx lr ++ ++.global fetestexcept ++.type fetestexcept,%function ++fetestexcept: ++ and r0, r0, #0x1f ++ fmrx r3, fpscr ++ and r0, r0, r3 ++ bx lr ++ ++.global feclearexcept ++.type feclearexcept,%function ++feclearexcept: ++ and r0, r0, #0x1f ++ fmrx r3, fpscr ++ bic r3, r3, r0 ++ fmxr fpscr, r3 ++ mov r0, #0 ++ bx lr ++ ++.global feraiseexcept ++.type feraiseexcept,%function ++feraiseexcept: ++ and r0, r0, #0x1f ++ fmrx r3, fpscr ++ orr r3, r3, r0 ++ fmxr fpscr, r3 ++ mov r0, #0 ++ bx lr ++ ++.global fegetenv ++.type fegetenv,%function ++fegetenv: ++ fmrx r3, fpscr ++ str r3, [r0] ++ mov r0, #0 ++ bx lr ++ ++.global fesetenv ++.type fesetenv,%function ++fesetenv: ++ cmn r0, #1 ++ moveq r3, #0 ++ 
ldrne r3, [r0] ++ fmxr fpscr, r3 ++ mov r0, #0 ++ bx lr ++ ++#endif +--- /dev/null ++++ b/src/fenv/arm/fenv.c +@@ -0,0 +1,3 @@ ++#if !__ARM_PCS_VFP ++#include "../fenv.c" ++#endif +--- a/src/fenv/armebhf/fenv.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../armhf/fenv.s +--- a/src/fenv/armhf/fenv.s ++++ /dev/null +@@ -1,64 +0,0 @@ +-.fpu vfp +- +-.global fegetround +-.type fegetround,%function +-fegetround: +- mrc p10, 7, r0, cr1, cr0, 0 +- and r0, r0, #0xc00000 +- bx lr +- +-.global __fesetround +-.type __fesetround,%function +-__fesetround: +- mrc p10, 7, r3, cr1, cr0, 0 +- bic r3, r3, #0xc00000 +- orr r3, r3, r0 +- mcr p10, 7, r3, cr1, cr0, 0 +- mov r0, #0 +- bx lr +- +-.global fetestexcept +-.type fetestexcept,%function +-fetestexcept: +- and r0, r0, #0x1f +- mrc p10, 7, r3, cr1, cr0, 0 +- and r0, r0, r3 +- bx lr +- +-.global feclearexcept +-.type feclearexcept,%function +-feclearexcept: +- and r0, r0, #0x1f +- mrc p10, 7, r3, cr1, cr0, 0 +- bic r3, r3, r0 +- mcr p10, 7, r3, cr1, cr0, 0 +- mov r0, #0 +- bx lr +- +-.global feraiseexcept +-.type feraiseexcept,%function +-feraiseexcept: +- and r0, r0, #0x1f +- mrc p10, 7, r3, cr1, cr0, 0 +- orr r3, r3, r0 +- mcr p10, 7, r3, cr1, cr0, 0 +- mov r0, #0 +- bx lr +- +-.global fegetenv +-.type fegetenv,%function +-fegetenv: +- mrc p10, 7, r3, cr1, cr0, 0 +- str r3, [r0] +- mov r0, #0 +- bx lr +- +-.global fesetenv +-.type fesetenv,%function +-fesetenv: +- cmn r0, #1 +- moveq r3, #0 +- ldrne r3, [r0] +- mcr p10, 7, r3, cr1, cr0, 0 +- mov r0, #0 +- bx lr +--- a/src/fenv/armhf/fenv.sub ++++ /dev/null +@@ -1 +0,0 @@ +-fenv.s +--- a/src/fenv/mips-sf/fenv.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../fenv.c +--- /dev/null ++++ b/src/fenv/mips/fenv-sf.c +@@ -0,0 +1,3 @@ ++#ifdef __mips_soft_float ++#include "../fenv.c" ++#endif +--- /dev/null ++++ b/src/fenv/mips/fenv.S +@@ -0,0 +1,71 @@ ++#ifndef __mips_soft_float ++ ++.set noreorder ++ ++.global feclearexcept ++.type feclearexcept,@function ++feclearexcept: ++ and $4, $4, 0x7c ++ cfc1 $5, 
$31 ++ or $5, $5, $4 ++ xor $5, $5, $4 ++ ctc1 $5, $31 ++ jr $ra ++ li $2, 0 ++ ++.global feraiseexcept ++.type feraiseexcept,@function ++feraiseexcept: ++ and $4, $4, 0x7c ++ cfc1 $5, $31 ++ or $5, $5, $4 ++ ctc1 $5, $31 ++ jr $ra ++ li $2, 0 ++ ++.global fetestexcept ++.type fetestexcept,@function ++fetestexcept: ++ and $4, $4, 0x7c ++ cfc1 $2, $31 ++ jr $ra ++ and $2, $2, $4 ++ ++.global fegetround ++.type fegetround,@function ++fegetround: ++ cfc1 $2, $31 ++ jr $ra ++ andi $2, $2, 3 ++ ++.global __fesetround ++.type __fesetround,@function ++__fesetround: ++ cfc1 $5, $31 ++ li $6, -4 ++ and $5, $5, $6 ++ or $5, $5, $4 ++ ctc1 $5, $31 ++ jr $ra ++ li $2, 0 ++ ++.global fegetenv ++.type fegetenv,@function ++fegetenv: ++ cfc1 $5, $31 ++ sw $5, 0($4) ++ jr $ra ++ li $2, 0 ++ ++.global fesetenv ++.type fesetenv,@function ++fesetenv: ++ addiu $5, $4, 1 ++ beq $5, $0, 1f ++ nop ++ lw $5, 0($4) ++1: ctc1 $5, $31 ++ jr $ra ++ li $2, 0 ++ ++#endif +--- a/src/fenv/mips/fenv.s ++++ /dev/null +@@ -1,67 +0,0 @@ +-.set noreorder +- +-.global feclearexcept +-.type feclearexcept,@function +-feclearexcept: +- and $4, $4, 0x7c +- cfc1 $5, $31 +- or $5, $5, $4 +- xor $5, $5, $4 +- ctc1 $5, $31 +- jr $ra +- li $2, 0 +- +-.global feraiseexcept +-.type feraiseexcept,@function +-feraiseexcept: +- and $4, $4, 0x7c +- cfc1 $5, $31 +- or $5, $5, $4 +- ctc1 $5, $31 +- jr $ra +- li $2, 0 +- +-.global fetestexcept +-.type fetestexcept,@function +-fetestexcept: +- and $4, $4, 0x7c +- cfc1 $2, $31 +- jr $ra +- and $2, $2, $4 +- +-.global fegetround +-.type fegetround,@function +-fegetround: +- cfc1 $2, $31 +- jr $ra +- andi $2, $2, 3 +- +-.global __fesetround +-.type __fesetround,@function +-__fesetround: +- cfc1 $5, $31 +- li $6, -4 +- and $5, $5, $6 +- or $5, $5, $4 +- ctc1 $5, $31 +- jr $ra +- li $2, 0 +- +-.global fegetenv +-.type fegetenv,@function +-fegetenv: +- cfc1 $5, $31 +- sw $5, 0($4) +- jr $ra +- li $2, 0 +- +-.global fesetenv +-.type fesetenv,@function +-fesetenv: +- addiu $5, 
$4, 1 +- beq $5, $0, 1f +- nop +- lw $5, 0($4) +-1: ctc1 $5, $31 +- jr $ra +- li $2, 0 +--- a/src/fenv/mipsel-sf/fenv.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../fenv.c +--- a/src/fenv/sh-nofpu/fenv.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../fenv.c +--- /dev/null ++++ b/src/fenv/sh/fenv-nofpu.c +@@ -0,0 +1,3 @@ ++#if !__SH_FPU_ANY__ && !__SH4__ ++#include "../fenv.c" ++#endif +--- /dev/null ++++ b/src/fenv/sh/fenv.S +@@ -0,0 +1,78 @@ ++#if __SH_FPU_ANY__ || __SH4__ ++ ++.global fegetround ++.type fegetround, @function ++fegetround: ++ sts fpscr, r0 ++ rts ++ and #3, r0 ++ ++.global __fesetround ++.type __fesetround, @function ++__fesetround: ++ sts fpscr, r0 ++ or r4, r0 ++ lds r0, fpscr ++ rts ++ mov #0, r0 ++ ++.global fetestexcept ++.type fetestexcept, @function ++fetestexcept: ++ sts fpscr, r0 ++ and r4, r0 ++ rts ++ and #0x7c, r0 ++ ++.global feclearexcept ++.type feclearexcept, @function ++feclearexcept: ++ mov r4, r0 ++ and #0x7c, r0 ++ not r0, r4 ++ sts fpscr, r0 ++ and r4, r0 ++ lds r0, fpscr ++ rts ++ mov #0, r0 ++ ++.global feraiseexcept ++.type feraiseexcept, @function ++feraiseexcept: ++ mov r4, r0 ++ and #0x7c, r0 ++ sts fpscr, r4 ++ or r4, r0 ++ lds r0, fpscr ++ rts ++ mov #0, r0 ++ ++.global fegetenv ++.type fegetenv, @function ++fegetenv: ++ sts fpscr, r0 ++ mov.l r0, @r4 ++ rts ++ mov #0, r0 ++ ++.global fesetenv ++.type fesetenv, @function ++fesetenv: ++ mov r4, r0 ++ cmp/eq #-1, r0 ++ bf 1f ++ ++ ! the default environment is complicated by the fact that we need to ++ ! preserve the current precision bit, which we do not know a priori ++ sts fpscr, r0 ++ mov #8, r1 ++ swap.w r1, r1 ++ bra 2f ++ and r1, r0 ++ ++1: mov.l @r4, r0 ! 
non-default environment ++2: lds r0, fpscr ++ rts ++ mov #0, r0 ++ ++#endif +--- a/src/fenv/sh/fenv.s ++++ /dev/null +@@ -1,74 +0,0 @@ +-.global fegetround +-.type fegetround, @function +-fegetround: +- sts fpscr, r0 +- rts +- and #3, r0 +- +-.global __fesetround +-.type __fesetround, @function +-__fesetround: +- sts fpscr, r0 +- or r4, r0 +- lds r0, fpscr +- rts +- mov #0, r0 +- +-.global fetestexcept +-.type fetestexcept, @function +-fetestexcept: +- sts fpscr, r0 +- and r4, r0 +- rts +- and #0x7c, r0 +- +-.global feclearexcept +-.type feclearexcept, @function +-feclearexcept: +- mov r4, r0 +- and #0x7c, r0 +- not r0, r4 +- sts fpscr, r0 +- and r4, r0 +- lds r0, fpscr +- rts +- mov #0, r0 +- +-.global feraiseexcept +-.type feraiseexcept, @function +-feraiseexcept: +- mov r4, r0 +- and #0x7c, r0 +- sts fpscr, r4 +- or r4, r0 +- lds r0, fpscr +- rts +- mov #0, r0 +- +-.global fegetenv +-.type fegetenv, @function +-fegetenv: +- sts fpscr, r0 +- mov.l r0, @r4 +- rts +- mov #0, r0 +- +-.global fesetenv +-.type fesetenv, @function +-fesetenv: +- mov r4, r0 +- cmp/eq #-1, r0 +- bf 1f +- +- ! the default environment is complicated by the fact that we need to +- ! preserve the current precision bit, which we do not know a priori +- sts fpscr, r0 +- mov #8, r1 +- swap.w r1, r1 +- bra 2f +- and r1, r0 +- +-1: mov.l @r4, r0 ! 
non-default environment +-2: lds r0, fpscr +- rts +- mov #0, r0 +--- a/src/fenv/sheb-nofpu/fenv.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../fenv.c +--- a/src/internal/arm/syscall.s ++++ b/src/internal/arm/syscall.s +@@ -1,3 +1,4 @@ ++.syntax unified + .global __syscall + .hidden __syscall + .type __syscall,%function +@@ -11,6 +12,4 @@ __syscall: + ldmfd ip,{r3,r4,r5,r6} + svc 0 + ldmfd sp!,{r4,r5,r6,r7} +- tst lr,#1 +- moveq pc,lr + bx lr +--- /dev/null ++++ b/src/internal/atomic.h +@@ -0,0 +1,275 @@ ++#ifndef _ATOMIC_H ++#define _ATOMIC_H ++ ++#include <stdint.h> ++ ++#include "atomic_arch.h" ++ ++#ifdef a_ll ++ ++#ifndef a_pre_llsc ++#define a_pre_llsc() ++#endif ++ ++#ifndef a_post_llsc ++#define a_post_llsc() ++#endif ++ ++#ifndef a_cas ++#define a_cas a_cas ++static inline int a_cas(volatile int *p, int t, int s) ++{ ++ int old; ++ a_pre_llsc(); ++ do old = a_ll(p); ++ while (old==t && !a_sc(p, s)); ++ a_post_llsc(); ++ return old; ++} ++#endif ++ ++#ifndef a_swap ++#define a_swap a_swap ++static inline int a_swap(volatile int *p, int v) ++{ ++ int old; ++ a_pre_llsc(); ++ do old = a_ll(p); ++ while (!a_sc(p, v)); ++ a_post_llsc(); ++ return old; ++} ++#endif ++ ++#ifndef a_fetch_add ++#define a_fetch_add a_fetch_add ++static inline int a_fetch_add(volatile int *p, int v) ++{ ++ int old; ++ a_pre_llsc(); ++ do old = a_ll(p); ++ while (!a_sc(p, (unsigned)old + v)); ++ a_post_llsc(); ++ return old; ++} ++#endif ++ ++#ifndef a_fetch_and ++#define a_fetch_and a_fetch_and ++static inline int a_fetch_and(volatile int *p, int v) ++{ ++ int old; ++ a_pre_llsc(); ++ do old = a_ll(p); ++ while (!a_sc(p, old & v)); ++ a_post_llsc(); ++ return old; ++} ++#endif ++ ++#ifndef a_fetch_or ++#define a_fetch_or a_fetch_or ++static inline int a_fetch_or(volatile int *p, int v) ++{ ++ int old; ++ a_pre_llsc(); ++ do old = a_ll(p); ++ while (!a_sc(p, old | v)); ++ a_post_llsc(); ++ return old; ++} ++#endif ++ ++#endif ++ ++#ifndef a_cas ++#error missing definition of a_cas ++#endif ++ 
++#ifndef a_swap ++#define a_swap a_swap ++static inline int a_swap(volatile int *p, int v) ++{ ++ int old; ++ do old = *p; ++ while (a_cas(p, old, v) != old); ++ return old; ++} ++#endif ++ ++#ifndef a_fetch_add ++#define a_fetch_add a_fetch_add ++static inline int a_fetch_add(volatile int *p, int v) ++{ ++ int old; ++ do old = *p; ++ while (a_cas(p, old, (unsigned)old+v) != old); ++ return old; ++} ++#endif ++ ++#ifndef a_fetch_and ++#define a_fetch_and a_fetch_and ++static inline int a_fetch_and(volatile int *p, int v) ++{ ++ int old; ++ do old = *p; ++ while (a_cas(p, old, old&v) != old); ++ return old; ++} ++#endif ++#ifndef a_fetch_or ++#define a_fetch_or a_fetch_or ++static inline int a_fetch_or(volatile int *p, int v) ++{ ++ int old; ++ do old = *p; ++ while (a_cas(p, old, old|v) != old); ++ return old; ++} ++#endif ++ ++#ifndef a_and ++#define a_and a_and ++static inline void a_and(volatile int *p, int v) ++{ ++ a_fetch_and(p, v); ++} ++#endif ++ ++#ifndef a_or ++#define a_or a_or ++static inline void a_or(volatile int *p, int v) ++{ ++ a_fetch_or(p, v); ++} ++#endif ++ ++#ifndef a_inc ++#define a_inc a_inc ++static inline void a_inc(volatile int *p) ++{ ++ a_fetch_add(p, 1); ++} ++#endif ++ ++#ifndef a_dec ++#define a_dec a_dec ++static inline void a_dec(volatile int *p) ++{ ++ a_fetch_add(p, -1); ++} ++#endif ++ ++#ifndef a_store ++#define a_store a_store ++static inline void a_store(volatile int *p, int v) ++{ ++#ifdef a_barrier ++ a_barrier(); ++ *p = v; ++ a_barrier(); ++#else ++ a_swap(p, v); ++#endif ++} ++#endif ++ ++#ifndef a_barrier ++#define a_barrier a_barrier ++static void a_barrier() ++{ ++ volatile int tmp = 0; ++ a_cas(&tmp, 0, 0); ++} ++#endif ++ ++#ifndef a_spin ++#define a_spin a_barrier ++#endif ++ ++#ifndef a_and_64 ++#define a_and_64 a_and_64 ++static inline void a_and_64(volatile uint64_t *p, uint64_t v) ++{ ++ union { uint64_t v; uint32_t r[2]; } u = { v }; ++ if (u.r[0]+1) a_and((int *)p, u.r[0]); ++ if (u.r[1]+1) a_and((int *)p+1, 
u.r[1]); ++} ++#endif ++ ++#ifndef a_or_64 ++#define a_or_64 a_or_64 ++static inline void a_or_64(volatile uint64_t *p, uint64_t v) ++{ ++ union { uint64_t v; uint32_t r[2]; } u = { v }; ++ if (u.r[0]) a_or((int *)p, u.r[0]); ++ if (u.r[1]) a_or((int *)p+1, u.r[1]); ++} ++#endif ++ ++#ifndef a_cas_p ++#define a_cas_p a_cas_p ++static inline void *a_cas_p(volatile void *p, void *t, void *s) ++{ ++ return (void *)a_cas((volatile int *)p, (int)t, (int)s); ++} ++#endif ++ ++#ifndef a_or_l ++#define a_or_l a_or_l ++static inline void a_or_l(volatile void *p, long v) ++{ ++ if (sizeof(long) == sizeof(int)) a_or(p, v); ++ else a_or_64(p, v); ++} ++#endif ++ ++#ifndef a_crash ++#define a_crash a_crash ++static inline void a_crash() ++{ ++ *(volatile char *)0=0; ++} ++#endif ++ ++#ifndef a_ctz_64 ++#define a_ctz_64 a_ctz_64 ++static inline int a_ctz_64(uint64_t x) ++{ ++ static const char debruijn64[64] = { ++ 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, ++ 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, ++ 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, ++ 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12 ++ }; ++ static const char debruijn32[32] = { ++ 0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13, ++ 31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14 ++ }; ++ if (sizeof(long) < 8) { ++ uint32_t y = x; ++ if (!y) { ++ y = x>>32; ++ return 32 + debruijn32[(y&-y)*0x076be629 >> 27]; ++ } ++ return debruijn32[(y&-y)*0x076be629 >> 27]; ++ } ++ return debruijn64[(x&-x)*0x022fdd63cc95386dull >> 58]; ++} ++#endif ++ ++#ifndef a_ctz_l ++#define a_ctz_l a_ctz_l ++static inline int a_ctz_l(unsigned long x) ++{ ++ static const char debruijn32[32] = { ++ 0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13, ++ 31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14 ++ }; ++ if (sizeof(long) == 8) return a_ctz_64(x); ++ return debruijn32[(x&-x)*0x076be629 >> 27]; ++} ++#endif ++ ++#endif +--- 
a/src/internal/dynlink.h ++++ b/src/internal/dynlink.h +@@ -64,6 +64,10 @@ struct fdpic_dummy_loadmap { + #define DL_FDPIC 0 + #endif + ++#ifndef DL_NOMMU_SUPPORT ++#define DL_NOMMU_SUPPORT 0 ++#endif ++ + #if !DL_FDPIC + #define IS_RELATIVE(x,s) ( \ + (R_TYPE(x) == REL_RELATIVE) || \ +--- a/src/internal/libc.h ++++ b/src/internal/libc.h +@@ -11,13 +11,20 @@ struct __locale_struct { + const struct __locale_map *volatile cat[6]; + }; + ++struct tls_module { ++ struct tls_module *next; ++ void *image; ++ size_t len, size, align, offset; ++}; ++ + struct __libc { + int can_do_threads; + int threaded; + int secure; + volatile int threads_minus_1; + size_t *auxv; +- size_t tls_size; ++ struct tls_module *tls_head; ++ size_t tls_size, tls_align, tls_cnt; + size_t page_size; + struct __locale_struct global_locale; + }; +--- a/src/internal/syscall.h ++++ b/src/internal/syscall.h +@@ -17,9 +17,7 @@ + typedef long syscall_arg_t; + #endif + +-#ifdef SHARED + __attribute__((visibility("hidden"))) +-#endif + long __syscall_ret(unsigned long), __syscall(syscall_arg_t, ...), + __syscall_cp(syscall_arg_t, syscall_arg_t, syscall_arg_t, syscall_arg_t, + syscall_arg_t, syscall_arg_t, syscall_arg_t); +--- a/src/internal/version.c ++++ b/src/internal/version.c +@@ -1,12 +1,9 @@ +-#ifdef SHARED +- + #include "version.h" + + static const char version[] = VERSION; + ++__attribute__((__visibility__("hidden"))) + const char *__libc_get_version() + { + return version; + } +- +-#endif +--- a/src/internal/vis.h ++++ b/src/internal/vis.h +@@ -4,10 +4,9 @@ + * override default visibilities to reduce the size and performance costs + * of position-independent code. */ + +-#ifndef CRT +-#ifdef SHARED ++#if !defined(CRT) && !defined(__ASSEMBLER__) + +-/* For shared libc.so, all symbols should be protected, but some toolchains ++/* Conceptually, all symbols should be protected, but some toolchains + * fail to support copy relocations for protected data, so exclude all + * exported data symbols. 
*/ + +@@ -25,16 +24,4 @@ extern char *optarg, **environ, **__envi + + #pragma GCC visibility push(protected) + +-#elif defined(__PIC__) +- +-/* If building static libc.a as position-independent code, try to make +- * everything hidden except possibly-undefined weak references. */ +- +-__attribute__((__visibility__("default"))) +-extern void (*const __init_array_start)(), (*const __init_array_end)(), +- (*const __fini_array_start)(), (*const __fini_array_end)(); +- +-#pragma GCC visibility push(hidden) +- +-#endif + #endif +--- a/src/ldso/arm/dlsym.s ++++ b/src/ldso/arm/dlsym.s +@@ -1,3 +1,4 @@ ++.syntax unified + .text + .global dlsym + .hidden __dlsym +--- /dev/null ++++ b/src/ldso/arm/find_exidx.c +@@ -0,0 +1,42 @@ ++#define _GNU_SOURCE ++#include <link.h> ++#include <stdint.h> ++ ++struct find_exidx_data { ++ uintptr_t pc, exidx_start; ++ int exidx_len; ++}; ++ ++static int find_exidx(struct dl_phdr_info *info, size_t size, void *ptr) ++{ ++ struct find_exidx_data *data = ptr; ++ const ElfW(Phdr) *phdr = info->dlpi_phdr; ++ uintptr_t addr, exidx_start = 0; ++ int i, match = 0, exidx_len = 0; ++ ++ for (i = info->dlpi_phnum; i > 0; i--, phdr++) { ++ addr = info->dlpi_addr + phdr->p_vaddr; ++ switch (phdr->p_type) { ++ case PT_LOAD: ++ match |= data->pc >= addr && data->pc < addr + phdr->p_memsz; ++ break; ++ case PT_ARM_EXIDX: ++ exidx_start = addr; ++ exidx_len = phdr->p_memsz; ++ break; ++ } ++ } ++ data->exidx_start = exidx_start; ++ data->exidx_len = exidx_len; ++ return match; ++} ++ ++uintptr_t __gnu_Unwind_Find_exidx(uintptr_t pc, int *pcount) ++{ ++ struct find_exidx_data data; ++ data.pc = pc; ++ if (dl_iterate_phdr(find_exidx, &data) <= 0) ++ return 0; ++ *pcount = data.exidx_len / 8; ++ return data.exidx_start; ++} +--- a/src/ldso/dynlink.c ++++ b/src/ldso/dynlink.c +@@ -70,8 +70,8 @@ struct dso { + char kernel_mapped; + struct dso **deps, *needed_by; + char *rpath_orig, *rpath; +- void *tls_image; +- size_t tls_len, tls_size, tls_align, tls_id, 
tls_offset; ++ struct tls_module tls; ++ size_t tls_id; + size_t relro_start, relro_end; + void **new_dtv; + unsigned char *new_tls; +@@ -99,7 +99,9 @@ struct symdef { + + int __init_tp(void *); + void __init_libc(char **, char *); ++void *__copy_tls(unsigned char *); + ++__attribute__((__visibility__("hidden"))) + const char *__libc_get_version(void); + + static struct builtin_tls { +@@ -123,6 +125,7 @@ static int noload; + static jmp_buf *rtld_fail; + static pthread_rwlock_t lock; + static struct debug debug; ++static struct tls_module *tls_tail; + static size_t tls_cnt, tls_offset, tls_align = MIN_TLS_ALIGN; + static size_t static_tls_cnt; + static pthread_mutex_t init_fini_lock = { ._m_type = PTHREAD_MUTEX_RECURSIVE }; +@@ -131,6 +134,15 @@ static struct fdpic_dummy_loadmap app_du + + struct debug *_dl_debug_addr = &debug; + ++__attribute__((__visibility__("hidden"))) ++void (*const __init_array_start)(void)=0, (*const __fini_array_start)(void)=0; ++ ++__attribute__((__visibility__("hidden"))) ++extern void (*const __init_array_end)(void), (*const __fini_array_end)(void); ++ ++weak_alias(__init_array_start, __init_array_end); ++weak_alias(__fini_array_start, __fini_array_end); ++ + static int dl_strcmp(const char *l, const char *r) + { + for (; *l==*r && *l; l++, r++); +@@ -397,14 +409,14 @@ static void do_relocs(struct dso *dso, s + break; + #ifdef TLS_ABOVE_TP + case REL_TPOFF: +- *reloc_addr = tls_val + def.dso->tls_offset + TPOFF_K + addend; ++ *reloc_addr = tls_val + def.dso->tls.offset + TPOFF_K + addend; + break; + #else + case REL_TPOFF: +- *reloc_addr = tls_val - def.dso->tls_offset + addend; ++ *reloc_addr = tls_val - def.dso->tls.offset + addend; + break; + case REL_TPOFF_NEG: +- *reloc_addr = def.dso->tls_offset - tls_val + addend; ++ *reloc_addr = def.dso->tls.offset - tls_val + addend; + break; + #endif + case REL_TLSDESC: +@@ -426,10 +438,10 @@ static void do_relocs(struct dso *dso, s + } else { + reloc_addr[0] = (size_t)__tlsdesc_static; + 
#ifdef TLS_ABOVE_TP +- reloc_addr[1] = tls_val + def.dso->tls_offset ++ reloc_addr[1] = tls_val + def.dso->tls.offset + + TPOFF_K + addend; + #else +- reloc_addr[1] = tls_val - def.dso->tls_offset ++ reloc_addr[1] = tls_val - def.dso->tls.offset + + addend; + #endif + } +@@ -482,8 +494,14 @@ static void reclaim_gaps(struct dso *dso + + static void *mmap_fixed(void *p, size_t n, int prot, int flags, int fd, off_t off) + { +- char *q = mmap(p, n, prot, flags, fd, off); +- if (q != MAP_FAILED || errno != EINVAL) return q; ++ static int no_map_fixed; ++ char *q; ++ if (!no_map_fixed) { ++ q = mmap(p, n, prot, flags|MAP_FIXED, fd, off); ++ if (!DL_NOMMU_SUPPORT || q != MAP_FAILED || errno != EINVAL) ++ return q; ++ no_map_fixed = 1; ++ } + /* Fallbacks for MAP_FIXED failure on NOMMU kernels. */ + if (flags & MAP_ANONYMOUS) { + memset(p, 0, n); +@@ -561,9 +579,9 @@ static void *map_library(int fd, struct + dyn = ph->p_vaddr; + } else if (ph->p_type == PT_TLS) { + tls_image = ph->p_vaddr; +- dso->tls_align = ph->p_align; +- dso->tls_len = ph->p_filesz; +- dso->tls_size = ph->p_memsz; ++ dso->tls.align = ph->p_align; ++ dso->tls.len = ph->p_filesz; ++ dso->tls.size = ph->p_memsz; + } else if (ph->p_type == PT_GNU_RELRO) { + dso->relro_start = ph->p_vaddr & -PAGE_SIZE; + dso->relro_end = (ph->p_vaddr + ph->p_memsz) & -PAGE_SIZE; +@@ -593,7 +611,7 @@ static void *map_library(int fd, struct + ((ph->p_flags&PF_W) ? PROT_WRITE: 0) | + ((ph->p_flags&PF_X) ? PROT_EXEC : 0)); + map = mmap(0, ph->p_memsz + (ph->p_vaddr & PAGE_SIZE-1), +- prot, (prot&PROT_WRITE) ? 
MAP_PRIVATE : MAP_SHARED, ++ prot, MAP_PRIVATE, + fd, ph->p_offset & -PAGE_SIZE); + if (map == MAP_FAILED) { + unmap_library(dso); +@@ -604,6 +622,19 @@ static void *map_library(int fd, struct + dso->loadmap->segs[i].p_vaddr = ph->p_vaddr; + dso->loadmap->segs[i].p_memsz = ph->p_memsz; + i++; ++ if (prot & PROT_WRITE) { ++ size_t brk = (ph->p_vaddr & PAGE_SIZE-1) ++ + ph->p_filesz; ++ size_t pgbrk = brk + PAGE_SIZE-1 & -PAGE_SIZE; ++ size_t pgend = brk + ph->p_memsz - ph->p_filesz ++ + PAGE_SIZE-1 & -PAGE_SIZE; ++ if (pgend > pgbrk && mmap_fixed(map+pgbrk, ++ pgend-pgbrk, prot, ++ MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, ++ -1, off_start) == MAP_FAILED) ++ goto error; ++ memset(map + brk, 0, pgbrk-brk); ++ } + } + map = (void *)dso->loadmap->segs[0].addr; + map_len = 0; +@@ -618,7 +649,11 @@ static void *map_library(int fd, struct + * the length of the file. This is okay because we will not + * use the invalid part; we just need to reserve the right + * amount of virtual address space to map over later. */ +- map = mmap((void *)addr_min, map_len, prot, MAP_PRIVATE, fd, off_start); ++ map = DL_NOMMU_SUPPORT ++ ? 
mmap((void *)addr_min, map_len, PROT_READ|PROT_WRITE|PROT_EXEC, ++ MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) ++ : mmap((void *)addr_min, map_len, prot, ++ MAP_PRIVATE, fd, off_start); + if (map==MAP_FAILED) goto error; + dso->map = map; + dso->map_len = map_len; +@@ -643,7 +678,8 @@ static void *map_library(int fd, struct + dso->phentsize = eh->e_phentsize; + } + /* Reuse the existing mapping for the lowest-address LOAD */ +- if ((ph->p_vaddr & -PAGE_SIZE) == addr_min) continue; ++ if ((ph->p_vaddr & -PAGE_SIZE) == addr_min && !DL_NOMMU_SUPPORT) ++ continue; + this_min = ph->p_vaddr & -PAGE_SIZE; + this_max = ph->p_vaddr+ph->p_memsz+PAGE_SIZE-1 & -PAGE_SIZE; + off_start = ph->p_offset & -PAGE_SIZE; +@@ -670,7 +706,7 @@ static void *map_library(int fd, struct + done_mapping: + dso->base = base; + dso->dynv = laddr(dso, dyn); +- if (dso->tls_size) dso->tls_image = laddr(dso, tls_image); ++ if (dso->tls.size) dso->tls.image = laddr(dso, tls_image); + if (!runtime) reclaim_gaps(dso); + free(allocated_buf); + return map; +@@ -987,8 +1023,8 @@ static struct dso *load_library(const ch + * extended DTV capable of storing an additional slot for + * the newly-loaded DSO. */ + alloc_size = sizeof *p + strlen(pathname) + 1; +- if (runtime && temp_dso.tls_image) { +- size_t per_th = temp_dso.tls_size + temp_dso.tls_align ++ if (runtime && temp_dso.tls.image) { ++ size_t per_th = temp_dso.tls.size + temp_dso.tls.align + + sizeof(void *) * (tls_cnt+3); + n_th = libc.threads_minus_1 + 1; + if (n_th > SSIZE_MAX / per_th) alloc_size = SIZE_MAX; +@@ -1009,22 +1045,25 @@ static struct dso *load_library(const ch + strcpy(p->name, pathname); + /* Add a shortname only if name arg was not an explicit pathname. 
*/ + if (pathname != name) p->shortname = strrchr(p->name, '/')+1; +- if (p->tls_image) { ++ if (p->tls.image) { + p->tls_id = ++tls_cnt; +- tls_align = MAXP2(tls_align, p->tls_align); ++ tls_align = MAXP2(tls_align, p->tls.align); + #ifdef TLS_ABOVE_TP +- p->tls_offset = tls_offset + ( (tls_align-1) & +- -(tls_offset + (uintptr_t)p->tls_image) ); +- tls_offset += p->tls_size; ++ p->tls.offset = tls_offset + ( (tls_align-1) & ++ -(tls_offset + (uintptr_t)p->tls.image) ); ++ tls_offset += p->tls.size; + #else +- tls_offset += p->tls_size + p->tls_align - 1; +- tls_offset -= (tls_offset + (uintptr_t)p->tls_image) +- & (p->tls_align-1); +- p->tls_offset = tls_offset; ++ tls_offset += p->tls.size + p->tls.align - 1; ++ tls_offset -= (tls_offset + (uintptr_t)p->tls.image) ++ & (p->tls.align-1); ++ p->tls.offset = tls_offset; + #endif + p->new_dtv = (void *)(-sizeof(size_t) & + (uintptr_t)(p->name+strlen(p->name)+sizeof(size_t))); + p->new_tls = (void *)(p->new_dtv + n_th*(tls_cnt+1)); ++ if (tls_tail) tls_tail->next = &p->tls; ++ else libc.tls_head = &p->tls; ++ tls_tail = &p->tls; + } + + tail->next = p; +@@ -1151,7 +1190,7 @@ static void kernel_mapped_dso(struct dso + p->kernel_mapped = 1; + } + +-static void do_fini() ++void __libc_exit_fini() + { + struct dso *p; + size_t dyn[DYN_CNT]; +@@ -1214,53 +1253,8 @@ static void dl_debug_state(void) + + weak_alias(dl_debug_state, _dl_debug_state); + +-void __reset_tls() ++void __init_tls(size_t *auxv) + { +- pthread_t self = __pthread_self(); +- struct dso *p; +- for (p=head; p; p=p->next) { +- if (!p->tls_id || !self->dtv[p->tls_id]) continue; +- memcpy(self->dtv[p->tls_id], p->tls_image, p->tls_len); +- memset((char *)self->dtv[p->tls_id]+p->tls_len, 0, +- p->tls_size - p->tls_len); +- if (p->tls_id == (size_t)self->dtv[0]) break; +- } +-} +- +-void *__copy_tls(unsigned char *mem) +-{ +- pthread_t td; +- struct dso *p; +- void **dtv; +- +-#ifdef TLS_ABOVE_TP +- dtv = (void **)(mem + libc.tls_size) - (tls_cnt + 1); +- +- 
mem += -((uintptr_t)mem + sizeof(struct pthread)) & (tls_align-1); +- td = (pthread_t)mem; +- mem += sizeof(struct pthread); +- +- for (p=head; p; p=p->next) { +- if (!p->tls_id) continue; +- dtv[p->tls_id] = mem + p->tls_offset; +- memcpy(dtv[p->tls_id], p->tls_image, p->tls_len); +- } +-#else +- dtv = (void **)mem; +- +- mem += libc.tls_size - sizeof(struct pthread); +- mem -= (uintptr_t)mem & (tls_align-1); +- td = (pthread_t)mem; +- +- for (p=head; p; p=p->next) { +- if (!p->tls_id) continue; +- dtv[p->tls_id] = mem - p->tls_offset; +- memcpy(dtv[p->tls_id], p->tls_image, p->tls_len); +- } +-#endif +- dtv[0] = (void *)tls_cnt; +- td->dtv = td->dtv_copy = dtv; +- return td; + } + + __attribute__((__visibility__("hidden"))) +@@ -1286,7 +1280,7 @@ void *__tls_get_new(size_t *v) + /* Get new DTV space from new DSO if needed */ + if (v[0] > (size_t)self->dtv[0]) { + void **newdtv = p->new_dtv + +- (v[0]+1)*sizeof(void *)*a_fetch_add(&p->new_dtv_idx,1); ++ (v[0]+1)*a_fetch_add(&p->new_dtv_idx,1); + memcpy(newdtv, self->dtv, + ((size_t)self->dtv[0]+1) * sizeof(void *)); + newdtv[0] = (void *)v[0]; +@@ -1297,12 +1291,12 @@ void *__tls_get_new(size_t *v) + unsigned char *mem; + for (p=head; ; p=p->next) { + if (!p->tls_id || self->dtv[p->tls_id]) continue; +- mem = p->new_tls + (p->tls_size + p->tls_align) ++ mem = p->new_tls + (p->tls.size + p->tls.align) + * a_fetch_add(&p->new_tls_idx,1); +- mem += ((uintptr_t)p->tls_image - (uintptr_t)mem) +- & (p->tls_align-1); ++ mem += ((uintptr_t)p->tls.image - (uintptr_t)mem) ++ & (p->tls.align-1); + self->dtv[p->tls_id] = mem; +- memcpy(mem, p->tls_image, p->tls_len); ++ memcpy(mem, p->tls.image, p->tls.len); + if (p->tls_id == v[0]) break; + } + __restore_sigs(&set); +@@ -1311,6 +1305,8 @@ void *__tls_get_new(size_t *v) + + static void update_tls_size() + { ++ libc.tls_cnt = tls_cnt; ++ libc.tls_align = tls_align; + libc.tls_size = ALIGN( + (1+tls_cnt) * sizeof(void *) + + tls_offset + +@@ -1421,6 +1417,7 @@ _Noreturn void 
__dls3(size_t *sp) + * use during dynamic linking. If possible it will also serve as the + * thread pointer at runtime. */ + libc.tls_size = sizeof builtin_tls; ++ libc.tls_align = tls_align; + if (__init_tp(__copy_tls((void *)builtin_tls)) < 0) { + a_crash(); + } +@@ -1448,13 +1445,13 @@ _Noreturn void __dls3(size_t *sp) + interp_off = (size_t)phdr->p_vaddr; + else if (phdr->p_type == PT_TLS) { + tls_image = phdr->p_vaddr; +- app.tls_len = phdr->p_filesz; +- app.tls_size = phdr->p_memsz; +- app.tls_align = phdr->p_align; ++ app.tls.len = phdr->p_filesz; ++ app.tls.size = phdr->p_memsz; ++ app.tls.align = phdr->p_align; + } + } + if (DL_FDPIC) app.loadmap = app_loadmap; +- if (app.tls_size) app.tls_image = laddr(&app, tls_image); ++ if (app.tls.size) app.tls.image = laddr(&app, tls_image); + if (interp_off) ldso.name = laddr(&app, interp_off); + if ((aux[0] & (1UL<<AT_EXECFN)) + && strncmp((char *)aux[AT_EXECFN], "/proc/", 6)) +@@ -1523,19 +1520,20 @@ _Noreturn void __dls3(size_t *sp) + dprintf(1, "\t%s (%p)\n", ldso.name, ldso.base); + } + } +- if (app.tls_size) { ++ if (app.tls.size) { ++ libc.tls_head = &app.tls; + app.tls_id = tls_cnt = 1; + #ifdef TLS_ABOVE_TP +- app.tls_offset = 0; +- tls_offset = app.tls_size +- + ( -((uintptr_t)app.tls_image + app.tls_size) +- & (app.tls_align-1) ); ++ app.tls.offset = 0; ++ tls_offset = app.tls.size ++ + ( -((uintptr_t)app.tls.image + app.tls.size) ++ & (app.tls.align-1) ); + #else +- tls_offset = app.tls_offset = app.tls_size +- + ( -((uintptr_t)app.tls_image + app.tls_size) +- & (app.tls_align-1) ); ++ tls_offset = app.tls.offset = app.tls.size ++ + ( -((uintptr_t)app.tls.image + app.tls.size) ++ & (app.tls.align-1) ); + #endif +- tls_align = MAXP2(tls_align, app.tls_align); ++ tls_align = MAXP2(tls_align, app.tls.align); + } + app.global = 1; + decode_dyn(&app); +@@ -1635,8 +1633,6 @@ _Noreturn void __dls3(size_t *sp) + debug.state = 0; + _dl_debug_state(); + +- __init_libc(envp, argv[0]); +- atexit(do_fini); + errno = 
0; + + CRTJMP((void *)aux[AT_ENTRY], argv-1); +@@ -1646,6 +1642,7 @@ _Noreturn void __dls3(size_t *sp) + void *dlopen(const char *file, int mode) + { + struct dso *volatile p, *orig_tail, *next; ++ struct tls_module *orig_tls_tail; + size_t orig_tls_cnt, orig_tls_offset, orig_tls_align; + size_t i; + int cs; +@@ -1658,6 +1655,7 @@ void *dlopen(const char *file, int mode) + __inhibit_ptc(); + + p = 0; ++ orig_tls_tail = tls_tail; + orig_tls_cnt = tls_cnt; + orig_tls_offset = tls_offset; + orig_tls_align = tls_align; +@@ -1684,6 +1682,8 @@ void *dlopen(const char *file, int mode) + unmap_library(p); + free(p); + } ++ if (!orig_tls_tail) libc.tls_head = 0; ++ tls_tail = orig_tls_tail; + tls_cnt = orig_tls_cnt; + tls_offset = orig_tls_offset; + tls_align = orig_tls_align; +@@ -1900,7 +1900,7 @@ int dl_iterate_phdr(int(*callback)(struc + info.dlpi_adds = gencnt; + info.dlpi_subs = 0; + info.dlpi_tls_modid = current->tls_id; +- info.dlpi_tls_data = current->tls_image; ++ info.dlpi_tls_data = current->tls.image; + + ret = (callback)(&info, sizeof (info), data); + +--- a/src/locale/langinfo.c ++++ b/src/locale/langinfo.c +@@ -37,23 +37,23 @@ char *__nl_langinfo_l(nl_item item, loca + + switch (cat) { + case LC_NUMERIC: +- if (idx > 1) return NULL; ++ if (idx > 1) return ""; + str = c_numeric; + break; + case LC_TIME: +- if (idx > 0x31) return NULL; ++ if (idx > 0x31) return ""; + str = c_time; + break; + case LC_MONETARY: +- if (idx > 0) return NULL; ++ if (idx > 0) return ""; + str = ""; + break; + case LC_MESSAGES: +- if (idx > 3) return NULL; ++ if (idx > 3) return ""; + str = c_messages; + break; + default: +- return NULL; ++ return ""; + } + + for (; idx; idx--, str++) for (; *str; str++); +--- a/src/malloc/lite_malloc.c ++++ b/src/malloc/lite_malloc.c +@@ -8,7 +8,7 @@ + + void *__expand_heap(size_t *); + +-void *__simple_malloc(size_t n) ++static void *__simple_malloc(size_t n) + { + static char *cur, *end; + static volatile int lock[2]; +--- a/src/math/__rem_pio2.c 
++++ b/src/math/__rem_pio2.c +@@ -118,7 +118,7 @@ int __rem_pio2(double x, double *y) + if (ix < 0x413921fb) { /* |x| ~< 2^20*(pi/2), medium size */ + medium: + /* rint(x/(pi/2)), Assume round-to-nearest. */ +- fn = x*invpio2 + toint - toint; ++ fn = (double_t)x*invpio2 + toint - toint; + n = (int32_t)fn; + r = x - fn*pio2_1; + w = fn*pio2_1t; /* 1st round, good to 85 bits */ +--- a/src/math/__rem_pio2f.c ++++ b/src/math/__rem_pio2f.c +@@ -51,7 +51,7 @@ int __rem_pio2f(float x, double *y) + /* 25+53 bit pi is good enough for medium size */ + if (ix < 0x4dc90fdb) { /* |x| ~< 2^28*(pi/2), medium size */ + /* Use a specialized rint() to get fn. Assume round-to-nearest. */ +- fn = x*invpio2 + toint - toint; ++ fn = (double_t)x*invpio2 + toint - toint; + n = (int32_t)fn; + *y = x - fn*pio2_1 - fn*pio2_1t; + return n; +--- /dev/null ++++ b/src/math/arm/fabs.c +@@ -0,0 +1,15 @@ ++#include <math.h> ++ ++#if __ARM_PCS_VFP ++ ++double fabs(double x) ++{ ++ __asm__ ("vabs.f64 %P0, %P1" : "=w"(x) : "w"(x)); ++ return x; ++} ++ ++#else ++ ++#include "../fabs.c" ++ ++#endif +--- /dev/null ++++ b/src/math/arm/fabsf.c +@@ -0,0 +1,15 @@ ++#include <math.h> ++ ++#if __ARM_PCS_VFP ++ ++float fabsf(float x) ++{ ++ __asm__ ("vabs.f32 %0, %1" : "=t"(x) : "t"(x)); ++ return x; ++} ++ ++#else ++ ++#include "../fabsf.c" ++ ++#endif +--- /dev/null ++++ b/src/math/arm/sqrt.c +@@ -0,0 +1,15 @@ ++#include <math.h> ++ ++#if __VFP_FP__ && !__SOFTFP__ ++ ++double sqrt(double x) ++{ ++ __asm__ ("vsqrt.f64 %P0, %P1" : "=w"(x) : "w"(x)); ++ return x; ++} ++ ++#else ++ ++#include "../sqrt.c" ++ ++#endif +--- /dev/null ++++ b/src/math/arm/sqrtf.c +@@ -0,0 +1,15 @@ ++#include <math.h> ++ ++#if __VFP_FP__ && !__SOFTFP__ ++ ++float sqrtf(float x) ++{ ++ __asm__ ("vsqrt.f32 %0, %1" : "=t"(x) : "t"(x)); ++ return x; ++} ++ ++#else ++ ++#include "../sqrtf.c" ++ ++#endif +--- a/src/math/armebhf/fabs.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../armhf/fabs.s +--- a/src/math/armebhf/fabsf.sub ++++ /dev/null +@@ -1 
+0,0 @@ +-../armhf/fabsf.s +--- a/src/math/armebhf/sqrt.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../armhf/sqrt.s +--- a/src/math/armebhf/sqrtf.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../armhf/sqrtf.s +--- a/src/math/armhf/fabs.s ++++ /dev/null +@@ -1,7 +0,0 @@ +-.fpu vfp +-.text +-.global fabs +-.type fabs,%function +-fabs: +- vabs.f64 d0, d0 +- bx lr +--- a/src/math/armhf/fabs.sub ++++ /dev/null +@@ -1 +0,0 @@ +-fabs.s +--- a/src/math/armhf/fabsf.s ++++ /dev/null +@@ -1,7 +0,0 @@ +-.fpu vfp +-.text +-.global fabsf +-.type fabsf,%function +-fabsf: +- vabs.f32 s0, s0 +- bx lr +--- a/src/math/armhf/fabsf.sub ++++ /dev/null +@@ -1 +0,0 @@ +-fabsf.s +--- a/src/math/armhf/sqrt.s ++++ /dev/null +@@ -1,7 +0,0 @@ +-.fpu vfp +-.text +-.global sqrt +-.type sqrt,%function +-sqrt: +- vsqrt.f64 d0, d0 +- bx lr +--- a/src/math/armhf/sqrt.sub ++++ /dev/null +@@ -1 +0,0 @@ +-sqrt.s +--- a/src/math/armhf/sqrtf.s ++++ /dev/null +@@ -1,7 +0,0 @@ +-.fpu vfp +-.text +-.global sqrtf +-.type sqrtf,%function +-sqrtf: +- vsqrt.f32 s0, s0 +- bx lr +--- a/src/math/armhf/sqrtf.sub ++++ /dev/null +@@ -1 +0,0 @@ +-sqrtf.s +--- a/src/math/hypot.c ++++ b/src/math/hypot.c +@@ -12,10 +12,10 @@ static void sq(double_t *hi, double_t *l + { + double_t xh, xl, xc; + +- xc = x*SPLIT; ++ xc = (double_t)x*SPLIT; + xh = x - xc + xc; + xl = x - xh; +- *hi = x*x; ++ *hi = (double_t)x*x; + *lo = xh*xh - *hi + 2*xh*xl + xl*xl; + } + +--- a/src/mman/mremap.c ++++ b/src/mman/mremap.c +@@ -1,17 +1,31 @@ ++#define _GNU_SOURCE + #include <unistd.h> + #include <sys/mman.h> ++#include <errno.h> ++#include <stdint.h> + #include <stdarg.h> + #include "syscall.h" + #include "libc.h" + ++static void dummy(void) { } ++weak_alias(dummy, __vm_wait); ++ + void *__mremap(void *old_addr, size_t old_len, size_t new_len, int flags, ...) 
+ { + va_list ap; +- void *new_addr; +- +- va_start(ap, flags); +- new_addr = va_arg(ap, void *); +- va_end(ap); ++ void *new_addr = 0; ++ ++ if (new_len >= PTRDIFF_MAX) { ++ errno = ENOMEM; ++ return MAP_FAILED; ++ } ++ ++ if (flags & MREMAP_FIXED) { ++ __vm_wait(); ++ va_start(ap, flags); ++ new_addr = va_arg(ap, void *); ++ va_end(ap); ++ } + + return (void *)syscall(SYS_mremap, old_addr, old_len, new_len, flags, new_addr); + } +--- a/src/network/getifaddrs.c ++++ b/src/network/getifaddrs.c +@@ -162,13 +162,26 @@ static int netlink_msg_to_ifaddr(void *p + for (rta = NLMSG_RTA(h, sizeof(*ifa)); NLMSG_RTAOK(rta, h); rta = RTA_NEXT(rta)) { + switch (rta->rta_type) { + case IFA_ADDRESS: +- copy_addr(&ifs->ifa.ifa_addr, ifa->ifa_family, &ifs->addr, RTA_DATA(rta), RTA_DATALEN(rta), ifa->ifa_index); ++ /* If ifa_addr is already set we, received an IFA_LOCAL before ++ * so treat this as destination address */ ++ if (ifs->ifa.ifa_addr) ++ copy_addr(&ifs->ifa.ifa_dstaddr, ifa->ifa_family, &ifs->ifu, RTA_DATA(rta), RTA_DATALEN(rta), ifa->ifa_index); ++ else ++ copy_addr(&ifs->ifa.ifa_addr, ifa->ifa_family, &ifs->addr, RTA_DATA(rta), RTA_DATALEN(rta), ifa->ifa_index); + break; + case IFA_BROADCAST: +- /* For point-to-point links this is peer, but ifa_broadaddr +- * and ifa_dstaddr are union, so this works for both. */ + copy_addr(&ifs->ifa.ifa_broadaddr, ifa->ifa_family, &ifs->ifu, RTA_DATA(rta), RTA_DATALEN(rta), ifa->ifa_index); + break; ++ case IFA_LOCAL: ++ /* If ifa_addr is set and we get IFA_LOCAL, assume we have ++ * a point-to-point network. Move address to correct field. 
*/ ++ if (ifs->ifa.ifa_addr) { ++ ifs->ifu = ifs->addr; ++ ifs->ifa.ifa_dstaddr = &ifs->ifu.sa; ++ memset(&ifs->addr, 0, sizeof(ifs->addr)); ++ } ++ copy_addr(&ifs->ifa.ifa_addr, ifa->ifa_family, &ifs->addr, RTA_DATA(rta), RTA_DATALEN(rta), ifa->ifa_index); ++ break; + case IFA_LABEL: + if (RTA_DATALEN(rta) < sizeof(ifs->name)) { + memcpy(ifs->name, RTA_DATA(rta), RTA_DATALEN(rta)); +--- a/src/network/getnameinfo.c ++++ b/src/network/getnameinfo.c +@@ -135,13 +135,13 @@ int getnameinfo(const struct sockaddr *r + switch (af) { + case AF_INET: + a = (void *)&((struct sockaddr_in *)sa)->sin_addr; +- if (sl != sizeof(struct sockaddr_in)) return EAI_FAMILY; ++ if (sl < sizeof(struct sockaddr_in)) return EAI_FAMILY; + mkptr4(ptr, a); + scopeid = 0; + break; + case AF_INET6: + a = (void *)&((struct sockaddr_in6 *)sa)->sin6_addr; +- if (sl != sizeof(struct sockaddr_in6)) return EAI_FAMILY; ++ if (sl < sizeof(struct sockaddr_in6)) return EAI_FAMILY; + if (memcmp(a, "\0\0\0\0\0\0\0\0\0\0\xff\xff", 12)) + mkptr6(ptr, a); + else +--- a/src/network/if_nametoindex.c ++++ b/src/network/if_nametoindex.c +@@ -10,7 +10,7 @@ unsigned if_nametoindex(const char *name + struct ifreq ifr; + int fd, r; + +- if ((fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0)) < 0) return -1; ++ if ((fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0)) < 0) return 0; + strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name); + r = ioctl(fd, SIOCGIFINDEX, &ifr); + __syscall(SYS_close, fd); +--- a/src/network/lookup_name.c ++++ b/src/network/lookup_name.c +@@ -9,6 +9,7 @@ + #include <fcntl.h> + #include <unistd.h> + #include <pthread.h> ++#include <errno.h> + #include "lookup.h" + #include "stdio_impl.h" + #include "syscall.h" +@@ -51,7 +52,14 @@ static int name_from_hosts(struct addres + int cnt = 0; + unsigned char _buf[1032]; + FILE _f, *f = __fopen_rb_ca("/etc/hosts", &_f, _buf, sizeof _buf); +- if (!f) return 0; ++ if (!f) switch (errno) { ++ case ENOENT: ++ case ENOTDIR: ++ case EACCES: ++ return 0; ++ 
default: ++ return EAI_SYSTEM; ++ } + while (fgets(line, sizeof line, f) && cnt < MAXADDRS) { + char *p, *z; + +--- a/src/network/lookup_serv.c ++++ b/src/network/lookup_serv.c +@@ -4,6 +4,7 @@ + #include <ctype.h> + #include <string.h> + #include <fcntl.h> ++#include <errno.h> + #include "lookup.h" + #include "stdio_impl.h" + +@@ -69,7 +70,14 @@ int __lookup_serv(struct service buf[sta + + unsigned char _buf[1032]; + FILE _f, *f = __fopen_rb_ca("/etc/services", &_f, _buf, sizeof _buf); +- if (!f) return EAI_SERVICE; ++ if (!f) switch (errno) { ++ case ENOENT: ++ case ENOTDIR: ++ case EACCES: ++ return EAI_SERVICE; ++ default: ++ return EAI_SYSTEM; ++ } + + while (fgets(line, sizeof line, f) && cnt < MAXSERVS) { + if ((p=strchr(line, '#'))) *p++='\n', *p=0; +--- a/src/network/proto.c ++++ b/src/network/proto.c +@@ -9,21 +9,36 @@ static const unsigned char protos[] = { + "\001icmp\0" + "\002igmp\0" + "\003ggp\0" ++ "\004ipencap\0" ++ "\005st\0" + "\006tcp\0" ++ "\008egp\0" + "\014pup\0" + "\021udp\0" +- "\026idp\0" ++ "\024hmp\0" ++ "\026xns-idp\0" ++ "\033rdp\0" ++ "\035iso-tp4\0" ++ "\044xtp\0" ++ "\045ddp\0" ++ "\046idpr-cmtp\0" + "\051ipv6\0" + "\053ipv6-route\0" + "\054ipv6-frag\0" ++ "\055idrp\0" ++ "\056rsvp\0" + "\057gre\0" + "\062esp\0" + "\063ah\0" ++ "\071skip\0" + "\072ipv6-icmp\0" + "\073ipv6-nonxt\0" + "\074ipv6-opts\0" ++ "\111rspf\0" ++ "\121vmtp\0" + "\131ospf\0" + "\136ipip\0" ++ "\142encap\0" + "\147pim\0" + "\377raw" + }; +--- a/src/network/res_msend.c ++++ b/src/network/res_msend.c +@@ -54,7 +54,15 @@ int __res_msend(int nqueries, const unsi + + /* Get nameservers from resolv.conf, fallback to localhost */ + f = __fopen_rb_ca("/etc/resolv.conf", &_f, _buf, sizeof _buf); +- if (f) for (nns=0; nns<3 && fgets(line, sizeof line, f); ) { ++ if (!f) switch (errno) { ++ case ENOENT: ++ case ENOTDIR: ++ case EACCES: ++ goto no_resolv_conf; ++ default: ++ return -1; ++ } ++ for (nns=0; nns<3 && fgets(line, sizeof line, f); ) { + if (!strncmp(line, 
"options", 7) && isspace(line[7])) { + unsigned long x; + char *p, *z; +@@ -92,7 +100,8 @@ int __res_msend(int nqueries, const unsi + } + } + } +- if (f) __fclose_ca(f); ++ __fclose_ca(f); ++no_resolv_conf: + if (!nns) { + ns[0].sin.sin_family = AF_INET; + ns[0].sin.sin_port = htons(53); +--- a/src/search/tsearch_avl.c ++++ b/src/search/tsearch_avl.c +@@ -77,38 +77,45 @@ static struct node *find(struct node *n, + return find(n->right, k, cmp); + } + +-static struct node *insert(struct node **n, const void *k, +- int (*cmp)(const void *, const void *), int *new) ++static struct node *insert(struct node *n, const void *k, ++ int (*cmp)(const void *, const void *), struct node **found) + { +- struct node *r = *n; ++ struct node *r; + int c; + +- if (!r) { +- *n = r = malloc(sizeof **n); +- if (r) { +- r->key = k; +- r->left = r->right = 0; +- r->height = 1; ++ if (!n) { ++ n = malloc(sizeof *n); ++ if (n) { ++ n->key = k; ++ n->left = n->right = 0; ++ n->height = 1; + } +- *new = 1; +- return r; ++ *found = n; ++ return n; ++ } ++ c = cmp(k, n->key); ++ if (c == 0) { ++ *found = n; ++ return 0; ++ } ++ r = insert(c < 0 ? 
n->left : n->right, k, cmp, found); ++ if (r) { ++ if (c < 0) ++ n->left = r; ++ else ++ n->right = r; ++ r = balance(n); + } +- c = cmp(k, r->key); +- if (c == 0) +- return r; +- if (c < 0) +- r = insert(&r->left, k, cmp, new); +- else +- r = insert(&r->right, k, cmp, new); +- if (*new) +- *n = balance(*n); + return r; + } + +-static struct node *movr(struct node *n, struct node *r) { +- if (!n) +- return r; +- n->right = movr(n->right, r); ++static struct node *remove_rightmost(struct node *n, struct node **rightmost) ++{ ++ if (!n->right) { ++ *rightmost = n; ++ return n->left; ++ } ++ n->right = remove_rightmost(n->right, rightmost); + return balance(n); + } + +@@ -122,7 +129,13 @@ static struct node *remove(struct node * + c = cmp(k, (*n)->key); + if (c == 0) { + struct node *r = *n; +- *n = movr(r->left, r->right); ++ if (r->left) { ++ r->left = remove_rightmost(r->left, n); ++ (*n)->left = r->left; ++ (*n)->right = r->right; ++ *n = balance(*n); ++ } else ++ *n = r->right; + free(r); + return parent; + } +@@ -138,6 +151,8 @@ static struct node *remove(struct node * + void *tdelete(const void *restrict key, void **restrict rootp, + int(*compar)(const void *, const void *)) + { ++ if (!rootp) ++ return 0; + struct node *n = *rootp; + struct node *ret; + /* last argument is arbitrary non-null pointer +@@ -150,17 +165,21 @@ void *tdelete(const void *restrict key, + void *tfind(const void *key, void *const *rootp, + int(*compar)(const void *, const void *)) + { ++ if (!rootp) ++ return 0; + return find(*rootp, key, compar); + } + + void *tsearch(const void *key, void **rootp, + int (*compar)(const void *, const void *)) + { +- int new = 0; +- struct node *n = *rootp; ++ struct node *update; + struct node *ret; +- ret = insert(&n, key, compar, &new); +- *rootp = n; ++ if (!rootp) ++ return 0; ++ update = insert(*rootp, key, compar, &ret); ++ if (update) ++ *rootp = update; + return ret; + } + +--- a/src/setjmp/arm/longjmp.s ++++ b/src/setjmp/arm/longjmp.s +@@ -1,3 
+1,4 @@ ++.syntax unified + .global _longjmp + .global longjmp + .type _longjmp,%function +@@ -20,7 +21,11 @@ longjmp: + ldc p2, cr4, [ip], #48 + 2: tst r1,#0x40 + beq 2f +- .word 0xecbc8b10 /* vldmia ip!, {d8-d15} */ ++ .fpu vfp ++ vldmia ip!, {d8-d15} ++ .fpu softvfp ++ .eabi_attribute 10, 0 ++ .eabi_attribute 27, 0 + 2: tst r1,#0x200 + beq 3f + ldcl p1, cr10, [ip], #8 +@@ -29,9 +34,7 @@ longjmp: + ldcl p1, cr13, [ip], #8 + ldcl p1, cr14, [ip], #8 + ldcl p1, cr15, [ip], #8 +-3: tst lr,#1 +- moveq pc,lr +- bx lr ++3: bx lr + + .hidden __hwcap + 1: .word __hwcap-1b +--- a/src/setjmp/arm/setjmp.s ++++ b/src/setjmp/arm/setjmp.s +@@ -1,3 +1,4 @@ ++.syntax unified + .global __setjmp + .global _setjmp + .global setjmp +@@ -22,7 +23,11 @@ setjmp: + stc p2, cr4, [ip], #48 + 2: tst r1,#0x40 + beq 2f +- .word 0xecac8b10 /* vstmia ip!, {d8-d15} */ ++ .fpu vfp ++ vstmia ip!, {d8-d15} ++ .fpu softvfp ++ .eabi_attribute 10, 0 ++ .eabi_attribute 27, 0 + 2: tst r1,#0x200 + beq 3f + stcl p1, cr10, [ip], #8 +@@ -31,9 +36,7 @@ setjmp: + stcl p1, cr13, [ip], #8 + stcl p1, cr14, [ip], #8 + stcl p1, cr15, [ip], #8 +-3: tst lr,#1 +- moveq pc,lr +- bx lr ++3: bx lr + + .hidden __hwcap + 1: .word __hwcap-1b +--- a/src/setjmp/mips-sf/longjmp.s ++++ /dev/null +@@ -1,25 +0,0 @@ +-.set noreorder +- +-.global _longjmp +-.global longjmp +-.type _longjmp,@function +-.type longjmp,@function +-_longjmp: +-longjmp: +- move $2, $5 +- bne $2, $0, 1f +- nop +- addu $2, $2, 1 +-1: lw $ra, 0($4) +- lw $sp, 4($4) +- lw $16, 8($4) +- lw $17, 12($4) +- lw $18, 16($4) +- lw $19, 20($4) +- lw $20, 24($4) +- lw $21, 28($4) +- lw $22, 32($4) +- lw $23, 36($4) +- lw $30, 40($4) +- jr $ra +- lw $28, 44($4) +--- a/src/setjmp/mips-sf/longjmp.sub ++++ /dev/null +@@ -1 +0,0 @@ +-longjmp.s +--- a/src/setjmp/mips-sf/setjmp.s ++++ /dev/null +@@ -1,25 +0,0 @@ +-.set noreorder +- +-.global __setjmp +-.global _setjmp +-.global setjmp +-.type __setjmp,@function +-.type _setjmp,@function +-.type setjmp,@function +-__setjmp: 
+-_setjmp: +-setjmp: +- sw $ra, 0($4) +- sw $sp, 4($4) +- sw $16, 8($4) +- sw $17, 12($4) +- sw $18, 16($4) +- sw $19, 20($4) +- sw $20, 24($4) +- sw $21, 28($4) +- sw $22, 32($4) +- sw $23, 36($4) +- sw $30, 40($4) +- sw $28, 44($4) +- jr $ra +- li $2, 0 +--- a/src/setjmp/mips-sf/setjmp.sub ++++ /dev/null +@@ -1 +0,0 @@ +-setjmp.s +--- /dev/null ++++ b/src/setjmp/mips/longjmp.S +@@ -0,0 +1,40 @@ ++.set noreorder ++ ++.global _longjmp ++.global longjmp ++.type _longjmp,@function ++.type longjmp,@function ++_longjmp: ++longjmp: ++ move $2, $5 ++ bne $2, $0, 1f ++ nop ++ addu $2, $2, 1 ++1: ++#ifndef __mips_soft_float ++ lwc1 $20, 56($4) ++ lwc1 $21, 60($4) ++ lwc1 $22, 64($4) ++ lwc1 $23, 68($4) ++ lwc1 $24, 72($4) ++ lwc1 $25, 76($4) ++ lwc1 $26, 80($4) ++ lwc1 $27, 84($4) ++ lwc1 $28, 88($4) ++ lwc1 $29, 92($4) ++ lwc1 $30, 96($4) ++ lwc1 $31, 100($4) ++#endif ++ lw $ra, 0($4) ++ lw $sp, 4($4) ++ lw $16, 8($4) ++ lw $17, 12($4) ++ lw $18, 16($4) ++ lw $19, 20($4) ++ lw $20, 24($4) ++ lw $21, 28($4) ++ lw $22, 32($4) ++ lw $23, 36($4) ++ lw $30, 40($4) ++ jr $ra ++ lw $28, 44($4) +--- a/src/setjmp/mips/longjmp.s ++++ /dev/null +@@ -1,37 +0,0 @@ +-.set noreorder +- +-.global _longjmp +-.global longjmp +-.type _longjmp,@function +-.type longjmp,@function +-_longjmp: +-longjmp: +- move $2, $5 +- bne $2, $0, 1f +- nop +- addu $2, $2, 1 +-1: lwc1 $20, 56($4) +- lwc1 $21, 60($4) +- lwc1 $22, 64($4) +- lwc1 $23, 68($4) +- lwc1 $24, 72($4) +- lwc1 $25, 76($4) +- lwc1 $26, 80($4) +- lwc1 $27, 84($4) +- lwc1 $28, 88($4) +- lwc1 $29, 92($4) +- lwc1 $30, 96($4) +- lwc1 $31, 100($4) +- lw $ra, 0($4) +- lw $sp, 4($4) +- lw $16, 8($4) +- lw $17, 12($4) +- lw $18, 16($4) +- lw $19, 20($4) +- lw $20, 24($4) +- lw $21, 28($4) +- lw $22, 32($4) +- lw $23, 36($4) +- lw $30, 40($4) +- jr $ra +- lw $28, 44($4) +--- /dev/null ++++ b/src/setjmp/mips/setjmp.S +@@ -0,0 +1,39 @@ ++.set noreorder ++ ++.global __setjmp ++.global _setjmp ++.global setjmp ++.type __setjmp,@function ++.type 
_setjmp,@function ++.type setjmp,@function ++__setjmp: ++_setjmp: ++setjmp: ++ sw $ra, 0($4) ++ sw $sp, 4($4) ++ sw $16, 8($4) ++ sw $17, 12($4) ++ sw $18, 16($4) ++ sw $19, 20($4) ++ sw $20, 24($4) ++ sw $21, 28($4) ++ sw $22, 32($4) ++ sw $23, 36($4) ++ sw $30, 40($4) ++ sw $28, 44($4) ++#ifndef __mips_soft_float ++ swc1 $20, 56($4) ++ swc1 $21, 60($4) ++ swc1 $22, 64($4) ++ swc1 $23, 68($4) ++ swc1 $24, 72($4) ++ swc1 $25, 76($4) ++ swc1 $26, 80($4) ++ swc1 $27, 84($4) ++ swc1 $28, 88($4) ++ swc1 $29, 92($4) ++ swc1 $30, 96($4) ++ swc1 $31, 100($4) ++#endif ++ jr $ra ++ li $2, 0 +--- a/src/setjmp/mips/setjmp.s ++++ /dev/null +@@ -1,37 +0,0 @@ +-.set noreorder +- +-.global __setjmp +-.global _setjmp +-.global setjmp +-.type __setjmp,@function +-.type _setjmp,@function +-.type setjmp,@function +-__setjmp: +-_setjmp: +-setjmp: +- sw $ra, 0($4) +- sw $sp, 4($4) +- sw $16, 8($4) +- sw $17, 12($4) +- sw $18, 16($4) +- sw $19, 20($4) +- sw $20, 24($4) +- sw $21, 28($4) +- sw $22, 32($4) +- sw $23, 36($4) +- sw $30, 40($4) +- sw $28, 44($4) +- swc1 $20, 56($4) +- swc1 $21, 60($4) +- swc1 $22, 64($4) +- swc1 $23, 68($4) +- swc1 $24, 72($4) +- swc1 $25, 76($4) +- swc1 $26, 80($4) +- swc1 $27, 84($4) +- swc1 $28, 88($4) +- swc1 $29, 92($4) +- swc1 $30, 96($4) +- swc1 $31, 100($4) +- jr $ra +- li $2, 0 +--- a/src/setjmp/mipsel-sf/longjmp.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../mips-sf/longjmp.s +--- a/src/setjmp/mipsel-sf/setjmp.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../mips-sf/setjmp.s +--- a/src/setjmp/sh-nofpu/longjmp.s ++++ /dev/null +@@ -1,22 +0,0 @@ +-.global _longjmp +-.global longjmp +-.type _longjmp, @function +-.type longjmp, @function +-_longjmp: +-longjmp: +- mov.l @r4+, r8 +- mov.l @r4+, r9 +- mov.l @r4+, r10 +- mov.l @r4+, r11 +- mov.l @r4+, r12 +- mov.l @r4+, r13 +- mov.l @r4+, r14 +- mov.l @r4+, r15 +- lds.l @r4+, pr +- +- tst r5, r5 +- movt r0 +- add r5, r0 +- +- rts +- nop +--- a/src/setjmp/sh-nofpu/longjmp.sub ++++ /dev/null +@@ -1 +0,0 @@ +-longjmp.s +--- 
a/src/setjmp/sh-nofpu/setjmp.s ++++ /dev/null +@@ -1,24 +0,0 @@ +-.global ___setjmp +-.hidden ___setjmp +-.global __setjmp +-.global _setjmp +-.global setjmp +-.type __setjmp, @function +-.type _setjmp, @function +-.type setjmp, @function +-___setjmp: +-__setjmp: +-_setjmp: +-setjmp: +- add #36, r4 +- sts.l pr, @-r4 +- mov.l r15 @-r4 +- mov.l r14, @-r4 +- mov.l r13, @-r4 +- mov.l r12, @-r4 +- mov.l r11, @-r4 +- mov.l r10, @-r4 +- mov.l r9, @-r4 +- mov.l r8, @-r4 +- rts +- mov #0, r0 +--- a/src/setjmp/sh-nofpu/setjmp.sub ++++ /dev/null +@@ -1 +0,0 @@ +-setjmp.s +--- /dev/null ++++ b/src/setjmp/sh/longjmp.S +@@ -0,0 +1,28 @@ ++.global _longjmp ++.global longjmp ++.type _longjmp, @function ++.type longjmp, @function ++_longjmp: ++longjmp: ++ mov.l @r4+, r8 ++ mov.l @r4+, r9 ++ mov.l @r4+, r10 ++ mov.l @r4+, r11 ++ mov.l @r4+, r12 ++ mov.l @r4+, r13 ++ mov.l @r4+, r14 ++ mov.l @r4+, r15 ++ lds.l @r4+, pr ++#if __SH_FPU_ANY__ || __SH4__ ++ fmov.s @r4+, fr12 ++ fmov.s @r4+, fr13 ++ fmov.s @r4+, fr14 ++ fmov.s @r4+, fr15 ++#endif ++ ++ tst r5, r5 ++ movt r0 ++ add r5, r0 ++ ++ rts ++ nop +--- a/src/setjmp/sh/longjmp.s ++++ /dev/null +@@ -1,26 +0,0 @@ +-.global _longjmp +-.global longjmp +-.type _longjmp, @function +-.type longjmp, @function +-_longjmp: +-longjmp: +- mov.l @r4+, r8 +- mov.l @r4+, r9 +- mov.l @r4+, r10 +- mov.l @r4+, r11 +- mov.l @r4+, r12 +- mov.l @r4+, r13 +- mov.l @r4+, r14 +- mov.l @r4+, r15 +- lds.l @r4+, pr +- fmov.s @r4+, fr12 +- fmov.s @r4+, fr13 +- fmov.s @r4+, fr14 +- fmov.s @r4+, fr15 +- +- tst r5, r5 +- movt r0 +- add r5, r0 +- +- rts +- nop +--- /dev/null ++++ b/src/setjmp/sh/setjmp.S +@@ -0,0 +1,32 @@ ++.global ___setjmp ++.hidden ___setjmp ++.global __setjmp ++.global _setjmp ++.global setjmp ++.type __setjmp, @function ++.type _setjmp, @function ++.type setjmp, @function ++___setjmp: ++__setjmp: ++_setjmp: ++setjmp: ++#if __SH_FPU_ANY__ || __SH4__ ++ add #52, r4 ++ fmov.s fr15, @-r4 ++ fmov.s fr14, @-r4 ++ fmov.s fr13, @-r4 ++ fmov.s fr12, 
@-r4 ++#else ++ add #36, r4 ++#endif ++ sts.l pr, @-r4 ++ mov.l r15, @-r4 ++ mov.l r14, @-r4 ++ mov.l r13, @-r4 ++ mov.l r12, @-r4 ++ mov.l r11, @-r4 ++ mov.l r10, @-r4 ++ mov.l r9, @-r4 ++ mov.l r8, @-r4 ++ rts ++ mov #0, r0 +--- a/src/setjmp/sh/setjmp.s ++++ /dev/null +@@ -1,28 +0,0 @@ +-.global ___setjmp +-.hidden ___setjmp +-.global __setjmp +-.global _setjmp +-.global setjmp +-.type __setjmp, @function +-.type _setjmp, @function +-.type setjmp, @function +-___setjmp: +-__setjmp: +-_setjmp: +-setjmp: +- add #52, r4 +- fmov.s fr15, @-r4 +- fmov.s fr14, @-r4 +- fmov.s fr13, @-r4 +- fmov.s fr12, @-r4 +- sts.l pr, @-r4 +- mov.l r15, @-r4 +- mov.l r14, @-r4 +- mov.l r13, @-r4 +- mov.l r12, @-r4 +- mov.l r11, @-r4 +- mov.l r10, @-r4 +- mov.l r9, @-r4 +- mov.l r8, @-r4 +- rts +- mov #0, r0 +--- a/src/setjmp/sheb-nofpu/longjmp.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../sh-nofpu/longjmp.s +--- a/src/setjmp/sheb-nofpu/setjmp.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../sh-nofpu/setjmp.s +--- a/src/signal/arm/restore.s ++++ b/src/signal/arm/restore.s +@@ -1,3 +1,5 @@ ++.syntax unified ++ + .global __restore + .type __restore,%function + __restore: +--- a/src/signal/arm/sigsetjmp.s ++++ b/src/signal/arm/sigsetjmp.s +@@ -1,3 +1,4 @@ ++.syntax unified + .global sigsetjmp + .global __sigsetjmp + .type sigsetjmp,%function +--- a/src/signal/sigaction.c ++++ b/src/signal/sigaction.c +@@ -17,10 +17,6 @@ void __get_handler_set(sigset_t *set) + int __libc_sigaction(int sig, const struct sigaction *restrict sa, struct sigaction *restrict old) + { + struct k_sigaction ksa, ksa_old; +- if (sig >= (unsigned)_NSIG) { +- errno = EINVAL; +- return -1; +- } + if (sa) { + if ((uintptr_t)sa->sa_handler > 1UL) { + a_or_l(handler_set+(sig-1)/(8*sizeof(long)), +@@ -57,7 +53,7 @@ int __libc_sigaction(int sig, const stru + + int __sigaction(int sig, const struct sigaction *restrict sa, struct sigaction *restrict old) + { +- if (sig-32U < 3) { ++ if (sig-32U < 3 || sig-1U >= _NSIG-1) { + errno = EINVAL; + 
return -1; + } +--- a/src/signal/sigsetjmp_tail.c ++++ b/src/signal/sigsetjmp_tail.c +@@ -2,9 +2,7 @@ + #include <signal.h> + #include "syscall.h" + +-#ifdef SHARED + __attribute__((__visibility__("hidden"))) +-#endif + int __sigsetjmp_tail(sigjmp_buf jb, int ret) + { + void *p = jb->__ss; +--- a/src/stdio/getdelim.c ++++ b/src/stdio/getdelim.c +@@ -27,17 +27,18 @@ ssize_t getdelim(char **restrict s, size + for (;;) { + z = memchr(f->rpos, delim, f->rend - f->rpos); + k = z ? z - f->rpos + 1 : f->rend - f->rpos; +- if (i+k >= *n) { ++ if (i+k+1 >= *n) { + if (k >= SIZE_MAX/2-i) goto oom; +- *n = i+k+2; +- if (*n < SIZE_MAX/4) *n *= 2; +- tmp = realloc(*s, *n); ++ size_t m = i+k+2; ++ if (!z && m < SIZE_MAX/4) m += m/2; ++ tmp = realloc(*s, m); + if (!tmp) { +- *n = i+k+2; +- tmp = realloc(*s, *n); ++ m = i+k+2; ++ tmp = realloc(*s, m); + if (!tmp) goto oom; + } + *s = tmp; ++ *n = m; + } + memcpy(*s+i, f->rpos, k); + f->rpos += k; +--- /dev/null ++++ b/src/string/arm/__aeabi_memclr.c +@@ -0,0 +1,9 @@ ++#include <string.h> ++#include "libc.h" ++ ++void __aeabi_memclr(void *dest, size_t n) ++{ ++ memset(dest, 0, n); ++} ++weak_alias(__aeabi_memclr, __aeabi_memclr4); ++weak_alias(__aeabi_memclr, __aeabi_memclr8); +--- /dev/null ++++ b/src/string/arm/__aeabi_memcpy.c +@@ -0,0 +1,9 @@ ++#include <string.h> ++#include "libc.h" ++ ++void __aeabi_memcpy(void *restrict dest, const void *restrict src, size_t n) ++{ ++ memcpy(dest, src, n); ++} ++weak_alias(__aeabi_memcpy, __aeabi_memcpy4); ++weak_alias(__aeabi_memcpy, __aeabi_memcpy8); +--- /dev/null ++++ b/src/string/arm/__aeabi_memmove.c +@@ -0,0 +1,9 @@ ++#include <string.h> ++#include "libc.h" ++ ++void __aeabi_memmove(void *dest, const void *src, size_t n) ++{ ++ memmove(dest, src, n); ++} ++weak_alias(__aeabi_memmove, __aeabi_memmove4); ++weak_alias(__aeabi_memmove, __aeabi_memmove8); +--- /dev/null ++++ b/src/string/arm/__aeabi_memset.c +@@ -0,0 +1,9 @@ ++#include <string.h> ++#include "libc.h" ++ ++void 
__aeabi_memset(void *dest, size_t n, int c) ++{ ++ memset(dest, c, n); ++} ++weak_alias(__aeabi_memset, __aeabi_memset4); ++weak_alias(__aeabi_memset, __aeabi_memset8); +--- /dev/null ++++ b/src/string/arm/memcpy.c +@@ -0,0 +1,3 @@ ++#if __ARMEB__ ++#include "../memcpy.c" ++#endif +--- /dev/null ++++ b/src/string/arm/memcpy_le.S +@@ -0,0 +1,383 @@ ++#ifndef __ARMEB__ ++ ++/* ++ * Copyright (C) 2008 The Android Open Source Project ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ++ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ++ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ++ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ++ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS ++ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED ++ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT ++ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ */ ++ ++ ++/* ++ * Optimized memcpy() for ARM. ++ * ++ * note that memcpy() always returns the destination pointer, ++ * so we have to preserve R0. 
++ */ ++ ++/* ++ * This file has been modified from the original for use in musl libc. ++ * The main changes are: addition of .type memcpy,%function to make the ++ * code safely callable from thumb mode, adjusting the return ++ * instructions to be compatible with pre-thumb ARM cpus, and removal ++ * of prefetch code that is not compatible with older cpus. ++ */ ++ ++.syntax unified ++ ++.global memcpy ++.type memcpy,%function ++memcpy: ++ /* The stack must always be 64-bits aligned to be compliant with the ++ * ARM ABI. Since we have to save R0, we might as well save R4 ++ * which we can use for better pipelining of the reads below ++ */ ++ .fnstart ++ .save {r0, r4, lr} ++ stmfd sp!, {r0, r4, lr} ++ /* Making room for r5-r11 which will be spilled later */ ++ .pad #28 ++ sub sp, sp, #28 ++ ++ /* it simplifies things to take care of len<4 early */ ++ cmp r2, #4 ++ blo copy_last_3_and_return ++ ++ /* compute the offset to align the source ++ * offset = (4-(src&3))&3 = -src & 3 ++ */ ++ rsb r3, r1, #0 ++ ands r3, r3, #3 ++ beq src_aligned ++ ++ /* align source to 32 bits. We need to insert 2 instructions between ++ * a ldr[b|h] and str[b|h] because byte and half-word instructions ++ * stall 2 cycles. ++ */ ++ movs r12, r3, lsl #31 ++ sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */ ++ ldrbmi r3, [r1], #1 ++ ldrbcs r4, [r1], #1 ++ ldrbcs r12,[r1], #1 ++ strbmi r3, [r0], #1 ++ strbcs r4, [r0], #1 ++ strbcs r12,[r0], #1 ++ ++src_aligned: ++ ++ /* see if src and dst are aligned together (congruent) */ ++ eor r12, r0, r1 ++ tst r12, #3 ++ bne non_congruent ++ ++ /* Use post-incriment mode for stm to spill r5-r11 to reserved stack ++ * frame. Don't update sp. 
++ */ ++ stmea sp, {r5-r11} ++ ++ /* align the destination to a cache-line */ ++ rsb r3, r0, #0 ++ ands r3, r3, #0x1C ++ beq congruent_aligned32 ++ cmp r3, r2 ++ andhi r3, r2, #0x1C ++ ++ /* conditionnaly copies 0 to 7 words (length in r3) */ ++ movs r12, r3, lsl #28 ++ ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */ ++ ldmmi r1!, {r8, r9} /* 8 bytes */ ++ stmcs r0!, {r4, r5, r6, r7} ++ stmmi r0!, {r8, r9} ++ tst r3, #0x4 ++ ldrne r10,[r1], #4 /* 4 bytes */ ++ strne r10,[r0], #4 ++ sub r2, r2, r3 ++ ++congruent_aligned32: ++ /* ++ * here source is aligned to 32 bytes. ++ */ ++ ++cached_aligned32: ++ subs r2, r2, #32 ++ blo less_than_32_left ++ ++ /* ++ * We preload a cache-line up to 64 bytes ahead. On the 926, this will ++ * stall only until the requested world is fetched, but the linefill ++ * continues in the the background. ++ * While the linefill is going, we write our previous cache-line ++ * into the write-buffer (which should have some free space). ++ * When the linefill is done, the writebuffer will ++ * start dumping its content into memory ++ * ++ * While all this is going, we then load a full cache line into ++ * 8 registers, this cache line should be in the cache by now ++ * (or partly in the cache). ++ * ++ * This code should work well regardless of the source/dest alignment. ++ * ++ */ ++ ++ /* Align the preload register to a cache-line because the cpu does ++ * "critical word first" (the first word requested is loaded first). ++ */ ++ @ bic r12, r1, #0x1F ++ @ add r12, r12, #64 ++ ++1: ldmia r1!, { r4-r11 } ++ subs r2, r2, #32 ++ ++ /* ++ * NOTE: if r12 is more than 64 ahead of r1, the following ldrhi ++ * for ARM9 preload will not be safely guarded by the preceding subs. ++ * When it is safely guarded the only possibility to have SIGSEGV here ++ * is because the caller overstates the length. 
++ */ ++ @ ldrhi r3, [r12], #32 /* cheap ARM9 preload */ ++ stmia r0!, { r4-r11 } ++ bhs 1b ++ ++ add r2, r2, #32 ++ ++less_than_32_left: ++ /* ++ * less than 32 bytes left at this point (length in r2) ++ */ ++ ++ /* skip all this if there is nothing to do, which should ++ * be a common case (if not executed the code below takes ++ * about 16 cycles) ++ */ ++ tst r2, #0x1F ++ beq 1f ++ ++ /* conditionnaly copies 0 to 31 bytes */ ++ movs r12, r2, lsl #28 ++ ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */ ++ ldmmi r1!, {r8, r9} /* 8 bytes */ ++ stmcs r0!, {r4, r5, r6, r7} ++ stmmi r0!, {r8, r9} ++ movs r12, r2, lsl #30 ++ ldrcs r3, [r1], #4 /* 4 bytes */ ++ ldrhmi r4, [r1], #2 /* 2 bytes */ ++ strcs r3, [r0], #4 ++ strhmi r4, [r0], #2 ++ tst r2, #0x1 ++ ldrbne r3, [r1] /* last byte */ ++ strbne r3, [r0] ++ ++ /* we're done! restore everything and return */ ++1: ldmfd sp!, {r5-r11} ++ ldmfd sp!, {r0, r4, lr} ++ bx lr ++ ++ /********************************************************************/ ++ ++non_congruent: ++ /* ++ * here source is aligned to 4 bytes ++ * but destination is not. ++ * ++ * in the code below r2 is the number of bytes read ++ * (the number of bytes written is always smaller, because we have ++ * partial words in the shift queue) ++ */ ++ cmp r2, #4 ++ blo copy_last_3_and_return ++ ++ /* Use post-incriment mode for stm to spill r5-r11 to reserved stack ++ * frame. Don't update sp. 
++ */ ++ stmea sp, {r5-r11} ++ ++ /* compute shifts needed to align src to dest */ ++ rsb r5, r0, #0 ++ and r5, r5, #3 /* r5 = # bytes in partial words */ ++ mov r12, r5, lsl #3 /* r12 = right */ ++ rsb lr, r12, #32 /* lr = left */ ++ ++ /* read the first word */ ++ ldr r3, [r1], #4 ++ sub r2, r2, #4 ++ ++ /* write a partial word (0 to 3 bytes), such that destination ++ * becomes aligned to 32 bits (r5 = nb of words to copy for alignment) ++ */ ++ movs r5, r5, lsl #31 ++ strbmi r3, [r0], #1 ++ movmi r3, r3, lsr #8 ++ strbcs r3, [r0], #1 ++ movcs r3, r3, lsr #8 ++ strbcs r3, [r0], #1 ++ movcs r3, r3, lsr #8 ++ ++ cmp r2, #4 ++ blo partial_word_tail ++ ++ /* Align destination to 32 bytes (cache line boundary) */ ++1: tst r0, #0x1c ++ beq 2f ++ ldr r5, [r1], #4 ++ sub r2, r2, #4 ++ orr r4, r3, r5, lsl lr ++ mov r3, r5, lsr r12 ++ str r4, [r0], #4 ++ cmp r2, #4 ++ bhs 1b ++ blo partial_word_tail ++ ++ /* copy 32 bytes at a time */ ++2: subs r2, r2, #32 ++ blo less_than_thirtytwo ++ ++ /* Use immediate mode for the shifts, because there is an extra cycle ++ * for register shifts, which could account for up to 50% of ++ * performance hit. 
++ */ ++ ++ cmp r12, #24 ++ beq loop24 ++ cmp r12, #8 ++ beq loop8 ++ ++loop16: ++ ldr r12, [r1], #4 ++1: mov r4, r12 ++ ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} ++ subs r2, r2, #32 ++ ldrhs r12, [r1], #4 ++ orr r3, r3, r4, lsl #16 ++ mov r4, r4, lsr #16 ++ orr r4, r4, r5, lsl #16 ++ mov r5, r5, lsr #16 ++ orr r5, r5, r6, lsl #16 ++ mov r6, r6, lsr #16 ++ orr r6, r6, r7, lsl #16 ++ mov r7, r7, lsr #16 ++ orr r7, r7, r8, lsl #16 ++ mov r8, r8, lsr #16 ++ orr r8, r8, r9, lsl #16 ++ mov r9, r9, lsr #16 ++ orr r9, r9, r10, lsl #16 ++ mov r10, r10, lsr #16 ++ orr r10, r10, r11, lsl #16 ++ stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} ++ mov r3, r11, lsr #16 ++ bhs 1b ++ b less_than_thirtytwo ++ ++loop8: ++ ldr r12, [r1], #4 ++1: mov r4, r12 ++ ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} ++ subs r2, r2, #32 ++ ldrhs r12, [r1], #4 ++ orr r3, r3, r4, lsl #24 ++ mov r4, r4, lsr #8 ++ orr r4, r4, r5, lsl #24 ++ mov r5, r5, lsr #8 ++ orr r5, r5, r6, lsl #24 ++ mov r6, r6, lsr #8 ++ orr r6, r6, r7, lsl #24 ++ mov r7, r7, lsr #8 ++ orr r7, r7, r8, lsl #24 ++ mov r8, r8, lsr #8 ++ orr r8, r8, r9, lsl #24 ++ mov r9, r9, lsr #8 ++ orr r9, r9, r10, lsl #24 ++ mov r10, r10, lsr #8 ++ orr r10, r10, r11, lsl #24 ++ stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} ++ mov r3, r11, lsr #8 ++ bhs 1b ++ b less_than_thirtytwo ++ ++loop24: ++ ldr r12, [r1], #4 ++1: mov r4, r12 ++ ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} ++ subs r2, r2, #32 ++ ldrhs r12, [r1], #4 ++ orr r3, r3, r4, lsl #8 ++ mov r4, r4, lsr #24 ++ orr r4, r4, r5, lsl #8 ++ mov r5, r5, lsr #24 ++ orr r5, r5, r6, lsl #8 ++ mov r6, r6, lsr #24 ++ orr r6, r6, r7, lsl #8 ++ mov r7, r7, lsr #24 ++ orr r7, r7, r8, lsl #8 ++ mov r8, r8, lsr #24 ++ orr r8, r8, r9, lsl #8 ++ mov r9, r9, lsr #24 ++ orr r9, r9, r10, lsl #8 ++ mov r10, r10, lsr #24 ++ orr r10, r10, r11, lsl #8 ++ stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} ++ mov r3, r11, lsr #24 ++ bhs 1b ++ ++less_than_thirtytwo: ++ /* copy the last 0 to 31 bytes of the source */ ++ rsb r12, lr, #32 /* we corrupted r12, 
recompute it */ ++ add r2, r2, #32 ++ cmp r2, #4 ++ blo partial_word_tail ++ ++1: ldr r5, [r1], #4 ++ sub r2, r2, #4 ++ orr r4, r3, r5, lsl lr ++ mov r3, r5, lsr r12 ++ str r4, [r0], #4 ++ cmp r2, #4 ++ bhs 1b ++ ++partial_word_tail: ++ /* we have a partial word in the input buffer */ ++ movs r5, lr, lsl #(31-3) ++ strbmi r3, [r0], #1 ++ movmi r3, r3, lsr #8 ++ strbcs r3, [r0], #1 ++ movcs r3, r3, lsr #8 ++ strbcs r3, [r0], #1 ++ ++ /* Refill spilled registers from the stack. Don't update sp. */ ++ ldmfd sp, {r5-r11} ++ ++copy_last_3_and_return: ++ movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */ ++ ldrbmi r2, [r1], #1 ++ ldrbcs r3, [r1], #1 ++ ldrbcs r12,[r1] ++ strbmi r2, [r0], #1 ++ strbcs r3, [r0], #1 ++ strbcs r12,[r0] ++ ++ /* we're done! restore sp and spilled registers and return */ ++ add sp, sp, #28 ++ ldmfd sp!, {r0, r4, lr} ++ bx lr ++ ++#endif +--- a/src/string/armel/memcpy.s ++++ /dev/null +@@ -1,381 +0,0 @@ +-/* +- * Copyright (C) 2008 The Android Open Source Project +- * All rights reserved. +- * +- * Redistribution and use in source and binary forms, with or without +- * modification, are permitted provided that the following conditions +- * are met: +- * * Redistributions of source code must retain the above copyright +- * notice, this list of conditions and the following disclaimer. +- * * Redistributions in binary form must reproduce the above copyright +- * notice, this list of conditions and the following disclaimer in +- * the documentation and/or other materials provided with the +- * distribution. +- * +- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +- * SUCH DAMAGE. +- */ +- +- +-/* +- * Optimized memcpy() for ARM. +- * +- * note that memcpy() always returns the destination pointer, +- * so we have to preserve R0. +- */ +- +-/* +- * This file has been modified from the original for use in musl libc. +- * The main changes are: addition of .type memcpy,%function to make the +- * code safely callable from thumb mode, adjusting the return +- * instructions to be compatible with pre-thumb ARM cpus, and removal +- * of prefetch code that is not compatible with older cpus. +- */ +- +-.global memcpy +-.type memcpy,%function +-memcpy: +- /* The stack must always be 64-bits aligned to be compliant with the +- * ARM ABI. Since we have to save R0, we might as well save R4 +- * which we can use for better pipelining of the reads below +- */ +- .fnstart +- .save {r0, r4, lr} +- stmfd sp!, {r0, r4, lr} +- /* Making room for r5-r11 which will be spilled later */ +- .pad #28 +- sub sp, sp, #28 +- +- /* it simplifies things to take care of len<4 early */ +- cmp r2, #4 +- blo copy_last_3_and_return +- +- /* compute the offset to align the source +- * offset = (4-(src&3))&3 = -src & 3 +- */ +- rsb r3, r1, #0 +- ands r3, r3, #3 +- beq src_aligned +- +- /* align source to 32 bits. We need to insert 2 instructions between +- * a ldr[b|h] and str[b|h] because byte and half-word instructions +- * stall 2 cycles. 
+- */ +- movs r12, r3, lsl #31 +- sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */ +- .word 0x44d13001 /* ldrbmi r3, [r1], #1 */ +- .word 0x24d14001 /* ldrbcs r4, [r1], #1 */ +- .word 0x24d1c001 /* ldrbcs r12,[r1], #1 */ +- .word 0x44c03001 /* strbmi r3, [r0], #1 */ +- .word 0x24c04001 /* strbcs r4, [r0], #1 */ +- .word 0x24c0c001 /* strbcs r12,[r0], #1 */ +- +-src_aligned: +- +- /* see if src and dst are aligned together (congruent) */ +- eor r12, r0, r1 +- tst r12, #3 +- bne non_congruent +- +- /* Use post-incriment mode for stm to spill r5-r11 to reserved stack +- * frame. Don't update sp. +- */ +- stmea sp, {r5-r11} +- +- /* align the destination to a cache-line */ +- rsb r3, r0, #0 +- ands r3, r3, #0x1C +- beq congruent_aligned32 +- cmp r3, r2 +- andhi r3, r2, #0x1C +- +- /* conditionnaly copies 0 to 7 words (length in r3) */ +- movs r12, r3, lsl #28 +- ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */ +- ldmmi r1!, {r8, r9} /* 8 bytes */ +- stmcs r0!, {r4, r5, r6, r7} +- stmmi r0!, {r8, r9} +- tst r3, #0x4 +- ldrne r10,[r1], #4 /* 4 bytes */ +- strne r10,[r0], #4 +- sub r2, r2, r3 +- +-congruent_aligned32: +- /* +- * here source is aligned to 32 bytes. +- */ +- +-cached_aligned32: +- subs r2, r2, #32 +- blo less_than_32_left +- +- /* +- * We preload a cache-line up to 64 bytes ahead. On the 926, this will +- * stall only until the requested world is fetched, but the linefill +- * continues in the the background. +- * While the linefill is going, we write our previous cache-line +- * into the write-buffer (which should have some free space). +- * When the linefill is done, the writebuffer will +- * start dumping its content into memory +- * +- * While all this is going, we then load a full cache line into +- * 8 registers, this cache line should be in the cache by now +- * (or partly in the cache). +- * +- * This code should work well regardless of the source/dest alignment. 
+- * +- */ +- +- /* Align the preload register to a cache-line because the cpu does +- * "critical word first" (the first word requested is loaded first). +- */ +- @ bic r12, r1, #0x1F +- @ add r12, r12, #64 +- +-1: ldmia r1!, { r4-r11 } +- subs r2, r2, #32 +- +- /* +- * NOTE: if r12 is more than 64 ahead of r1, the following ldrhi +- * for ARM9 preload will not be safely guarded by the preceding subs. +- * When it is safely guarded the only possibility to have SIGSEGV here +- * is because the caller overstates the length. +- */ +- @ ldrhi r3, [r12], #32 /* cheap ARM9 preload */ +- stmia r0!, { r4-r11 } +- bhs 1b +- +- add r2, r2, #32 +- +-less_than_32_left: +- /* +- * less than 32 bytes left at this point (length in r2) +- */ +- +- /* skip all this if there is nothing to do, which should +- * be a common case (if not executed the code below takes +- * about 16 cycles) +- */ +- tst r2, #0x1F +- beq 1f +- +- /* conditionnaly copies 0 to 31 bytes */ +- movs r12, r2, lsl #28 +- ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */ +- ldmmi r1!, {r8, r9} /* 8 bytes */ +- stmcs r0!, {r4, r5, r6, r7} +- stmmi r0!, {r8, r9} +- movs r12, r2, lsl #30 +- ldrcs r3, [r1], #4 /* 4 bytes */ +- .word 0x40d140b2 /* ldrhmi r4, [r1], #2 */ /* 2 bytes */ +- strcs r3, [r0], #4 +- .word 0x40c040b2 /* strhmi r4, [r0], #2 */ +- tst r2, #0x1 +- .word 0x15d13000 /* ldrbne r3, [r1] */ /* last byte */ +- .word 0x15c03000 /* strbne r3, [r0] */ +- +- /* we're done! restore everything and return */ +-1: ldmfd sp!, {r5-r11} +- ldmfd sp!, {r0, r4, lr} +- tst lr, #1 +- moveq pc, lr +- bx lr +- +- /********************************************************************/ +- +-non_congruent: +- /* +- * here source is aligned to 4 bytes +- * but destination is not. 
+- * +- * in the code below r2 is the number of bytes read +- * (the number of bytes written is always smaller, because we have +- * partial words in the shift queue) +- */ +- cmp r2, #4 +- blo copy_last_3_and_return +- +- /* Use post-incriment mode for stm to spill r5-r11 to reserved stack +- * frame. Don't update sp. +- */ +- stmea sp, {r5-r11} +- +- /* compute shifts needed to align src to dest */ +- rsb r5, r0, #0 +- and r5, r5, #3 /* r5 = # bytes in partial words */ +- mov r12, r5, lsl #3 /* r12 = right */ +- rsb lr, r12, #32 /* lr = left */ +- +- /* read the first word */ +- ldr r3, [r1], #4 +- sub r2, r2, #4 +- +- /* write a partial word (0 to 3 bytes), such that destination +- * becomes aligned to 32 bits (r5 = nb of words to copy for alignment) +- */ +- movs r5, r5, lsl #31 +- .word 0x44c03001 /* strbmi r3, [r0], #1 */ +- movmi r3, r3, lsr #8 +- .word 0x24c03001 /* strbcs r3, [r0], #1 */ +- movcs r3, r3, lsr #8 +- .word 0x24c03001 /* strbcs r3, [r0], #1 */ +- movcs r3, r3, lsr #8 +- +- cmp r2, #4 +- blo partial_word_tail +- +- /* Align destination to 32 bytes (cache line boundary) */ +-1: tst r0, #0x1c +- beq 2f +- ldr r5, [r1], #4 +- sub r2, r2, #4 +- orr r4, r3, r5, lsl lr +- mov r3, r5, lsr r12 +- str r4, [r0], #4 +- cmp r2, #4 +- bhs 1b +- blo partial_word_tail +- +- /* copy 32 bytes at a time */ +-2: subs r2, r2, #32 +- blo less_than_thirtytwo +- +- /* Use immediate mode for the shifts, because there is an extra cycle +- * for register shifts, which could account for up to 50% of +- * performance hit. 
+- */ +- +- cmp r12, #24 +- beq loop24 +- cmp r12, #8 +- beq loop8 +- +-loop16: +- ldr r12, [r1], #4 +-1: mov r4, r12 +- ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} +- subs r2, r2, #32 +- ldrhs r12, [r1], #4 +- orr r3, r3, r4, lsl #16 +- mov r4, r4, lsr #16 +- orr r4, r4, r5, lsl #16 +- mov r5, r5, lsr #16 +- orr r5, r5, r6, lsl #16 +- mov r6, r6, lsr #16 +- orr r6, r6, r7, lsl #16 +- mov r7, r7, lsr #16 +- orr r7, r7, r8, lsl #16 +- mov r8, r8, lsr #16 +- orr r8, r8, r9, lsl #16 +- mov r9, r9, lsr #16 +- orr r9, r9, r10, lsl #16 +- mov r10, r10, lsr #16 +- orr r10, r10, r11, lsl #16 +- stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} +- mov r3, r11, lsr #16 +- bhs 1b +- b less_than_thirtytwo +- +-loop8: +- ldr r12, [r1], #4 +-1: mov r4, r12 +- ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} +- subs r2, r2, #32 +- ldrhs r12, [r1], #4 +- orr r3, r3, r4, lsl #24 +- mov r4, r4, lsr #8 +- orr r4, r4, r5, lsl #24 +- mov r5, r5, lsr #8 +- orr r5, r5, r6, lsl #24 +- mov r6, r6, lsr #8 +- orr r6, r6, r7, lsl #24 +- mov r7, r7, lsr #8 +- orr r7, r7, r8, lsl #24 +- mov r8, r8, lsr #8 +- orr r8, r8, r9, lsl #24 +- mov r9, r9, lsr #8 +- orr r9, r9, r10, lsl #24 +- mov r10, r10, lsr #8 +- orr r10, r10, r11, lsl #24 +- stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} +- mov r3, r11, lsr #8 +- bhs 1b +- b less_than_thirtytwo +- +-loop24: +- ldr r12, [r1], #4 +-1: mov r4, r12 +- ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} +- subs r2, r2, #32 +- ldrhs r12, [r1], #4 +- orr r3, r3, r4, lsl #8 +- mov r4, r4, lsr #24 +- orr r4, r4, r5, lsl #8 +- mov r5, r5, lsr #24 +- orr r5, r5, r6, lsl #8 +- mov r6, r6, lsr #24 +- orr r6, r6, r7, lsl #8 +- mov r7, r7, lsr #24 +- orr r7, r7, r8, lsl #8 +- mov r8, r8, lsr #24 +- orr r8, r8, r9, lsl #8 +- mov r9, r9, lsr #24 +- orr r9, r9, r10, lsl #8 +- mov r10, r10, lsr #24 +- orr r10, r10, r11, lsl #8 +- stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} +- mov r3, r11, lsr #24 +- bhs 1b +- +-less_than_thirtytwo: +- /* copy the last 0 to 31 bytes of the source */ +- rsb r12, lr, #32 /* we corrupted r12, 
recompute it */ +- add r2, r2, #32 +- cmp r2, #4 +- blo partial_word_tail +- +-1: ldr r5, [r1], #4 +- sub r2, r2, #4 +- orr r4, r3, r5, lsl lr +- mov r3, r5, lsr r12 +- str r4, [r0], #4 +- cmp r2, #4 +- bhs 1b +- +-partial_word_tail: +- /* we have a partial word in the input buffer */ +- movs r5, lr, lsl #(31-3) +- .word 0x44c03001 /* strbmi r3, [r0], #1 */ +- movmi r3, r3, lsr #8 +- .word 0x24c03001 /* strbcs r3, [r0], #1 */ +- movcs r3, r3, lsr #8 +- .word 0x24c03001 /* strbcs r3, [r0], #1 */ +- +- /* Refill spilled registers from the stack. Don't update sp. */ +- ldmfd sp, {r5-r11} +- +-copy_last_3_and_return: +- movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */ +- .word 0x44d12001 /* ldrbmi r2, [r1], #1 */ +- .word 0x24d13001 /* ldrbcs r3, [r1], #1 */ +- .word 0x25d1c000 /* ldrbcs r12,[r1] */ +- .word 0x44c02001 /* strbmi r2, [r0], #1 */ +- .word 0x24c03001 /* strbcs r3, [r0], #1 */ +- .word 0x25c0c000 /* strbcs r12,[r0] */ +- +- /* we're done! restore sp and spilled registers and return */ +- add sp, sp, #28 +- ldmfd sp!, {r0, r4, lr} +- tst lr, #1 +- moveq pc, lr +- bx lr +--- a/src/string/armel/memcpy.sub ++++ /dev/null +@@ -1 +0,0 @@ +-memcpy.s +--- a/src/string/armhf/memcpy.sub ++++ /dev/null +@@ -1 +0,0 @@ +-../armel/memcpy.s +--- a/src/thread/__syscall_cp.c ++++ b/src/thread/__syscall_cp.c +@@ -1,9 +1,7 @@ + #include "pthread_impl.h" + #include "syscall.h" + +-#ifdef SHARED + __attribute__((__visibility__("hidden"))) +-#endif + long __syscall_cp_c(); + + static long sccp(syscall_arg_t nr, +--- a/src/thread/__tls_get_addr.c ++++ b/src/thread/__tls_get_addr.c +@@ -1,16 +1,16 @@ + #include <stddef.h> + #include "pthread_impl.h" ++#include "libc.h" ++ ++__attribute__((__visibility__("hidden"))) ++void *__tls_get_new(size_t *); + + void *__tls_get_addr(size_t *v) + { + pthread_t self = __pthread_self(); +-#ifdef SHARED +- __attribute__((__visibility__("hidden"))) +- void *__tls_get_new(size_t *); + if (v[0]<=(size_t)self->dtv[0]) + return (char 
*)self->dtv[v[0]]+v[1]+DTP_OFFSET; + return __tls_get_new(v); +-#else +- return (char *)self->dtv[1]+v[1]+DTP_OFFSET; +-#endif + } ++ ++weak_alias(__tls_get_addr, __tls_get_new); +--- a/src/thread/aarch64/syscall_cp.s ++++ b/src/thread/aarch64/syscall_cp.s +@@ -17,7 +17,7 @@ + __syscall_cp_asm: + __cp_begin: + ldr w0,[x0] +- cbnz w0,1f ++ cbnz w0,__cp_cancel + mov x8,x1 + mov x0,x2 + mov x1,x3 +@@ -28,6 +28,5 @@ __cp_begin: + svc 0 + __cp_end: + ret +- +- // cbnz might not be able to jump far enough +-1: b __cancel ++__cp_cancel: ++ b __cancel +--- /dev/null ++++ b/src/thread/arm/__set_thread_area.c +@@ -0,0 +1,49 @@ ++#include <stdint.h> ++#include <elf.h> ++#include "pthread_impl.h" ++#include "libc.h" ++ ++#define HWCAP_TLS (1 << 15) ++ ++extern const unsigned char __attribute__((__visibility__("hidden"))) ++ __a_barrier_dummy[], __a_barrier_oldkuser[], ++ __a_barrier_v6[], __a_barrier_v7[], ++ __a_cas_dummy[], __a_cas_v6[], __a_cas_v7[], ++ __a_gettp_dummy[]; ++ ++#define __a_barrier_kuser 0xffff0fa0 ++#define __a_cas_kuser 0xffff0fc0 ++#define __a_gettp_kuser 0xffff0fe0 ++ ++extern uintptr_t __attribute__((__visibility__("hidden"))) ++ __a_barrier_ptr, __a_cas_ptr, __a_gettp_ptr; ++ ++#define SET(op,ver) (__a_##op##_ptr = \ ++ (uintptr_t)__a_##op##_##ver - (uintptr_t)__a_##op##_dummy) ++ ++int __set_thread_area(void *p) ++{ ++#if !__ARM_ARCH_7A__ && !__ARM_ARCH_7R__ && __ARM_ARCH < 7 ++ if (__hwcap & HWCAP_TLS) { ++ size_t *aux; ++ SET(cas, v7); ++ SET(barrier, v7); ++ for (aux=libc.auxv; *aux; aux+=2) { ++ if (*aux != AT_PLATFORM) continue; ++ const char *s = (void *)aux[1]; ++ if (s[0]!='v' || s[1]!='6' || s[2]-'0'<10u) break; ++ SET(cas, v6); ++ SET(barrier, v6); ++ break; ++ } ++ } else { ++ int ver = *(int *)0xffff0ffc; ++ SET(gettp, kuser); ++ SET(cas, kuser); ++ SET(barrier, kuser); ++ if (ver < 2) a_crash(); ++ if (ver < 3) SET(barrier, oldkuser); ++ } ++#endif ++ return __syscall(0xf0005, p); ++} +--- a/src/thread/arm/__set_thread_area.s ++++ 
/dev/null +@@ -1 +0,0 @@ +-/* Replaced by C code in arch/arm/src */ +--- a/src/thread/arm/__unmapself.s ++++ b/src/thread/arm/__unmapself.s +@@ -1,3 +1,4 @@ ++.syntax unified + .text + .global __unmapself + .type __unmapself,%function +--- /dev/null ++++ b/src/thread/arm/atomics.s +@@ -0,0 +1,111 @@ ++.syntax unified ++.text ++ ++.global __a_barrier ++.hidden __a_barrier ++.type __a_barrier,%function ++__a_barrier: ++ ldr ip,1f ++ ldr ip,[pc,ip] ++ add pc,pc,ip ++1: .word __a_barrier_ptr-1b ++.global __a_barrier_dummy ++.hidden __a_barrier_dummy ++__a_barrier_dummy: ++ bx lr ++.global __a_barrier_oldkuser ++.hidden __a_barrier_oldkuser ++__a_barrier_oldkuser: ++ push {r0,r1,r2,r3,ip,lr} ++ mov r1,r0 ++ mov r2,sp ++ ldr ip,=0xffff0fc0 ++ mov lr,pc ++ mov pc,ip ++ pop {r0,r1,r2,r3,ip,lr} ++ bx lr ++.global __a_barrier_v6 ++.hidden __a_barrier_v6 ++__a_barrier_v6: ++ mcr p15,0,r0,c7,c10,5 ++ bx lr ++.global __a_barrier_v7 ++.hidden __a_barrier_v7 ++__a_barrier_v7: ++ .word 0xf57ff05b /* dmb ish */ ++ bx lr ++ ++.global __a_cas ++.hidden __a_cas ++.type __a_cas,%function ++__a_cas: ++ ldr ip,1f ++ ldr ip,[pc,ip] ++ add pc,pc,ip ++1: .word __a_cas_ptr-1b ++.global __a_cas_dummy ++.hidden __a_cas_dummy ++__a_cas_dummy: ++ mov r3,r0 ++ ldr r0,[r2] ++ subs r0,r3,r0 ++ streq r1,[r2] ++ bx lr ++.global __a_cas_v6 ++.hidden __a_cas_v6 ++__a_cas_v6: ++ mov r3,r0 ++ mcr p15,0,r0,c7,c10,5 ++1: .word 0xe1920f9f /* ldrex r0,[r2] */ ++ subs r0,r3,r0 ++ .word 0x01820f91 /* strexeq r0,r1,[r2] */ ++ teqeq r0,#1 ++ beq 1b ++ mcr p15,0,r0,c7,c10,5 ++ bx lr ++.global __a_cas_v7 ++.hidden __a_cas_v7 ++__a_cas_v7: ++ mov r3,r0 ++ .word 0xf57ff05b /* dmb ish */ ++1: .word 0xe1920f9f /* ldrex r0,[r2] */ ++ subs r0,r3,r0 ++ .word 0x01820f91 /* strexeq r0,r1,[r2] */ ++ teqeq r0,#1 ++ beq 1b ++ .word 0xf57ff05b /* dmb ish */ ++ bx lr ++ ++.global __aeabi_read_tp ++.type __aeabi_read_tp,%function ++__aeabi_read_tp: ++ ++.global __a_gettp ++.hidden __a_gettp ++.type __a_gettp,%function 
++__a_gettp: ++ ldr r0,1f ++ ldr r0,[pc,r0] ++ add pc,pc,r0 ++1: .word __a_gettp_ptr-1b ++.global __a_gettp_dummy ++.hidden __a_gettp_dummy ++__a_gettp_dummy: ++ mrc p15,0,r0,c13,c0,3 ++ bx lr ++ ++.data ++.global __a_barrier_ptr ++.hidden __a_barrier_ptr ++__a_barrier_ptr: ++ .word 0 ++ ++.global __a_cas_ptr ++.hidden __a_cas_ptr ++__a_cas_ptr: ++ .word 0 ++ ++.global __a_gettp_ptr ++.hidden __a_gettp_ptr ++__a_gettp_ptr: ++ .word 0 +--- a/src/thread/arm/clone.s ++++ b/src/thread/arm/clone.s +@@ -1,3 +1,4 @@ ++.syntax unified + .text + .global __clone + .type __clone,%function +@@ -15,8 +16,6 @@ __clone: + tst r0,r0 + beq 1f + ldmfd sp!,{r4,r5,r6,r7} +- tst lr,#1 +- moveq pc,lr + bx lr + + 1: mov r0,r6 +--- a/src/thread/arm/syscall_cp.s ++++ b/src/thread/arm/syscall_cp.s +@@ -1,3 +1,4 @@ ++.syntax unified + .global __cp_begin + .hidden __cp_begin + .global __cp_end +@@ -22,8 +23,6 @@ __cp_begin: + svc 0 + __cp_end: + ldmfd sp!,{r4,r5,r6,r7,lr} +- tst lr,#1 +- moveq pc,lr + bx lr + __cp_cancel: + ldmfd sp!,{r4,r5,r6,r7,lr} +--- a/src/thread/microblaze/syscall_cp.s ++++ b/src/thread/microblaze/syscall_cp.s +@@ -11,7 +11,7 @@ + __syscall_cp_asm: + __cp_begin: + lwi r5, r5, 0 +- bnei r5, __cancel ++ bnei r5, __cp_cancel + addi r12, r6, 0 + add r5, r7, r0 + add r6, r8, r0 +@@ -23,3 +23,5 @@ __cp_begin: + __cp_end: + rtsd r15, 8 + nop ++__cp_cancel: ++ bri __cancel +--- a/src/thread/or1k/syscall_cp.s ++++ b/src/thread/or1k/syscall_cp.s +@@ -12,7 +12,7 @@ __syscall_cp_asm: + __cp_begin: + l.lwz r3, 0(r3) + l.sfeqi r3, 0 +- l.bnf __cancel ++ l.bnf __cp_cancel + l.ori r11, r4, 0 + l.ori r3, r5, 0 + l.ori r4, r6, 0 +@@ -24,3 +24,6 @@ __cp_begin: + __cp_end: + l.jr r9 + l.nop ++__cp_cancel: ++ l.j __cancel ++ l.nop +--- a/src/thread/powerpc/syscall_cp.s ++++ b/src/thread/powerpc/syscall_cp.s +@@ -38,7 +38,7 @@ __cp_begin: + cmpwi cr7, 0, 0 #compare r0 with 0, store result in cr7. 
+ beq+ cr7, 1f #jump to label 1 if r0 was 0 + +- b __cancel #else call cancel ++ b __cp_cancel #else call cancel + 1: + #ok, the cancel flag was not set + # syscall: number goes to r0, the rest 3-8 +@@ -55,3 +55,5 @@ __cp_end: + #else negate result. + neg 3, 3 + blr ++__cp_cancel: ++ b __cancel +--- a/src/thread/pthread_cancel.c ++++ b/src/thread/pthread_cancel.c +@@ -1,12 +1,11 @@ ++#define _GNU_SOURCE + #include <string.h> + #include "pthread_impl.h" + #include "syscall.h" + #include "libc.h" + +-#ifdef SHARED + __attribute__((__visibility__("hidden"))) +-#endif +-long __cancel(), __cp_cancel(), __syscall_cp_asm(), __syscall_cp_c(); ++long __cancel(), __syscall_cp_asm(), __syscall_cp_c(); + + long __cancel() + { +@@ -17,12 +16,6 @@ long __cancel() + return -ECANCELED; + } + +-/* If __syscall_cp_asm has adjusted the stack pointer, it must provide a +- * definition of __cp_cancel to undo those adjustments and call __cancel. +- * Otherwise, __cancel provides a definition for __cp_cancel. 
*/ +- +-weak_alias(__cancel, __cp_cancel); +- + long __syscall_cp_asm(volatile void *, syscall_arg_t, + syscall_arg_t, syscall_arg_t, syscall_arg_t, + syscall_arg_t, syscall_arg_t, syscall_arg_t); +@@ -52,24 +45,22 @@ static void _sigaddset(sigset_t *set, in + set->__bits[s/8/sizeof *set->__bits] |= 1UL<<(s&8*sizeof *set->__bits-1); + } + +-#ifdef SHARED + __attribute__((__visibility__("hidden"))) +-#endif +-extern const char __cp_begin[1], __cp_end[1]; ++extern const char __cp_begin[1], __cp_end[1], __cp_cancel[1]; + + static void cancel_handler(int sig, siginfo_t *si, void *ctx) + { + pthread_t self = __pthread_self(); + ucontext_t *uc = ctx; +- const char *ip = ((char **)&uc->uc_mcontext)[CANCEL_REG_IP]; ++ uintptr_t pc = uc->uc_mcontext.MC_PC; + + a_barrier(); + if (!self->cancel || self->canceldisable == PTHREAD_CANCEL_DISABLE) return; + + _sigaddset(&uc->uc_sigmask, SIGCANCEL); + +- if (self->cancelasync || ip >= __cp_begin && ip < __cp_end) { +- ((char **)&uc->uc_mcontext)[CANCEL_REG_IP] = (char *)__cp_cancel; ++ if (self->cancelasync || pc >= (uintptr_t)__cp_begin && pc < (uintptr_t)__cp_end) { ++ uc->uc_mcontext.MC_PC = (uintptr_t)__cp_cancel; + return; + } + +--- /dev/null ++++ b/src/thread/sh/__set_thread_area.c +@@ -0,0 +1,40 @@ ++#include "pthread_impl.h" ++#include "libc.h" ++#include <elf.h> ++ ++/* Also perform sh-specific init */ ++ ++#define CPU_HAS_LLSC 0x0040 ++#define CPU_HAS_CAS_L 0x0400 ++ ++__attribute__((__visibility__("hidden"))) ++extern const char __sh_cas_gusa[], __sh_cas_llsc[], __sh_cas_imask[], __sh_cas_cas_l[]; ++ ++__attribute__((__visibility__("hidden"))) ++const void *__sh_cas_ptr; ++ ++__attribute__((__visibility__("hidden"))) ++unsigned __sh_nommu; ++ ++int __set_thread_area(void *p) ++{ ++ size_t *aux; ++ __asm__ __volatile__ ( "ldc %0, gbr" : : "r"(p) : "memory" ); ++#ifndef __SH4A__ ++ __sh_cas_ptr = __sh_cas_gusa; ++#if !defined(__SH3__) && !defined(__SH4__) ++ for (aux=libc.auxv; *aux; aux+=2) { ++ if (*aux != AT_PLATFORM) 
continue; ++ const char *s = (void *)aux[1]; ++ if (s[0]!='s' || s[1]!='h' || s[2]!='2' || s[3]-'0'<10u) break; ++ __sh_cas_ptr = __sh_cas_imask; ++ __sh_nommu = 1; ++ } ++#endif ++ if (__hwcap & CPU_HAS_CAS_L) ++ __sh_cas_ptr = __sh_cas_cas_l; ++ else if (__hwcap & CPU_HAS_LLSC) ++ __sh_cas_ptr = __sh_cas_llsc; ++#endif ++ return 0; ++} +--- /dev/null ++++ b/src/thread/sh/atomics.s +@@ -0,0 +1,65 @@ ++/* Contract for all versions is same as cas.l r2,r3,@r0 ++ * pr and r1 are also clobbered (by jsr & r1 as temp). ++ * r0,r2,r4-r15 must be preserved. ++ * r3 contains result (==r2 iff cas succeeded). */ ++ ++ .align 2 ++.global __sh_cas_gusa ++.hidden __sh_cas_gusa ++__sh_cas_gusa: ++ mov.l r5,@-r15 ++ mov.l r4,@-r15 ++ mov r0,r4 ++ mova 1f,r0 ++ mov r15,r1 ++ mov #(0f-1f),r15 ++0: mov.l @r4,r5 ++ cmp/eq r5,r2 ++ bf 1f ++ mov.l r3,@r4 ++1: mov r1,r15 ++ mov r5,r3 ++ mov r4,r0 ++ mov.l @r15+,r4 ++ rts ++ mov.l @r15+,r5 ++ ++.global __sh_cas_llsc ++.hidden __sh_cas_llsc ++__sh_cas_llsc: ++ mov r0,r1 ++ synco ++0: movli.l @r1,r0 ++ cmp/eq r0,r2 ++ bf 1f ++ mov r3,r0 ++ movco.l r0,@r1 ++ bf 0b ++ mov r2,r0 ++1: synco ++ mov r0,r3 ++ rts ++ mov r1,r0 ++ ++.global __sh_cas_imask ++.hidden __sh_cas_imask ++__sh_cas_imask: ++ mov r0,r1 ++ stc sr,r0 ++ mov.l r0,@-r15 ++ or #0xf0,r0 ++ ldc r0,sr ++ mov.l @r1,r0 ++ cmp/eq r0,r2 ++ bf 1f ++ mov.l r3,@r1 ++1: ldc.l @r15+,sr ++ mov r0,r3 ++ rts ++ mov r1,r0 ++ ++.global __sh_cas_cas_l ++.hidden __sh_cas_cas_l ++__sh_cas_cas_l: ++ rts ++ .word 0x2323 /* cas.l r2,r3,@r0 */ +--- a/src/thread/sh/syscall_cp.s ++++ b/src/thread/sh/syscall_cp.s +@@ -14,17 +14,8 @@ __syscall_cp_asm: + __cp_begin: + mov.l @r4, r4 + tst r4, r4 +- bt 2f +- +- mov.l L1, r0 +- braf r0 +- nop +-1: +- +-.align 2 +-L1: .long __cancel@PLT-(1b-.) 
+- +-2: mov r5, r3 ++ bf __cp_cancel ++ mov r5, r3 + mov r6, r4 + mov r7, r5 + mov.l @r15, r6 +@@ -43,3 +34,12 @@ __cp_end: + + rts + nop ++ ++__cp_cancel: ++ mov.l 2f, r0 ++ braf r0 ++ nop ++1: ++ ++.align 2 ++2: .long __cancel@PCREL-(1b-.) +--- a/src/thread/x32/syscall_cp.s ++++ b/src/thread/x32/syscall_cp.s +@@ -14,7 +14,7 @@ __syscall_cp_internal: + __cp_begin: + mov (%rdi),%eax + test %eax,%eax +- jnz __cancel ++ jnz __cp_cancel + mov %rdi,%r11 + mov %rsi,%rax + mov %rdx,%rdi +@@ -27,3 +27,5 @@ __cp_begin: + syscall + __cp_end: + ret ++__cp_cancel: ++ jmp __cancel +--- a/src/thread/x86_64/syscall_cp.s ++++ b/src/thread/x86_64/syscall_cp.s +@@ -14,7 +14,7 @@ __syscall_cp_asm: + __cp_begin: + mov (%rdi),%eax + test %eax,%eax +- jnz __cancel ++ jnz __cp_cancel + mov %rdi,%r11 + mov %rsi,%rax + mov %rdx,%rdi +@@ -27,3 +27,5 @@ __cp_begin: + syscall + __cp_end: + ret ++__cp_cancel: ++ jmp __cancel |