This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.



RE: Support __aeabi_memcpy, __aeabi_memcpy4 and __aeabi_memcpy8 routines in the arm backend.


Hi,

This patch resubmits the previous aeabi_memcpy patch.

Implementations of the __aeabi_memcpy* functions are allowed to corrupt only the integer core registers permitted to be corrupted by the [AAPCS] (r0-r3, ip, lr, and CPSR). However, the memcpy implementation for ARMv7-A uses the FP registers (d0-d7), so we can't simply provide these aeabi functions as aliases of memcpy.
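
For context, a compiler targeting the ARM EABI may lower a block copy whose alignment it can prove into one of these helpers instead of a plain memcpy call, which is why the RTABI pins down exactly which registers they may clobber. A hypothetical example (the struct and function names are illustrative):

  struct pair { int a, b; } dst, src;   /* naturally 4-byte aligned */

  void copy_pair (void)
  {
    dst = src;   /* may be lowered to: __aeabi_memcpy4 (&dst, &src, sizeof dst); */
  }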

In this patch, the earlier memcpy code, which does not touch the FP registers, is reused to implement __aeabi_memcpy for ARMv7-A, so it is less efficient than the current memcpy in memcpy-armv7a.S. The __aeabi_memcpy4 and __aeabi_memcpy8 entry points are also implemented for ARMv7-A: they set up the same stack frame as __aeabi_memcpy and branch into its body past the alignment checks that their contracts make unnecessary, so all three entry points share the same copy loops and return path.

For all other targets, we simply use aliases to provide __aeabi_memcpy*.

No "make check" regressions on ARMv7-A configured with "-march=armv7-a -mfpu=neon -mfloat-abi=hard".
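
Beyond "make check", a minimal standalone smoke test might look like the following sketch (the buffers and alignment attributes are illustrative, not part of the patch):

  #include <string.h>
  #include <assert.h>

  extern void __aeabi_memcpy  (void *, const void *, size_t);
  extern void __aeabi_memcpy8 (void *, const void *, size_t);

  int main (void)
  {
    static char src[64] __attribute__ ((aligned (8)));
    static char dst[64] __attribute__ ((aligned (8)));
    int i;

    for (i = 0; i < 64; i++)
      src[i] = (char) i;

    /* Unaligned copy must still produce the right bytes.  */
    __aeabi_memcpy (dst + 1, src + 3, 61);
    assert (memcmp (dst + 1, src + 3, 61) == 0);

    /* 8-byte-aligned copy via the specialized entry point.  */
    memset (dst, 0, sizeof dst);
    __aeabi_memcpy8 (dst, src, 64);
    assert (memcmp (dst, src, 64) == 0);

    return 0;
  }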

Patch also attached for convenience.

Thanks and Best Regards,
Hale Wang

=======================================
newlib/ChangeLog:

2014-07-29  Hale Wang  <hale.wang@arm.com>
	
	* libc/machine/arm/aeabi_memcpy.c: New file.
	* libc/machine/arm/aeabi_memcpy-armv7a.S: New file.
	* libc/machine/arm/Makefile.am: Add dependencies.
	* libc/machine/arm/Makefile.in: Regenerated.

=======================================
diff --git a/newlib/libc/machine/arm/Makefile.am b/newlib/libc/machine/arm/Makefile.am
index fb33926..939bf93 100644
--- a/newlib/libc/machine/arm/Makefile.am
+++ b/newlib/libc/machine/arm/Makefile.am
@@ -10,7 +10,8 @@ noinst_LIBRARIES = lib.a
 
 lib_a_SOURCES = setjmp.S access.c strlen.c strcmp.S strcpy.c \
 	        memcpy.S memcpy-stub.c memchr-stub.c memchr.S \
-		strlen.c strlen-armv7.S
+		strlen.c strlen-armv7.S aeabi_memcpy.c \
+		aeabi_memcpy-armv7a.S
 lib_a_CCASFLAGS=$(AM_CCASFLAGS)
 lib_a_CFLAGS = $(AM_CFLAGS)
 
diff --git a/newlib/libc/machine/arm/Makefile.in b/newlib/libc/machine/arm/Makefile.in
index 1ccfac5..c94e803 100644
--- a/newlib/libc/machine/arm/Makefile.in
+++ b/newlib/libc/machine/arm/Makefile.in
@@ -74,7 +74,8 @@ am_lib_a_OBJECTS = lib_a-setjmp.$(OBJEXT) lib_a-access.$(OBJEXT) \
 	lib_a-strcpy.$(OBJEXT) lib_a-memcpy.$(OBJEXT) \
 	lib_a-memcpy-stub.$(OBJEXT) lib_a-memchr-stub.$(OBJEXT) \
 	lib_a-memchr.$(OBJEXT) lib_a-strlen.$(OBJEXT) \
-	lib_a-strlen-armv7.$(OBJEXT)
+	lib_a-strlen-armv7.$(OBJEXT) lib_a-aeabi_memcpy.$(OBJEXT) \
+	lib_a-aeabi_memcpy-armv7a.$(OBJEXT)
 lib_a_OBJECTS = $(am_lib_a_OBJECTS)
 DEFAULT_INCLUDES = -I.@am__isrc@
 depcomp =
@@ -202,7 +203,8 @@ AM_CCASFLAGS = $(INCLUDES)
 noinst_LIBRARIES = lib.a
 lib_a_SOURCES = setjmp.S access.c strlen.c strcmp.S strcpy.c \
 	        memcpy.S memcpy-stub.c memchr-stub.c memchr.S \
-		strlen.c strlen-armv7.S
+		strlen.c strlen-armv7.S aeabi_memcpy.c \
+		aeabi_memcpy-armv7a.S
 
 lib_a_CCASFLAGS = $(AM_CCASFLAGS)
 lib_a_CFLAGS = $(AM_CFLAGS)
@@ -300,6 +302,12 @@ lib_a-strlen-armv7.o: strlen-armv7.S
 lib_a-strlen-armv7.obj: strlen-armv7.S
 	$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-strlen-armv7.obj `if test -f 'strlen-armv7.S'; then $(CYGPATH_W) 'strlen-armv7.S'; else $(CYGPATH_W) '$(srcdir)/strlen-armv7.S'; fi`
 
+lib_a-aeabi_memcpy-armv7a.o: aeabi_memcpy-armv7a.S
+	$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-aeabi_memcpy-armv7a.o `test -f 'aeabi_memcpy-armv7a.S' || echo '$(srcdir)/'`aeabi_memcpy-armv7a.S
+
+lib_a-aeabi_memcpy-armv7a.obj: aeabi_memcpy-armv7a.S
+	$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-aeabi_memcpy-armv7a.obj `if test -f 'aeabi_memcpy-armv7a.S'; then $(CYGPATH_W) 'aeabi_memcpy-armv7a.S'; else $(CYGPATH_W) '$(srcdir)/aeabi_memcpy-armv7a.S'; fi`
+
 .c.o:
 	$(COMPILE) -c $<
 
@@ -336,6 +344,12 @@ lib_a-memchr-stub.o: memchr-stub.c
 lib_a-memchr-stub.obj: memchr-stub.c
 	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CFLAGS) $(CFLAGS) -c -o lib_a-memchr-stub.obj `if test -f 'memchr-stub.c'; then $(CYGPATH_W) 'memchr-stub.c'; else $(CYGPATH_W) '$(srcdir)/memchr-stub.c'; fi`
 
+lib_a-aeabi_memcpy.o: aeabi_memcpy.c
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CFLAGS) $(CFLAGS) -c -o lib_a-aeabi_memcpy.o `test -f 'aeabi_memcpy.c' || echo '$(srcdir)/'`aeabi_memcpy.c
+
+lib_a-aeabi_memcpy.obj: aeabi_memcpy.c
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CFLAGS) $(CFLAGS) -c -o lib_a-aeabi_memcpy.obj `if test -f 'aeabi_memcpy.c'; then $(CYGPATH_W) 'aeabi_memcpy.c'; else $(CYGPATH_W) '$(srcdir)/aeabi_memcpy.c'; fi`
+
 ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
 	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
 	unique=`for i in $$list; do \
diff --git a/newlib/libc/machine/arm/aeabi_memcpy-armv7a.S b/newlib/libc/machine/arm/aeabi_memcpy-armv7a.S
new file mode 100644
index 0000000..53e3330
--- /dev/null
+++ b/newlib/libc/machine/arm/aeabi_memcpy-armv7a.S
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "arm_asm.h"
+
+/* NOTE: This ifdef MUST match the one in aeabi_memcpy.c.  */
+#if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
+	(defined (__ARM_NEON__) || !defined (__SOFTFP__))
+
+	.syntax unified
+	.global __aeabi_memcpy
+	.type   __aeabi_memcpy, %function
+__aeabi_memcpy:
+	/* Assumes that n >= 0, and dst, src are valid pointers.
+          If there is at least 8 bytes to copy, use LDRD/STRD.
+          If src and dst are misaligned with different offsets,
+          first copy byte by byte until dst is aligned,
+          and then copy using LDRD/STRD and shift if needed.
+          When less than 8 left, copy a word and then byte by byte.  */
+
+       /* Save registers (r0 holds the return value):
+          optimized push {r0, r4, r5, lr}.
+          To try to improve performance, the stack layout is changed,
+          i.e., the stack is not kept looking the way users expect
+          (highest numbered register at highest address).  */
+        push {r0, lr}
+        strd r4, r5, [sp, #-8]!
+
+        /* Get copying of tiny blocks out of the way first.  */
+        /* Is there at least 4 bytes to copy?  */
+        subs    r2, r2, #4
+        blt     copy_less_than_4       /* If n < 4.  */
+
+        /* Check word alignment.  */
+        ands    ip, r0, #3             /* ip = last 2 bits of dst.  */
+        bne     dst_not_word_aligned   /* If dst is not word-aligned.  */
+
+        /* Get here if dst is word-aligned.  */
+        ands    ip, r1, #3             /* ip = last 2 bits of src.  */
+        bne     src_not_word_aligned   /* If src is not word-aligned.  */
+word_aligned:
+        /* Get here if source and dst both are word-aligned.
+           The number of bytes remaining to copy is r2+4.  */
+
+        /* Is there at least 64 bytes to copy?  */
+        subs    r2, r2, #60
+        blt     copy_less_than_64                /* If r2 + 4 < 64.  */
+
+        /* First, align the destination buffer to 8-bytes,
+           to make sure double loads and stores don't cross cache line boundary,
+           as they are then more expensive even if the data is in the cache
+           (require two load/store issue cycles instead of one).
+           If only one of the buffers is not 8-bytes aligned,
+           then it's more important to align dst than src,
+           because there is more penalty for stores
+           than loads that cross cacheline boundary.
+           This check and realignment are only worth doing
+           if there is a lot to copy.  */
+
+        /* Get here if dst is word aligned,
+           i.e., the 2 least significant bits are 0.
+           If dst is not 8-byte aligned (i.e., bit 2 of dst is set),
+           then copy 1 word (4 bytes).  */
+        ands    r3, r0, #4
+        beq     two_word_aligned  /* If dst already two-word aligned.  */
+        ldr     r3, [r1], #4
+        str     r3, [r0], #4
+        subs    r2, r2, #4
+        blt     copy_less_than_64
+
+two_word_aligned:
+        /* TODO: Align to cacheline (useful for PLD optimization).  */
+
+        /* Every loop iteration copies 64 bytes.  */
+1:
+        .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
+        ldrd    r4, r5, [r1, \offset]
+        strd    r4, r5, [r0, \offset]
+        .endr
+
+        add     r0, r0, #64
+        add     r1, r1, #64
+        subs    r2, r2, #64
+        bge     1b                     /* If there is more to copy.  */
+
+copy_less_than_64:
+
+        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
+           Restore the count if there is more than 7 bytes to copy.  */
+        adds    r2, r2, #56
+        blt     copy_less_than_8
+
+        /* Copy 8 bytes at a time.  */
+2:
+        ldrd    r4, r5, [r1], #8
+        strd    r4, r5, [r0], #8
+        subs    r2, r2, #8
+        bge     2b                     /* If there is more to copy.  */
+
+copy_less_than_8:
+
+        /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
+           Check if there is more to copy.  */
+        cmn     r2, #8
+        beq     return                          /* If r2 + 8 == 0.  */
+
+        /* Restore the count if there is more than 3 bytes to copy.  */
+        adds    r2, r2, #4
+        blt     copy_less_than_4
+
+        /* Copy 4 bytes.  */
+        ldr     r3, [r1], #4
+        str     r3, [r0], #4
+
+copy_less_than_4:
+        /* Get here if less than 4 bytes to copy, -4 <= r2 < 0.  */
+
+        /* Restore the count, check if there is more to copy.  */
+        adds    r2, r2, #4
+        beq     return                          /* If r2 == 0.  */
+
+        /* Get here with r2 in {1,2,3} = {01,10,11} in binary.  */
+        /* Logical shift left r2, insert 0s, update flags.  */
+        lsls    r2, r2, #31
+
+        /* Copy byte by byte.
+           Condition ne means bit 0 of r2 is set, i.e., r2 is 1 or 3.
+           Condition cs means bit 1 of r2 is set,
+           i.e., r2 is 2 or 3.  */
+        itt     ne
+        ldrbne  r3, [r1], #1
+        strbne  r3, [r0], #1
+
+        itttt   cs
+        ldrbcs  r4, [r1], #1
+        ldrbcs  r5, [r1]
+        strbcs  r4, [r0], #1
+        strbcs  r5, [r0]
+
+return:
+        /* Restore registers: optimized pop {r0, r4, r5, pc}   */
+        ldrd r4, r5, [sp], #8
+        pop {r0, pc}         /* This is the only return point of memcpy.  */
+
+dst_not_word_aligned:
+
+       /* Get here when dst is not aligned and ip has the last 2 bits of dst,
          i.e., ip is the offset of dst from a word boundary.
+          The number of bytes that remains to copy is r2 + 4,
+          i.e., there are at least 4 bytes to copy.
+          Write a partial word (1 to 3 bytes), such that dst becomes
+	  word-aligned.  */
+
+       /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
+          then there are (4 - ip) bytes to fill up to align dst to the next
+	  word.  */
+        rsb     ip, ip, #4                 /* ip = #4 - ip.  */
+        cmp     ip, #2
+
+       /* Copy byte by byte with conditionals.  */
+        itt     gt
+        ldrbgt  r3, [r1], #1
+        strbgt  r3, [r0], #1
+
+        itt     ge
+        ldrbge  r4, [r1], #1
+        strbge  r4, [r0], #1
+
+        ldrb    lr, [r1], #1
+        strb    lr, [r0], #1
+
+       /* Update the count.
+          ip holds the number of bytes we have just copied.  */
+        subs    r2, r2, ip                        /* r2 = r2 - ip.  */
+        blt     copy_less_than_4                  /* If r2 < ip.  */
+
+       /* Get here if there are more than 4 bytes to copy.
+          Check if src is aligned.  If beforehand src and dst were not word
+	  aligned but congruent (same offset), then now they are both
+	  word-aligned, and we can copy the rest efficiently (without
+	  shifting).  */
+        ands    ip, r1, #3                    /* ip = last 2 bits of src.  */
+        beq     word_aligned                  /* If r1 is word-aligned.  */
+
+src_not_word_aligned:
+       /* Get here when src is not word-aligned, but dst is word-aligned.
+          The number of bytes that remains to copy is r2+4.  */
+
+       /* Copy word by word using LDR, relying on hardware support for
+          unaligned access (SCTLR.A clear), so LDR and STR may be unaligned.  */
+        subs    r2, r2, #60
+        blt     8f
+
+7:
+        /* Copy 64 bytes in every loop iteration.  */
+        .irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
+        ldr     r3, [r1, \offset]
+        str     r3, [r0, \offset]
+        .endr
+
+        add     r0, r0, #64
+        add     r1, r1, #64
+        subs    r2, r2, #64
+        bge     7b
+
+8:
+        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
+           Check if there is more than 3 bytes to copy.  */
+        adds    r2, r2, #60
+        blt     copy_less_than_4
+
+9:
+       /* Get here if there is less than 64 but at least 4 bytes to copy,
+          where the number of bytes to copy is r2+4.  */
+        ldr     r3, [r1], #4
+        str     r3, [r0], #4
+        subs    r2, r2, #4
+        bge     9b
+
+        b       copy_less_than_4
+
+
+	.syntax unified
+	.global __aeabi_memcpy4
+	.type   __aeabi_memcpy4, %function
+__aeabi_memcpy4:
+	/* Assumes that both of its arguments are 4-byte aligned.  */
+
+        push {r0, lr}
+        strd r4, r5, [sp, #-8]!
+
+        /* Is there at least 4 bytes to copy?  */
+        subs    r2, r2, #4
+        blt     copy_less_than_4       /* If n < 4.  */
+
+	bl	word_aligned
+
+	.syntax unified
+	.global __aeabi_memcpy8
+	.type   __aeabi_memcpy8, %function
+__aeabi_memcpy8:
+	/* Assumes that both of its arguments are 8-byte aligned.  */
+
+        push {r0, lr}
+        strd r4, r5, [sp, #-8]!
+
+	/* Is there at least 4 bytes to copy?  */
+        subs    r2, r2, #4
+        blt     copy_less_than_4	/* If n < 4.  */
+
+        /* Is there at least 8 bytes to copy?  */
+        subs    r2, r2, #4
+        blt     copy_less_than_8	/* If n < 8.  */
+
+	/* Is there at least 64 bytes to copy?  */
+	subs	r2, r2, #56
+	blt	copy_less_than_64	/* If n < 64.  */
+
+	bl	two_word_aligned
+
+#endif
diff --git a/newlib/libc/machine/arm/aeabi_memcpy.c b/newlib/libc/machine/arm/aeabi_memcpy.c
new file mode 100644
index 0000000..9837c35
--- /dev/null
+++ b/newlib/libc/machine/arm/aeabi_memcpy.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <_ansi.h>
+
+/* According to the Run-time ABI for the ARM Architecture, this
+   function is allowed to corrupt only the integer core registers
+   permitted to be corrupted by the [AAPCS] (r0-r3, ip, lr, and
+   CPSR).
+
+   The FP registers are used in memcpy for __ARM_ARCH_7A targets.
+   Therefore, we can't simply use an alias to support the
+   __aeabi_memcpy functions for __ARM_ARCH_7A.  Instead, we fall
+   back on the previous version of memcpy to support them.  */
+
+/* NOTE: This ifdef MUST match the one in aeabi_memcpy-armv7a.S.  */
+#if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
+	(defined (__ARM_NEON__) || !defined (__SOFTFP__))
+
+/* Defined in aeabi_memcpy-armv7a.S.  */
+
+#else
+/* Support __aeabi_memcpy4 and __aeabi_memcpy8 as aliases of
+   __aeabi_memcpy; they may assume memory alignment.  */
+void __aeabi_memcpy4 (void *dest, const void *source, size_t n)
+	_ATTRIBUTE ((alias ("__aeabi_memcpy")));
+
+void __aeabi_memcpy8 (void *dest, const void *source, size_t n)
+	_ATTRIBUTE ((alias ("__aeabi_memcpy")));
+
+/* Support the routine __aeabi_memcpy.  Can't alias to memcpy
+   because it's not defined in the same translation unit.  */
+void __aeabi_memcpy (void *dest, const void *source, size_t n)
+{
+  extern void *memcpy (void *dest, const void *source, size_t n);
+  memcpy (dest, source, n);
+}
+#endif

=======================================

Attachment: aeabi_memcpy_9.patch
Description: Binary data

