[PATCH, v5] ARM: Add Cortex-A15 optimized NEON and VFP memcpy routines, with IFUNC.
Will Newton
will.newton@linaro.org
Tue Apr 30 16:54:00 GMT 2013
Add a high performance memcpy routine optimized for Cortex-A15 with
variants for use in the presence of NEON and VFP hardware selected
at runtime using indirect function support.
This was tested on armv7l-unknown-linux-gnueabihf. One new testsuite
failure was introduced (elf/ifuncmain5picstatic) which was caused by
a bug in ld. A fix for that ld issue has been committed to binutils
on 2012-04-29.
ports/ChangeLog.arm:
2013-04-30 Will Newton <will.newton@linaro.org>
* sysdeps/arm/armv7/multiarch/Makefile: New file.
* sysdeps/arm/armv7/multiarch/aeabi_memcpy.c: Likewise.
* sysdeps/arm/armv7/multiarch/ifunc-impl-list.c: Likewise.
* sysdeps/arm/armv7/multiarch/memcpy.S: Likewise.
* sysdeps/arm/armv7/multiarch/memcpy_impl.S: Likewise.
* sysdeps/arm/armv7/multiarch/memcpy_neon.S: Likewise.
* sysdeps/arm/armv7/multiarch/memcpy_vfp.S: Likewise.
* sysdeps/arm/configure.in: Require binutils 2.21 to build
for armv7.
---
ports/sysdeps/arm/armv7/multiarch/Makefile | 3 +
ports/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c | 33 ++
.../sysdeps/arm/armv7/multiarch/ifunc-impl-list.c | 44 ++
ports/sysdeps/arm/armv7/multiarch/memcpy.S | 59 ++
ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S | 640 +++++++++++++++++++++
ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S | 3 +
ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S | 3 +
ports/sysdeps/arm/configure.in | 8 +
8 files changed, 793 insertions(+)
create mode 100644 ports/sysdeps/arm/armv7/multiarch/Makefile
create mode 100644 ports/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c
create mode 100644 ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy.S
create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
Changes in v5:
- Move the configure check for binutils 2.21 to ports/sysdeps/arm.
diff --git a/ports/sysdeps/arm/armv7/multiarch/Makefile b/ports/sysdeps/arm/armv7/multiarch/Makefile
new file mode 100644
index 0000000..e834cc9
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),string)
+sysdep_routines += memcpy_neon memcpy_vfp
+endif
diff --git a/ports/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c b/ports/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c
new file mode 100644
index 0000000..1f163b3
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c
@@ -0,0 +1,33 @@
+/* Copyright (C) 2005-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+void *__memcpy_arm (void *dest, const void *src, size_t n);
+
+/* Copy memory like memcpy, but no return value required. Can't alias
+ to memcpy because it's not defined in the same translation
+ unit. */
+void
+__aeabi_memcpy (void *dest, const void *src, size_t n)
+{
+ __memcpy_arm (dest, src, n);
+}
+
+/* Versions of the above which may assume memory alignment. */
+strong_alias (__aeabi_memcpy, __aeabi_memcpy4)
+strong_alias (__aeabi_memcpy, __aeabi_memcpy8)
diff --git a/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000..b877eba
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
@@ -0,0 +1,44 @@
+/* Enumerate available IFUNC implementations of a function. arm version.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+#include <ldsodefs.h>
+#include <sysdep.h>
+#include <ifunc-impl-list.h>
+
+/* Fill ARRAY of MAX elements with IFUNC implementations for function
+ NAME and return the number of valid entries. */
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ size_t max)
+{
+ size_t i = 0;
+ int hwcap;
+
+ hwcap = GLRO(dl_hwcap);
+
+ IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_NEON,
+ __memcpy_neon)
+ IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_VFPv3,
+ __memcpy_vfp)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
+
+ return i;
+}
diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy.S b/ports/sysdeps/arm/armv7/multiarch/memcpy.S
new file mode 100644
index 0000000..9e423e6
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/memcpy.S
@@ -0,0 +1,59 @@
+/* Multiple versions of memcpy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define NO_THUMB
+#include <sysdep.h>
+#include <rtld-global-offsets.h>
+
+#if !defined NOT_IN_libc
+ .text
+ENTRY(memcpy)
+ .type memcpy, %gnu_indirect_function
+ ldr r1, .Lmemcpy_arm
+ tst r0, #HWCAP_ARM_VFP
+ ldrne r1, .Lmemcpy_vfp
+ tst r0, #HWCAP_ARM_NEON
+ ldrne r1, .Lmemcpy_neon
+1:
+ add r0, r1, pc
+ DO_RET(lr)
+
+.Lmemcpy_arm:
+ .long C_SYMBOL_NAME(__memcpy_arm) - 1b - PC_OFS
+.Lmemcpy_neon:
+ .long C_SYMBOL_NAME(__memcpy_neon) - 1b - PC_OFS
+.Lmemcpy_vfp:
+ .long C_SYMBOL_NAME(__memcpy_vfp) - 1b - PC_OFS
+
+END(memcpy)
+
+libc_hidden_builtin_def (memcpy)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+#undef weak_alias
+#define weak_alias(x, y)
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+#define memcpy __memcpy_arm
+
+#endif
+
+#include "memcpy_impl.S"
diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
new file mode 100644
index 0000000..5e4ba06
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
@@ -0,0 +1,640 @@
+/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>.
+
+ This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+ of VFP or NEON when built with the appropriate flags.
+
+ Assumptions:
+
+ ARMv6 (ARMv7-a if using Neon)
+ ARM state
+ Unaligned accesses
+ LDRD/STRD support unaligned word accesses
+
+ */
+#ifndef NO_THUMB
+#define NO_THUMB
+#endif
+#include <sysdep.h>
+
+ .syntax unified
+ /* This implementation requires ARM state. */
+ .arm
+
+#ifdef MEMCPY_NEON
+
+ .fpu neon
+ .arch armv7-a
+# define FRAME_SIZE 4
+# define USE_VFP
+# define USE_NEON
+
+#elif defined (MEMCPY_VFP)
+
+ .arch armv6
+ .fpu vfpv2
+# define FRAME_SIZE 32
+# define USE_VFP
+
+#else
+ .arch armv6
+# define FRAME_SIZE 32
+
+#endif
+
+#define ALIGN(addr, align) addr:align
+
+#define INSN_SIZE 4
+
+/* Call parameters. */
+#define dstin r0
+#define src r1
+#define count r2
+
+/* Locals. */
+#define tmp1 r3
+#define dst ip
+#define tmp2 r10
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers. */
+#define A_l r2 /* Call-clobbered. */
+#define A_h r3 /* Call-clobbered. */
+#define B_l r4
+#define B_h r5
+#define C_l r6
+#define C_h r7
+#define D_l r8
+#define D_h r9
+#endif
+
+/* Number of lines ahead to pre-fetch data. If you change this the code
+ below will need adjustment to compensate. */
+
+#define prefetch_lines 5
+
+#ifdef USE_VFP
+ .macro cpy_line_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+
+ .macro cpy_tail_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+#endif
+
+ .p2align 6
+ENTRY(memcpy)
+
+ mov dst, dstin /* Preserve dstin, we need to return it. */
+ cmp count, #64
+ bge .Lcpy_not_short
+ /* Deal with small copies quickly by dropping straight into the
+ exit block. */
+
+.Ltail63unaligned:
+#ifdef USE_NEON
+ and tmp1, count, #0x38
+ rsb tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
+ add pc, pc, tmp1
+ vld1.8 {d0}, [src]! /* 14 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 12 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 10 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 8 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 6 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 4 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 2 words to go. */
+ vst1.8 {d0}, [dst]!
+
+ tst count, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+#else
+ /* Copy up to 15 full words of data. May not be aligned. */
+ /* Cannot use VFP for unaligned data. */
+ and tmp1, count, #0x3c
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(60 - PC_OFS/2 + INSN_SIZE/2)
+ /* Jump directly into the sequence below at the correct offset. */
+ add pc, pc, tmp1, lsl #1
+
+ ldr tmp1, [src, #-60] /* 15 words to go. */
+ str tmp1, [dst, #-60]
+
+ ldr tmp1, [src, #-56] /* 14 words to go. */
+ str tmp1, [dst, #-56]
+ ldr tmp1, [src, #-52]
+ str tmp1, [dst, #-52]
+
+ ldr tmp1, [src, #-48] /* 12 words to go. */
+ str tmp1, [dst, #-48]
+ ldr tmp1, [src, #-44]
+ str tmp1, [dst, #-44]
+
+ ldr tmp1, [src, #-40] /* 10 words to go. */
+ str tmp1, [dst, #-40]
+ ldr tmp1, [src, #-36]
+ str tmp1, [dst, #-36]
+
+ ldr tmp1, [src, #-32] /* 8 words to go. */
+ str tmp1, [dst, #-32]
+ ldr tmp1, [src, #-28]
+ str tmp1, [dst, #-28]
+
+ ldr tmp1, [src, #-24] /* 6 words to go. */
+ str tmp1, [dst, #-24]
+ ldr tmp1, [src, #-20]
+ str tmp1, [dst, #-20]
+
+ ldr tmp1, [src, #-16] /* 4 words to go. */
+ str tmp1, [dst, #-16]
+ ldr tmp1, [src, #-12]
+ str tmp1, [dst, #-12]
+
+ ldr tmp1, [src, #-8] /* 2 words to go. */
+ str tmp1, [dst, #-8]
+ ldr tmp1, [src, #-4]
+ str tmp1, [dst, #-4]
+#endif
+
+ lsls count, count, #31
+ ldrhcs tmp1, [src], #2
+ ldrbne src, [src] /* Src is dead, use as a scratch. */
+ strhcs tmp1, [dst], #2
+ strbne src, [dst]
+ bx lr
+
+.Lcpy_not_short:
+ /* At least 64 bytes to copy, but don't know the alignment yet. */
+ str tmp2, [sp, #-FRAME_SIZE]!
+ cfi_adjust_cfa_offset (FRAME_SIZE)
+ cfi_rel_offset (tmp2, 0)
+ cfi_remember_state
+ and tmp2, src, #3
+ and tmp1, dst, #3
+ cmp tmp1, tmp2
+ bne .Lcpy_notaligned
+
+#ifdef USE_VFP
+ /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
+ that the FP pipeline is much better at streaming loads and
+ stores. This is outside the critical loop. */
+ vmov.f32 s0, s0
+#endif
+
+ /* SRC and DST have the same mutual 32-bit alignment, but we may
+ still need to pre-copy some bytes to get to natural alignment.
+ We bring DST into full 64-bit alignment. */
+ lsls tmp2, dst, #29
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src], #1
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst], #1
+
+1:
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ blt .Ltail63aligned
+
+ cmp tmp2, #512
+ bge .Lcpy_body_long
+
+.Lcpy_body_medium: /* Count in tmp2. */
+#ifdef USE_VFP
+1:
+ vldr d0, [src, #0]
+ subs tmp2, tmp2, #64
+ vldr d1, [src, #8]
+ vstr d0, [dst, #0]
+ vldr d0, [src, #16]
+ vstr d1, [dst, #8]
+ vldr d1, [src, #24]
+ vstr d0, [dst, #16]
+ vldr d0, [src, #32]
+ vstr d1, [dst, #24]
+ vldr d1, [src, #40]
+ vstr d0, [dst, #32]
+ vldr d0, [src, #48]
+ vstr d1, [dst, #40]
+ vldr d1, [src, #56]
+ vstr d0, [dst, #48]
+ add src, src, #64
+ vstr d1, [dst, #56]
+ add dst, dst, #64
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+
+.Ltail63aligned: /* Count in tmp2. */
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
+ add pc, pc, tmp1
+
+ vldr d0, [src, #-56] /* 14 words to go. */
+ vstr d0, [dst, #-56]
+ vldr d0, [src, #-48] /* 12 words to go. */
+ vstr d0, [dst, #-48]
+ vldr d0, [src, #-40] /* 10 words to go. */
+ vstr d0, [dst, #-40]
+ vldr d0, [src, #-32] /* 8 words to go. */
+ vstr d0, [dst, #-32]
+ vldr d0, [src, #-24] /* 6 words to go. */
+ vstr d0, [dst, #-24]
+ vldr d0, [src, #-16] /* 4 words to go. */
+ vstr d0, [dst, #-16]
+ vldr d0, [src, #-8] /* 2 words to go. */
+ vstr d0, [dst, #-8]
+#else
+ sub src, src, #8
+ sub dst, dst, #8
+1:
+ ldrd A_l, A_h, [src, #8]
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #16]
+ strd A_l, A_h, [dst, #16]
+ ldrd A_l, A_h, [src, #24]
+ strd A_l, A_h, [dst, #24]
+ ldrd A_l, A_h, [src, #32]
+ strd A_l, A_h, [dst, #32]
+ ldrd A_l, A_h, [src, #40]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #48]
+ strd A_l, A_h, [dst, #48]
+ ldrd A_l, A_h, [src, #56]
+ strd A_l, A_h, [dst, #56]
+ ldrd A_l, A_h, [src, #64]!
+ strd A_l, A_h, [dst, #64]!
+ subs tmp2, tmp2, #64
+ bge 1b
+ tst tmp2, #0x3f
+ bne 1f
+ ldr tmp2,[sp], #FRAME_SIZE
+ cfi_adjust_cfa_offset (-FRAME_SIZE)
+ cfi_restore (tmp2)
+ bx lr
+
+ cfi_restore_state
+ cfi_remember_state
+1:
+ add src, src, #8
+ add dst, dst, #8
+
+.Ltail63aligned: /* Count in tmp2. */
+ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
+ we know that the src and dest are 32-bit aligned so we can use
+ LDRD/STRD to improve efficiency. */
+ /* TMP2 is now negative, but we don't care about that. The bottom
+ six bits still tell us how many bytes are left to copy. */
+
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
+ add pc, pc, tmp1
+ ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
+ strd A_l, A_h, [dst, #-56]
+ ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
+ strd A_l, A_h, [dst, #-48]
+ ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
+ strd A_l, A_h, [dst, #-40]
+ ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
+ strd A_l, A_h, [dst, #-32]
+ ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
+ strd A_l, A_h, [dst, #-24]
+ ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
+ strd A_l, A_h, [dst, #-16]
+ ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
+ strd A_l, A_h, [dst, #-8]
+
+#endif
+ tst tmp2, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+ lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src]
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst]
+
+.Ldone:
+ ldr tmp2, [sp], #FRAME_SIZE
+ cfi_adjust_cfa_offset (-FRAME_SIZE)
+ cfi_restore (tmp2)
+ bx lr
+
+ cfi_restore_state
+ cfi_remember_state
+
+.Lcpy_body_long: /* Count in tmp2. */
+
+ /* Long copy. We know that there's at least (prefetch_lines * 64)
+ bytes to go. */
+#ifdef USE_VFP
+ /* Don't use PLD. Instead, read some data in advance of the current
+ copy position into a register. This should act like a PLD
+ operation but we won't have to repeat the transfer. */
+
+ vldr d3, [src, #0]
+ vldr d4, [src, #64]
+ vldr d5, [src, #128]
+ vldr d6, [src, #192]
+ vldr d7, [src, #256]
+
+ vldr d0, [src, #8]
+ vldr d1, [src, #16]
+ vldr d2, [src, #24]
+ add src, src, #32
+
+ subs tmp2, tmp2, #prefetch_lines * 64 * 2
+ blt 2f
+1:
+ cpy_line_vfp d3, 0
+ cpy_line_vfp d4, 64
+ cpy_line_vfp d5, 128
+ add dst, dst, #3 * 64
+ add src, src, #3 * 64
+ cpy_line_vfp d6, 0
+ cpy_line_vfp d7, 64
+ add dst, dst, #2 * 64
+ add src, src, #2 * 64
+ subs tmp2, tmp2, #prefetch_lines * 64
+ bge 1b
+
+2:
+ cpy_tail_vfp d3, 0
+ cpy_tail_vfp d4, 64
+ cpy_tail_vfp d5, 128
+ add src, src, #3 * 64
+ add dst, dst, #3 * 64
+ cpy_tail_vfp d6, 0
+ vstr d7, [dst, #64]
+ vldr d7, [src, #64]
+ vstr d0, [dst, #64 + 8]
+ vldr d0, [src, #64 + 8]
+ vstr d1, [dst, #64 + 16]
+ vldr d1, [src, #64 + 16]
+ vstr d2, [dst, #64 + 24]
+ vldr d2, [src, #64 + 24]
+ vstr d7, [dst, #64 + 32]
+ add src, src, #96
+ vstr d0, [dst, #64 + 40]
+ vstr d1, [dst, #64 + 48]
+ vstr d2, [dst, #64 + 56]
+ add dst, dst, #128
+ add tmp2, tmp2, #prefetch_lines * 64
+ b .Lcpy_body_medium
+#else
+ /* Long copy. Use an SMS style loop to maximize the I/O
+ bandwidth of the core. We don't have enough spare registers
+ to synthesise prefetching, so use PLD operations. */
+ /* Pre-bias src and dst. */
+ sub src, src, #8
+ sub dst, dst, #8
+ pld [src, #8]
+ pld [src, #72]
+ subs tmp2, tmp2, #64
+ pld [src, #136]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ cfi_rel_offset (B_l, 8)
+ cfi_rel_offset (B_h, 12)
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ cfi_rel_offset (C_l, 16)
+ cfi_rel_offset (C_h, 20)
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ cfi_rel_offset (D_l, 24)
+ cfi_rel_offset (D_h, 28)
+ pld [src, #200]
+ ldrd D_l, D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #232]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldrd D_l, D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldrd D_l, D_h, [src, #32]
+ bcs 2b
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #40
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ cfi_restore (B_l)
+ cfi_restore (B_h)
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ cfi_restore (C_l)
+ cfi_restore (C_h)
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ cfi_restore (D_l)
+ cfi_restore (D_h)
+ add dst, dst, #72
+ tst tmp2, #0x3f
+ bne .Ltail63aligned
+ ldr tmp2, [sp], #FRAME_SIZE
+ cfi_adjust_cfa_offset (-FRAME_SIZE)
+ cfi_restore (tmp2)
+ bx lr
+#endif
+
+ cfi_restore_state
+ cfi_remember_state
+
+.Lcpy_notaligned:
+ pld [src]
+ pld [src, #64]
+ /* There's at least 64 bytes to copy, but there is no mutual
+ alignment. */
+ /* Bring DST to 64-bit alignment. */
+ lsls tmp2, dst, #29
+ pld [src, #(2 * 64)]
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrbne tmp1, [src], #1
+ ldrhcs tmp2, [src], #2
+ strbne tmp1, [dst], #1
+ strhcs tmp2, [dst], #2
+1:
+ pld [src, #(3 * 64)]
+ subs count, count, #64
+ ldrmi tmp2, [sp], #FRAME_SIZE
+ bmi .Ltail63unaligned
+ pld [src, #(4 * 64)]
+
+#ifdef USE_NEON
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bmi 2f
+1:
+ pld [src, #(4 * 64)]
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vld1.8 {d0-d3}, [src]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bpl 1b
+2:
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ ands count, count, #0x3f
+#else
+ /* Use an SMS style loop to maximize the I/O bandwidth. */
+ sub src, src, #4
+ sub dst, dst, #8
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ cfi_rel_offset (B_l, 8)
+ cfi_rel_offset (B_h, 12)
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ cfi_rel_offset (C_l, 16)
+ cfi_rel_offset (C_h, 20)
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ cfi_rel_offset (D_l, 24)
+ cfi_rel_offset (D_h, 28)
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #(5 * 64) - (32 - 4)]
+ strd A_l, A_h, [dst, #40]
+ ldr A_l, [src, #36]
+ ldr A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldr B_l, [src, #44]
+ ldr B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldr C_l, [src, #52]
+ ldr C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldr D_l, [src, #60]
+ ldr D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]
+ bcs 2b
+
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #36
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ cfi_restore (B_l)
+ cfi_restore (B_h)
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ cfi_restore (C_l)
+ cfi_restore (C_h)
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ cfi_restore (D_l)
+ cfi_restore (D_h)
+ add dst, dst, #72
+ ands count, tmp2, #0x3f
+#endif
+ ldr tmp2, [sp], #FRAME_SIZE
+ cfi_adjust_cfa_offset (-FRAME_SIZE)
+ cfi_restore (tmp2)
+ bne .Ltail63unaligned
+ bx lr
+
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
new file mode 100644
index 0000000..c0ef1f8
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
@@ -0,0 +1,3 @@
+#define MEMCPY_NEON
+#define memcpy __memcpy_neon
+#include "memcpy_impl.S"
diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
new file mode 100644
index 0000000..d21b702
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
@@ -0,0 +1,3 @@
+#define MEMCPY_VFP
+#define memcpy __memcpy_vfp
+#include "memcpy_impl.S"
diff --git a/ports/sysdeps/arm/configure.in b/ports/sysdeps/arm/configure.in
index d66500b..0d95cb9 100644
--- a/ports/sysdeps/arm/configure.in
+++ b/ports/sysdeps/arm/configure.in
@@ -44,3 +44,11 @@ if test $libc_cv_arm_pcs_vfp = yes; then
else
LIBC_CONFIG_VAR([default-abi], [soft])
fi
+
+# If we are configuring for armv7 we need binutils 2.21 to ensure that
+# NEON alignments are assembled correctly.
+if test $machine = arm/armv7; then
+ AC_CHECK_PROG_VER(AS, $AS, --version,
+ [GNU assembler.* \([0-9]*\.[0-9.]*\)],
+ [2.1[0-9][0-9]*|2.[2-9][1-9]*|[3-9].*|[1-9][0-9]*], AS=: critic_missing="$critic_missing as")
+fi
--
1.8.1.4
More information about the Libc-ports
mailing list