ARM: Add Cortex-A15 optimized NEON and VFP memcpy routines, with IFUNC.

author Will Newton <will.newton@linaro.org>

Wed, 8 May 2013 12:06:34 +0000 (12:06 +0000)

committer Joseph Myers <joseph@codesourcery.com>

Wed, 8 May 2013 12:06:34 +0000 (12:06 +0000)
author Will Newton <will.newton@linaro.org>
Wed, 8 May 2013 12:06:34 +0000 (12:06 +0000)
committer Joseph Myers <joseph@codesourcery.com>
Wed, 8 May 2013 12:06:34 +0000 (12:06 +0000)
diff --git a/ports/ChangeLog.arm b/ports/ChangeLog.arm

index 3f504e41b4f1df57259038abe9b18ebef90c7d0b..372096e71984c3f626ef23b1b708a1012346f6f5 100644 (file)
--- a/ports/ChangeLog.arm
+++ b/ports/ChangeLog.arm
@@ -1,3 +1,15 @@
+2013-05-08  Will Newton  <will.newton@linaro.org>
+
+       * sysdeps/arm/armv7/multiarch/Makefile: New file.
+       * sysdeps/arm/armv7/multiarch/aeabi_memcpy.c: Likewise.
+       * sysdeps/arm/armv7/multiarch/ifunc-impl-list.c: Likewise.
+       * sysdeps/arm/armv7/multiarch/memcpy.S: Likewise.
+       * sysdeps/arm/armv7/multiarch/memcpy_impl.S: Likewise.
+       * sysdeps/arm/armv7/multiarch/memcpy_neon.S: Likewise.
+       * sysdeps/arm/armv7/multiarch/memcpy_vfp.S: Likewise.
+       * sysdeps/arm/armv7/configure.in: Likewise.
+       * sysdeps/arm/armv7/configure: Generated.
+
  2013-05-07  Roland McGrath  <roland@hack.frob.com>
  
         * sysdeps/arm/dl-machine.h (elf_machine_dynamic): Use a plain C
diff --git a/ports/sysdeps/arm/armv7/configure b/ports/sysdeps/arm/armv7/configure

new file mode 100755 (executable)

index 0000000..a879ffb
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/configure
@@ -0,0 +1,72 @@
+# This file is generated from configure.in by Autoconf.  DO NOT EDIT!
+ # Local configure fragment for sysdeps/arm/armv7.
+
+# We need binutils 2.21 to ensure that NEON alignments are assembled correctly.
+libc_cv_arm_as_version_ok=yes
+for ac_prog in $AS
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_AS+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$AS"; then
+  ac_cv_prog_AS="$AS" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_AS="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+AS=$ac_cv_prog_AS
+if test -n "$AS"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AS" >&5
+$as_echo "$AS" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$AS" && break
+done
+
+if test -z "$AS"; then
+  ac_verc_fail=yes
+else
+  # Found it, now check the version.
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking version of $AS" >&5
+$as_echo_n "checking version of $AS... " >&6; }
+  ac_prog_version=`$AS --version 2>&1 | sed -n 's/^.*GNU assembler.* \([0-9]*\.[0-9.]*\).*$/\1/p'`
+  case $ac_prog_version in
+    '') ac_prog_version="v. ?.??, bad"; ac_verc_fail=yes;;
+    2.1[0-9][0-9]*|2.2[1-9]*|2.[3-9][0-9]*|[3-9].*|[1-9][0-9]*)
+       ac_prog_version="$ac_prog_version, ok"; ac_verc_fail=no;;
+    *) ac_prog_version="$ac_prog_version, bad"; ac_verc_fail=yes;;
+
+  esac
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_prog_version" >&5
+$as_echo "$ac_prog_version" >&6; }
+fi
+if test $ac_verc_fail = yes; then
+  libc_cv_arm_as_version_ok=no
+fi
+
+
+if test $libc_cv_arm_as_version_ok != yes; then
+  as_fn_error $? "as version too old, at least 2.21 is required" "$LINENO" 5
+fi
diff --git a/ports/sysdeps/arm/armv7/configure.in b/ports/sysdeps/arm/armv7/configure.in

new file mode 100644 (file)

index 0000000..01e93ec
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/configure.in
@@ -0,0 +1,12 @@
+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
+# Local configure fragment for sysdeps/arm/armv7.
+
+# We need binutils 2.21 to ensure that NEON alignments are assembled correctly.
+libc_cv_arm_as_version_ok=yes
+AC_CHECK_PROG_VER(AS, $AS, --version,
+                  [GNU assembler.* \([0-9]*\.[0-9.]*\)],
+                  [2.1[0-9][0-9]*|2.2[1-9]*|2.[3-9][0-9]*|[3-9].*|[1-9][0-9]*], libc_cv_arm_as_version_ok=no)
+
+if test $libc_cv_arm_as_version_ok != yes; then
+  AC_MSG_ERROR([as version too old, at least 2.21 is required])
+fi
diff --git a/ports/sysdeps/arm/armv7/multiarch/Makefile b/ports/sysdeps/arm/armv7/multiarch/Makefile

new file mode 100644 (file)

index 0000000..e834cc9
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),string)
+sysdep_routines += memcpy_neon memcpy_vfp
+endif
diff --git a/ports/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c b/ports/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c

new file mode 100644 (file)

index 0000000..c6a2a98
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c
@@ -0,0 +1,2 @@
+/* Empty file to override the sysdeps/arm version.  See memcpy.S for the
+   definitions of __aeabi_memcpy, __aeabi_memcpy4 and __aeabi_memcpy8.  */
diff --git a/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c

new file mode 100644 (file)

index 0000000..0dcdcd6
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
@@ -0,0 +1,44 @@
+/* Enumerate available IFUNC implementations of a function.  ARM version.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <ldsodefs.h>
+#include <sysdep.h>
+#include <ifunc-impl-list.h>
+
+/* Fill ARRAY of MAX elements with IFUNC implementations for function
+   NAME and return the number of valid entries.  */
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                       size_t max)    /* NOTE(review): max is not referenced directly here; presumably the IFUNC_IMPL macros use it -- confirm.  */
+{
+  size_t i = 0;        /* Entry count: handed to the macros below and returned.  */
+  int hwcap;
+
+  hwcap = GLRO(dl_hwcap);      /* Capability bits tested against HWCAP_ARM_* below.  */
+
+  IFUNC_IMPL (i, name, memcpy,
+             IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_NEON,
+                             __memcpy_neon)    /* Flag is the NEON capability bit.  */
+             IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_VFP,
+                             __memcpy_vfp)     /* Flag is the VFP capability bit.  */
+             IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));      /* Flag is the constant 1.  */
+
+  return i;
+}
diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy.S b/ports/sysdeps/arm/armv7/multiarch/memcpy.S

new file mode 100644 (file)

index 0000000..1b12465
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/memcpy.S
@@ -0,0 +1,73 @@
+/* Multiple versions of memcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Thumb requires excess IT instructions here.  */
+#define NO_THUMB
+#include <sysdep.h>
+#include <rtld-global-offsets.h>
+
+#if !defined NOT_IN_libc
+       .text
+ENTRY(memcpy)
+       .type   memcpy, %gnu_indirect_function  /* memcpy is an IFUNC; this code is its resolver.  */
+#ifdef __SOFTFP__
+       ldr     r1, .Lmemcpy_arm        /* Soft-float build: start from the plain ARM variant.  */
+       tst     r0, #HWCAP_ARM_VFP      /* r0 is tested against the HWCAP_ARM_* bits.  */
+       ldrne   r1, .Lmemcpy_vfp        /* VFP bit set: switch to the VFP variant.  */
+#else
+       ldr     r1, .Lmemcpy_vfp        /* Not soft-float: the VFP variant is the baseline.  */
+#endif
+       tst     r0, #HWCAP_ARM_NEON
+       ldrne   r1, .Lmemcpy_neon       /* NEON bit set: the NEON variant overrides the choice above.  */
+1:
+       add     r0, r1, pc      /* r1 is an offset relative to this pc read; r0 = chosen address.  */
+       DO_RET(lr)
+
+#ifdef __SOFTFP__
+.Lmemcpy_arm:
+       .long   C_SYMBOL_NAME(__memcpy_arm) - 1b - PC_OFS       /* Offset from (1b + PC_OFS); PC_OFS is presumably the pc read-ahead -- confirm in sysdep.h.  */
+#endif
+.Lmemcpy_neon:
+       .long   C_SYMBOL_NAME(__memcpy_neon) - 1b - PC_OFS
+.Lmemcpy_vfp:
+       .long   C_SYMBOL_NAME(__memcpy_vfp) - 1b - PC_OFS
+
+END(memcpy)
+
+libc_hidden_builtin_def (memcpy)
+
+/* These versions of memcpy are defined not to clobber any VFP or NEON
+   registers so they must always call the ARM variant of the memcpy code.  */
+strong_alias (__memcpy_arm, __aeabi_memcpy)
+strong_alias (__memcpy_arm, __aeabi_memcpy4)
+strong_alias (__memcpy_arm, __aeabi_memcpy8)
+libc_hidden_def (__memcpy_arm)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+#undef weak_alias
+#define weak_alias(x, y)
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+#define memcpy __memcpy_arm
+
+#endif
+
+#include "memcpy_impl.S"
diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S

new file mode 100644 (file)

index 0000000..f83276a
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
@@ -0,0 +1,642 @@
+/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.
+
+   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+   of VFP or NEON when built with the appropriate flags.
+
+   Assumptions:
+
+    ARMv6 (ARMv7-a if using Neon)
+    ARM state
+    Unaligned accesses
+    LDRD/STRD support unaligned word accesses
+
+ */
+
+/* Thumb cannot encode negative immediate offsets in memory operations.  */
+#ifndef NO_THUMB
+#define NO_THUMB
+#endif
+#include <sysdep.h>
+
+       .syntax unified
+       /* This implementation requires ARM state.  */
+       .arm
+
+#ifdef MEMCPY_NEON
+
+       .fpu    neon
+       .arch   armv7-a
+# define FRAME_SIZE    4
+# define USE_VFP
+# define USE_NEON
+
+#elif defined (MEMCPY_VFP)
+
+       .arch   armv6
+       .fpu    vfpv2
+# define FRAME_SIZE    32
+# define USE_VFP
+
+#else
+       .arch   armv6
+# define FRAME_SIZE    32
+
+#endif
+
+#define ALIGN(addr, align) addr:align
+
+#define INSN_SIZE      4
+
+/* Call parameters.  */
+#define dstin  r0
+#define src    r1
+#define count  r2
+
+/* Locals.  */
+#define tmp1   r3
+#define dst    ip
+#define tmp2   r10
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers.  */
+#define        A_l     r2              /* Call-clobbered.  */
+#define        A_h     r3              /* Call-clobbered.  */
+#define        B_l     r4
+#define        B_h     r5
+#define        C_l     r6
+#define        C_h     r7
+#define        D_l     r8
+#define        D_h     r9
+#endif
+
+/* Number of lines ahead to pre-fetch data.  If you change this the code
+   below will need adjustment to compensate.  */
+
+#define prefetch_lines 5
+
+#ifdef USE_VFP
+       .macro  cpy_line_vfp vreg, base /* Store 64 bytes at dst + BASE, refilling registers as it goes.  */
+       vstr    \vreg, [dst, #\base]    /* VREG, d0, d1 and d2 must already hold data on entry.  */
+       vldr    \vreg, [src, #\base]
+       vstr    d0, [dst, #\base + 8]
+       vldr    d0, [src, #\base + 8]
+       vstr    d1, [dst, #\base + 16]
+       vldr    d1, [src, #\base + 16]
+       vstr    d2, [dst, #\base + 24]
+       vldr    d2, [src, #\base + 24]
+       vstr    \vreg, [dst, #\base + 32]
+       vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32] /* Look-ahead reload: reads prefetch_lines * 64 - 32 past BASE instead of BASE + 32.  */
+       vstr    d0, [dst, #\base + 40]
+       vldr    d0, [src, #\base + 40]
+       vstr    d1, [dst, #\base + 48]
+       vldr    d1, [src, #\base + 48]
+       vstr    d2, [dst, #\base + 56]
+       vldr    d2, [src, #\base + 56]  /* d0-d2 leave the macro loaded for the next 64-byte line.  */
+       .endm
+
+       .macro  cpy_tail_vfp vreg, base /* As cpy_line_vfp, but without the look-ahead reload of VREG.  */
+       vstr    \vreg, [dst, #\base]    /* VREG, d0, d1 and d2 must already hold data on entry.  */
+       vldr    \vreg, [src, #\base]
+       vstr    d0, [dst, #\base + 8]
+       vldr    d0, [src, #\base + 8]
+       vstr    d1, [dst, #\base + 16]
+       vldr    d1, [src, #\base + 16]
+       vstr    d2, [dst, #\base + 24]
+       vldr    d2, [src, #\base + 24]
+       vstr    \vreg, [dst, #\base + 32]       /* VREG is not reloaded after this store.  */
+       vstr    d0, [dst, #\base + 40]
+       vldr    d0, [src, #\base + 40]
+       vstr    d1, [dst, #\base + 48]
+       vldr    d1, [src, #\base + 48]
+       vstr    d2, [dst, #\base + 56]
+       vldr    d2, [src, #\base + 56]
+       .endm
+#endif
+
+       .p2align 6
+ENTRY(memcpy)
+       /* In: r0 = dstin, r1 = src, r2 = count (unsigned).  Out: r0 = dstin.  */
+       mov     dst, dstin      /* Preserve dstin, we need to return it.  */
+       cmp     count, #64
+       bhs     .Lcpy_not_short /* Unsigned, so counts >= 2^31 are not taken as negative.  */
+       /* Deal with small copies quickly by dropping straight into the
+          exit block.  */
+
+.Ltail63unaligned:
+#ifdef USE_NEON
+       and     tmp1, count, #0x38
+       rsb     tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
+       add     pc, pc, tmp1    /* Computed jump: 8 code bytes per 8 data bytes.  */
+       vld1.8  {d0}, [src]!    /* 14 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 12 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 10 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 8 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 6 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 4 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 2 words to go.  */
+       vst1.8  {d0}, [dst]!
+
+       tst     count, #4
+       ldrne   tmp1, [src], #4
+       strne   tmp1, [dst], #4
+#else
+       /* Copy up to 15 full words of data.  May not be aligned.  */
+       /* Cannot use VFP for unaligned data.  */
+       and     tmp1, count, #0x3c
+       add     dst, dst, tmp1
+       add     src, src, tmp1
+       rsb     tmp1, tmp1, #(60 - PC_OFS/2 + INSN_SIZE/2)
+       /* Jump directly into the sequence below at the correct offset.  */
+       add     pc, pc, tmp1, lsl #1    /* 8 code bytes per 4 data bytes, hence lsl #1.  */
+
+       ldr     tmp1, [src, #-60]       /* 15 words to go.  */
+       str     tmp1, [dst, #-60]
+
+       ldr     tmp1, [src, #-56]       /* 14 words to go.  */
+       str     tmp1, [dst, #-56]
+       ldr     tmp1, [src, #-52]
+       str     tmp1, [dst, #-52]
+
+       ldr     tmp1, [src, #-48]       /* 12 words to go.  */
+       str     tmp1, [dst, #-48]
+       ldr     tmp1, [src, #-44]
+       str     tmp1, [dst, #-44]
+
+       ldr     tmp1, [src, #-40]       /* 10 words to go.  */
+       str     tmp1, [dst, #-40]
+       ldr     tmp1, [src, #-36]
+       str     tmp1, [dst, #-36]
+
+       ldr     tmp1, [src, #-32]       /* 8 words to go.  */
+       str     tmp1, [dst, #-32]
+       ldr     tmp1, [src, #-28]
+       str     tmp1, [dst, #-28]
+
+       ldr     tmp1, [src, #-24]       /* 6 words to go.  */
+       str     tmp1, [dst, #-24]
+       ldr     tmp1, [src, #-20]
+       str     tmp1, [dst, #-20]
+
+       ldr     tmp1, [src, #-16]       /* 4 words to go.  */
+       str     tmp1, [dst, #-16]
+       ldr     tmp1, [src, #-12]
+       str     tmp1, [dst, #-12]
+
+       ldr     tmp1, [src, #-8]        /* 2 words to go.  */
+       str     tmp1, [dst, #-8]
+       ldr     tmp1, [src, #-4]
+       str     tmp1, [dst, #-4]
+#endif
+
+       lsls    count, count, #31       /* C = count bit 1 (halfword), NE = count bit 0 (byte).  */
+       ldrhcs  tmp1, [src], #2
+       ldrbne  src, [src]              /* Src is dead, use as a scratch.  */
+       strhcs  tmp1, [dst], #2
+       strbne  src, [dst]
+       bx      lr
+
+.Lcpy_not_short:
+       /* At least 64 bytes to copy, but don't know the alignment yet.  */
+       str     tmp2, [sp, #-FRAME_SIZE]!
+       cfi_adjust_cfa_offset (FRAME_SIZE)
+       cfi_rel_offset (tmp2, 0)
+       cfi_remember_state
+       and     tmp2, src, #3
+       and     tmp1, dst, #3
+       cmp     tmp1, tmp2
+       bne     .Lcpy_notaligned
+
+#ifdef USE_VFP
+       /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
+          that the FP pipeline is much better at streaming loads and
+          stores.  This is outside the critical loop.  */
+       vmov.f32        s0, s0
+#endif
+
+       /* SRC and DST have the same mutual 32-bit alignment, but we may
+          still need to pre-copy some bytes to get to natural alignment.
+          We bring DST into full 64-bit alignment.  */
+       lsls    tmp2, dst, #29          /* Low three bits of dst to the top; Z if already aligned.  */
+       beq     1f
+       rsbs    tmp2, tmp2, #0          /* Top three bits = bytes needed to reach alignment.  */
+       sub     count, count, tmp2, lsr #29
+       ldrmi   tmp1, [src], #4
+       strmi   tmp1, [dst], #4
+       lsls    tmp2, tmp2, #2          /* C = bit 1 of that byte count, NE = bit 0.  */
+       ldrhcs  tmp1, [src], #2
+       ldrbne  tmp2, [src], #1
+       strhcs  tmp1, [dst], #2
+       strbne  tmp2, [dst], #1
+
+1:
+       subs    tmp2, count, #64        /* Use tmp2 for count.  */
+       blo     .Ltail63aligned
+
+       cmp     tmp2, #512
+       bhs     .Lcpy_body_long
+
+.Lcpy_body_medium:                     /* Count in tmp2.  */
+#ifdef USE_VFP
+1:
+       vldr    d0, [src, #0]
+       subs    tmp2, tmp2, #64
+       vldr    d1, [src, #8]
+       vstr    d0, [dst, #0]
+       vldr    d0, [src, #16]
+       vstr    d1, [dst, #8]
+       vldr    d1, [src, #24]
+       vstr    d0, [dst, #16]
+       vldr    d0, [src, #32]
+       vstr    d1, [dst, #24]
+       vldr    d1, [src, #40]
+       vstr    d0, [dst, #32]
+       vldr    d0, [src, #48]
+       vstr    d1, [dst, #40]
+       vldr    d1, [src, #56]
+       vstr    d0, [dst, #48]
+       add     src, src, #64
+       vstr    d1, [dst, #56]
+       add     dst, dst, #64
+       bhs     1b
+       tst     tmp2, #0x3f
+       beq     .Ldone
+
+.Ltail63aligned:                       /* Count in tmp2.  */
+       and     tmp1, tmp2, #0x38
+       add     dst, dst, tmp1
+       add     src, src, tmp1
+       rsb     tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
+       add     pc, pc, tmp1
+
+       vldr    d0, [src, #-56] /* 14 words to go.  */
+       vstr    d0, [dst, #-56]
+       vldr    d0, [src, #-48] /* 12 words to go.  */
+       vstr    d0, [dst, #-48]
+       vldr    d0, [src, #-40] /* 10 words to go.  */
+       vstr    d0, [dst, #-40]
+       vldr    d0, [src, #-32] /* 8 words to go.  */
+       vstr    d0, [dst, #-32]
+       vldr    d0, [src, #-24] /* 6 words to go.  */
+       vstr    d0, [dst, #-24]
+       vldr    d0, [src, #-16] /* 4 words to go.  */
+       vstr    d0, [dst, #-16]
+       vldr    d0, [src, #-8]  /* 2 words to go.  */
+       vstr    d0, [dst, #-8]
+#else
+       sub     src, src, #8
+       sub     dst, dst, #8
+1:
+       ldrd    A_l, A_h, [src, #8]
+       strd    A_l, A_h, [dst, #8]
+       ldrd    A_l, A_h, [src, #16]
+       strd    A_l, A_h, [dst, #16]
+       ldrd    A_l, A_h, [src, #24]
+       strd    A_l, A_h, [dst, #24]
+       ldrd    A_l, A_h, [src, #32]
+       strd    A_l, A_h, [dst, #32]
+       ldrd    A_l, A_h, [src, #40]
+       strd    A_l, A_h, [dst, #40]
+       ldrd    A_l, A_h, [src, #48]
+       strd    A_l, A_h, [dst, #48]
+       ldrd    A_l, A_h, [src, #56]
+       strd    A_l, A_h, [dst, #56]
+       ldrd    A_l, A_h, [src, #64]!
+       strd    A_l, A_h, [dst, #64]!
+       subs    tmp2, tmp2, #64
+       bhs     1b
+       tst     tmp2, #0x3f
+       bne     1f
+       ldr     tmp2,[sp], #FRAME_SIZE
+       cfi_adjust_cfa_offset (-FRAME_SIZE)
+       cfi_restore (tmp2)
+       bx      lr
+
+       cfi_restore_state
+       cfi_remember_state
+1:
+       add     src, src, #8
+       add     dst, dst, #8
+
+.Ltail63aligned:                       /* Count in tmp2.  */
+       /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
+          we know that the src and dest are 32-bit aligned so we can use
+          LDRD/STRD to improve efficiency.  */
+       /* TMP2 is now negative, but we don't care about that.  The bottom
+          six bits still tell us how many bytes are left to copy.  */
+
+       and     tmp1, tmp2, #0x38
+       add     dst, dst, tmp1
+       add     src, src, tmp1
+       rsb     tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
+       add     pc, pc, tmp1
+       ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
+       strd    A_l, A_h, [dst, #-56]
+       ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
+       strd    A_l, A_h, [dst, #-48]
+       ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
+       strd    A_l, A_h, [dst, #-40]
+       ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
+       strd    A_l, A_h, [dst, #-32]
+       ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
+       strd    A_l, A_h, [dst, #-24]
+       ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
+       strd    A_l, A_h, [dst, #-16]
+       ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
+       strd    A_l, A_h, [dst, #-8]
+
+#endif
+       tst     tmp2, #4
+       ldrne   tmp1, [src], #4
+       strne   tmp1, [dst], #4
+       lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead. */
+       ldrhcs  tmp1, [src], #2
+       ldrbne  tmp2, [src]
+       strhcs  tmp1, [dst], #2
+       strbne  tmp2, [dst]
+
+.Ldone:
+       ldr     tmp2, [sp], #FRAME_SIZE
+       cfi_adjust_cfa_offset (-FRAME_SIZE)
+       cfi_restore (tmp2)
+       bx      lr
+
+       cfi_restore_state
+       cfi_remember_state
+
+.Lcpy_body_long:                       /* Count in tmp2.  */
+
+       /* Long copy.  We know that there are at least (prefetch_lines * 64)
+          bytes to go.  */
+#ifdef USE_VFP
+       /* Don't use PLD.  Instead, read some data in advance of the current
+          copy position into a register.  This should act like a PLD
+          operation but we won't have to repeat the transfer.  */
+
+       vldr    d3, [src, #0]
+       vldr    d4, [src, #64]
+       vldr    d5, [src, #128]
+       vldr    d6, [src, #192]
+       vldr    d7, [src, #256]
+
+       vldr    d0, [src, #8]
+       vldr    d1, [src, #16]
+       vldr    d2, [src, #24]
+       add     src, src, #32
+
+       subs    tmp2, tmp2, #prefetch_lines * 64 * 2
+       blo     2f
+1:
+       cpy_line_vfp    d3, 0
+       cpy_line_vfp    d4, 64
+       cpy_line_vfp    d5, 128
+       add     dst, dst, #3 * 64
+       add     src, src, #3 * 64
+       cpy_line_vfp    d6, 0
+       cpy_line_vfp    d7, 64
+       add     dst, dst, #2 * 64
+       add     src, src, #2 * 64
+       subs    tmp2, tmp2, #prefetch_lines * 64
+       bhs     1b
+
+2:
+       cpy_tail_vfp    d3, 0
+       cpy_tail_vfp    d4, 64
+       cpy_tail_vfp    d5, 128
+       add     src, src, #3 * 64
+       add     dst, dst, #3 * 64
+       cpy_tail_vfp    d6, 0
+       vstr    d7, [dst, #64]
+       vldr    d7, [src, #64]
+       vstr    d0, [dst, #64 + 8]
+       vldr    d0, [src, #64 + 8]
+       vstr    d1, [dst, #64 + 16]
+       vldr    d1, [src, #64 + 16]
+       vstr    d2, [dst, #64 + 24]
+       vldr    d2, [src, #64 + 24]
+       vstr    d7, [dst, #64 + 32]
+       add     src, src, #96
+       vstr    d0, [dst, #64 + 40]
+       vstr    d1, [dst, #64 + 48]
+       vstr    d2, [dst, #64 + 56]
+       add     dst, dst, #128
+       add     tmp2, tmp2, #prefetch_lines * 64
+       b       .Lcpy_body_medium
+#else
+       /* Long copy.  Use an SMS style loop to maximize the I/O
+          bandwidth of the core.  We don't have enough spare registers
+          to synthesise prefetching, so use PLD operations.  */
+       /* Pre-bias src and dst.  */
+       sub     src, src, #8
+       sub     dst, dst, #8
+       pld     [src, #8]
+       pld     [src, #72]
+       subs    tmp2, tmp2, #64
+       pld     [src, #136]
+       ldrd    A_l, A_h, [src, #8]
+       strd    B_l, B_h, [sp, #8]
+       cfi_rel_offset (B_l, 8)
+       cfi_rel_offset (B_h, 12)
+       ldrd    B_l, B_h, [src, #16]
+       strd    C_l, C_h, [sp, #16]
+       cfi_rel_offset (C_l, 16)
+       cfi_rel_offset (C_h, 20)
+       ldrd    C_l, C_h, [src, #24]
+       strd    D_l, D_h, [sp, #24]
+       cfi_rel_offset (D_l, 24)
+       cfi_rel_offset (D_h, 28)
+       pld     [src, #200]
+       ldrd    D_l, D_h, [src, #32]!
+       b       1f
+       .p2align        6
+2:
+       pld     [src, #232]
+       strd    A_l, A_h, [dst, #40]
+       ldrd    A_l, A_h, [src, #40]
+       strd    B_l, B_h, [dst, #48]
+       ldrd    B_l, B_h, [src, #48]
+       strd    C_l, C_h, [dst, #56]
+       ldrd    C_l, C_h, [src, #56]
+       strd    D_l, D_h, [dst, #64]!
+       ldrd    D_l, D_h, [src, #64]!
+       subs    tmp2, tmp2, #64
+1:
+       strd    A_l, A_h, [dst, #8]
+       ldrd    A_l, A_h, [src, #8]
+       strd    B_l, B_h, [dst, #16]
+       ldrd    B_l, B_h, [src, #16]
+       strd    C_l, C_h, [dst, #24]
+       ldrd    C_l, C_h, [src, #24]
+       strd    D_l, D_h, [dst, #32]
+       ldrd    D_l, D_h, [src, #32]
+       bcs     2b
+       /* Save the remaining bytes and restore the callee-saved regs.  */
+       strd    A_l, A_h, [dst, #40]
+       add     src, src, #40
+       strd    B_l, B_h, [dst, #48]
+       ldrd    B_l, B_h, [sp, #8]
+       cfi_restore (B_l)
+       cfi_restore (B_h)
+       strd    C_l, C_h, [dst, #56]
+       ldrd    C_l, C_h, [sp, #16]
+       cfi_restore (C_l)
+       cfi_restore (C_h)
+       strd    D_l, D_h, [dst, #64]
+       ldrd    D_l, D_h, [sp, #24]
+       cfi_restore (D_l)
+       cfi_restore (D_h)
+       add     dst, dst, #72
+       tst     tmp2, #0x3f
+       bne     .Ltail63aligned
+       ldr     tmp2, [sp], #FRAME_SIZE
+       cfi_adjust_cfa_offset (-FRAME_SIZE)
+       cfi_restore (tmp2)
+       bx      lr
+#endif
+
+       cfi_restore_state
+       cfi_remember_state
+
+.Lcpy_notaligned:
+       pld     [src]
+       pld     [src, #64]
+       /* There are at least 64 bytes to copy, but there is no mutual
+          alignment.  */
+       /* Bring DST to 64-bit alignment.  */
+       lsls    tmp2, dst, #29
+       pld     [src, #(2 * 64)]
+       beq     1f
+       rsbs    tmp2, tmp2, #0
+       sub     count, count, tmp2, lsr #29
+       ldrmi   tmp1, [src], #4
+       strmi   tmp1, [dst], #4
+       lsls    tmp2, tmp2, #2
+       ldrbne  tmp1, [src], #1
+       ldrhcs  tmp2, [src], #2
+       strbne  tmp1, [dst], #1
+       strhcs  tmp2, [dst], #2
+1:
+       pld     [src, #(3 * 64)]
+       subs    count, count, #64
+       ldrlo   tmp2, [sp], #FRAME_SIZE
+       blo     .Ltail63unaligned
+       pld     [src, #(4 * 64)]
+
+#ifdef USE_NEON
+       vld1.8  {d0-d3}, [src]!
+       vld1.8  {d4-d7}, [src]!
+       subs    count, count, #64
+       blo     2f
+1:
+       pld     [src, #(4 * 64)]
+       vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
+       vld1.8  {d0-d3}, [src]!
+       vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
+       vld1.8  {d4-d7}, [src]!
+       subs    count, count, #64
+       bhs     1b
+2:
+       vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
+       vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
+       ands    count, count, #0x3f
+#else
+       /* Use an SMS style loop to maximize the I/O bandwidth.  */
+       sub     src, src, #4
+       sub     dst, dst, #8
+       subs    tmp2, count, #64        /* Use tmp2 for count.  */
+       ldr     A_l, [src, #4]
+       ldr     A_h, [src, #8]
+       strd    B_l, B_h, [sp, #8]
+       cfi_rel_offset (B_l, 8)
+       cfi_rel_offset (B_h, 12)
+       ldr     B_l, [src, #12]
+       ldr     B_h, [src, #16]
+       strd    C_l, C_h, [sp, #16]
+       cfi_rel_offset (C_l, 16)
+       cfi_rel_offset (C_h, 20)
+       ldr     C_l, [src, #20]
+       ldr     C_h, [src, #24]
+       strd    D_l, D_h, [sp, #24]
+       cfi_rel_offset (D_l, 24)
+       cfi_rel_offset (D_h, 28)
+       ldr     D_l, [src, #28]
+       ldr     D_h, [src, #32]!
+       b       1f
+       .p2align        6
+2:
+       pld     [src, #(5 * 64) - (32 - 4)]
+       strd    A_l, A_h, [dst, #40]
+       ldr     A_l, [src, #36]
+       ldr     A_h, [src, #40]
+       strd    B_l, B_h, [dst, #48]
+       ldr     B_l, [src, #44]
+       ldr     B_h, [src, #48]
+       strd    C_l, C_h, [dst, #56]
+       ldr     C_l, [src, #52]
+       ldr     C_h, [src, #56]
+       strd    D_l, D_h, [dst, #64]!
+       ldr     D_l, [src, #60]
+       ldr     D_h, [src, #64]!
+       subs    tmp2, tmp2, #64
+1:
+       strd    A_l, A_h, [dst, #8]
+       ldr     A_l, [src, #4]
+       ldr     A_h, [src, #8]
+       strd    B_l, B_h, [dst, #16]
+       ldr     B_l, [src, #12]
+       ldr     B_h, [src, #16]
+       strd    C_l, C_h, [dst, #24]
+       ldr     C_l, [src, #20]
+       ldr     C_h, [src, #24]
+       strd    D_l, D_h, [dst, #32]
+       ldr     D_l, [src, #28]
+       ldr     D_h, [src, #32]
+       bcs     2b
+
+       /* Save the remaining bytes and restore the callee-saved regs.  */
+       strd    A_l, A_h, [dst, #40]
+       add     src, src, #36
+       strd    B_l, B_h, [dst, #48]
+       ldrd    B_l, B_h, [sp, #8]
+       cfi_restore (B_l)
+       cfi_restore (B_h)
+       strd    C_l, C_h, [dst, #56]
+       ldrd    C_l, C_h, [sp, #16]
+       cfi_restore (C_l)
+       cfi_restore (C_h)
+       strd    D_l, D_h, [dst, #64]
+       ldrd    D_l, D_h, [sp, #24]
+       cfi_restore (D_l)
+       cfi_restore (D_h)
+       add     dst, dst, #72
+       ands    count, tmp2, #0x3f
+#endif
+       ldr     tmp2, [sp], #FRAME_SIZE
+       cfi_adjust_cfa_offset (-FRAME_SIZE)
+       cfi_restore (tmp2)
+       bne     .Ltail63unaligned
+       bx      lr
+
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S

new file mode 100644 (file)

index 0000000..c0ef1f8
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
@@ -0,0 +1,3 @@
+#define MEMCPY_NEON
+#define memcpy __memcpy_neon
+#include "memcpy_impl.S"
diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S

new file mode 100644 (file)

index 0000000..d21b702
--- /dev/null
+++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
@@ -0,0 +1,3 @@
+#define MEMCPY_VFP
+#define memcpy __memcpy_vfp
+#include "memcpy_impl.S"
author	Will Newton <will.newton@linaro.org>
	Wed, 8 May 2013 12:06:34 +0000 (12:06 +0000)
committer	Joseph Myers <joseph@codesourcery.com>
	Wed, 8 May 2013 12:06:34 +0000 (12:06 +0000)
ports/ChangeLog.arm		patch \| blob \| blame \| history
ports/sysdeps/arm/armv7/configure	[new file with mode: 0755]	patch \| blob
ports/sysdeps/arm/armv7/configure.in	[new file with mode: 0644]	patch \| blob
ports/sysdeps/arm/armv7/multiarch/Makefile	[new file with mode: 0644]	patch \| blob
ports/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c	[new file with mode: 0644]	patch \| blob
ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c	[new file with mode: 0644]	patch \| blob
ports/sysdeps/arm/armv7/multiarch/memcpy.S	[new file with mode: 0644]	patch \| blob
ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S	[new file with mode: 0644]	patch \| blob
ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S	[new file with mode: 0644]	patch \| blob
ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S	[new file with mode: 0644]	patch \| blob