[PATCH] ARM: Add Cortex-A15 optimized NEON and VFP memcpy routines, with IFUNC.

From: Will Newton <will.newton@linaro.org>
Date: Mon Apr 15 10:01:00 GMT 2013


Attached is a set of benchmarks comparing the new code with the existing
memcpy implementation on a Cortex-A15 platform.

On 15 April 2013 10:56, Will Newton <will.newton@linaro.org> wrote:
>
> Add a high performance memcpy routine optimized for Cortex-A15 with
> variants for use in the presence of NEON and VFP hardware, selected
> at runtime using indirect function support.
>
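For anyone unfamiliar with indirect functions, the selection is roughly
equivalent to the C sketch below.  The variant names match the patch, but
the hwcap bit values and the resolver shape are illustrative assumptions
(on ARM the dynamic linker passes dl_hwcap to the resolver as its first
argument, which is why the assembly in memcpy.S tests a1), and building
it requires GNU ifunc support in the toolchain:

    #include <stddef.h>

    /* Linux ARM hwcap bits, hard-coded here for illustration only.  */
    #define HWCAP_ARM_VFP   (1 << 6)
    #define HWCAP_ARM_NEON  (1 << 12)

    extern void *__memcpy_neon (void *, const void *, size_t);
    extern void *__memcpy_vfp (void *, const void *, size_t);
    extern void *__memcpy_arm (void *, const void *, size_t);

    typedef void *(*memcpy_fn) (void *, const void *, size_t);

    /* Resolver: runs once at relocation time and returns the variant
       that all later calls to memcpy will be bound to.  */
    static memcpy_fn
    memcpy_resolver (unsigned long int hwcap)
    {
      if (hwcap & HWCAP_ARM_NEON)
        return __memcpy_neon;
      if (hwcap & HWCAP_ARM_VFP)
        return __memcpy_vfp;
      return __memcpy_arm;
    }

    void *memcpy (void *, const void *, size_t)
      __attribute__ ((ifunc ("memcpy_resolver")));
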
> This was tested on armv7l-unknown-linux-gnueabihf. One new testsuite
> failure was introduced (elf/ifuncmain5picstatic), which was caused by
> a bug in ld. A fix for that ld issue has been submitted here:
>
>   http://sourceware.org/ml/binutils/2013-04/msg00143.html
>
> ports/ChangeLog.arm:
>
> 2013-04-15  Will Newton  <will.newton@linaro.org>
>
>         * sysdeps/arm/armv7/multiarch/Makefile: New file.
>         * sysdeps/arm/armv7/multiarch/ifunc-impl-list.c: Likewise.
>         * sysdeps/arm/armv7/multiarch/memcpy.S: Likewise.
>         * sysdeps/arm/armv7/multiarch/memcpy_impl.S: Likewise.
>         * sysdeps/arm/armv7/multiarch/memcpy_neon.S: Likewise.
>         * sysdeps/arm/armv7/multiarch/memcpy_vfp.S: Likewise.
>
> Signed-off-by: Will Newton <will.newton@linaro.org>
> ---
>  ports/sysdeps/arm/armv7/multiarch/Makefile         |   3 +
>  .../sysdeps/arm/armv7/multiarch/ifunc-impl-list.c  |  46 ++
>  ports/sysdeps/arm/armv7/multiarch/memcpy.S         |  96 ++++
>  ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S    | 600 +++++++++++++++++++++
>  ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S    |   3 +
>  ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S     |   3 +
>  6 files changed, 751 insertions(+)
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/Makefile
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy.S
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
>
> diff --git a/ports/sysdeps/arm/armv7/multiarch/Makefile b/ports/sysdeps/arm/armv7/multiarch/Makefile
> new file mode 100644
> index 0000000..e834cc9
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/Makefile
> @@ -0,0 +1,3 @@
> +ifeq ($(subdir),string)
> +sysdep_routines += memcpy_neon memcpy_vfp
> +endif
> diff --git a/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
> new file mode 100644
> index 0000000..176288b
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
> @@ -0,0 +1,46 @@
> +/* Enumerate available IFUNC implementations of a function.  arm version.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <assert.h>
> +#include <string.h>
> +#include <wchar.h>
> +#include <ldsodefs.h>
> +#include <sysdep.h>
> +#include <ifunc-impl-list.h>
> +
> +/* Fill ARRAY of MAX elements with IFUNC implementations for function
> +   NAME and return the number of valid entries.  */
> +
> +size_t
> +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> +                       size_t max)
> +{
> +  size_t i = 0;
> +  int hwcap;
> +
> +  hwcap = GLRO(dl_hwcap);
> +
> +  IFUNC_IMPL (i, name, memcpy,
> +             IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_NEON,
> +                             __memcpy_neon)
> +             IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_VFPv3,
> +                             __memcpy_vfp)
> +             IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
> +
> +  return i;
> +}
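
This table is what exposes each variant to the glibc string tests and
benchtests.  Conceptually a consumer walks it as below (the struct
layout is an assumption shown for illustration; the symbol is internal
to glibc, not a public API):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Assumed layout of the entry type from ifunc-impl-list.h.  */
    struct libc_ifunc_impl
    {
      const char *name;    /* e.g. "__memcpy_neon" */
      void (*fn) (void);   /* the implementation itself */
      bool usable;         /* did its hwcap test pass?  */
    };

    extern size_t __libc_ifunc_impl_list (const char *name,
                                          struct libc_ifunc_impl *array,
                                          size_t max);

    /* Print every registered memcpy variant and whether this
       machine can use it.  */
    static void
    list_memcpy_impls (void)
    {
      struct libc_ifunc_impl impls[8];
      size_t n = __libc_ifunc_impl_list ("memcpy", impls, 8);
      for (size_t i = 0; i < n; i++)
        printf ("%s%s\n", impls[i].name,
                impls[i].usable ? "" : " (not usable here)");
    }
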
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy.S b/ports/sysdeps/arm/armv7/multiarch/memcpy.S
> new file mode 100644
> index 0000000..a9e2faf
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy.S
> @@ -0,0 +1,96 @@
> +/* Multiple versions of memcpy
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include <rtld-global-offsets.h>
> +#include <arm-features.h>
> +
> +#if !defined NOT_IN_libc
> +       .text
> +ENTRY(memcpy)
> +       .type   memcpy, %gnu_indirect_function
> +# ifdef PIC
> +       ldr     a3, 1f
> +0:     add     a3, pc, a3
> +# endif
> +
> +       tst     a1, #HWCAP_ARM_NEON
> +       beq     .Lno_neon
> +# ifdef PIC
> +       ldr     a4, .Lmemcpy_neon
> +       ldr     r0, [a3, a4]
> +# else
> +       ldr     r0, .Lmemcpy_neon
> +# endif
> +       b       .Lreturn
> +.Lno_neon:
> +
> +       tst     a1, #HWCAP_ARM_VFP
> +       beq     .Lno_vfp
> +# ifdef PIC
> +       ldr     a4, .Lmemcpy_vfp
> +       ldr     r0, [a3, a4]
> +# else
> +       ldr     r0, .Lmemcpy_vfp
> +# endif
> +       b       .Lreturn
> +.Lno_vfp:
> +# ifdef PIC
> +       ldr     a4, .Lmemcpy_arm
> +       ldr     r0, [a3, a4]
> +# else
> +       ldr     r0, .Lmemcpy_arm
> +# endif
> +
> +.Lreturn:
> +       DO_RET(lr)
> +
> +# ifdef PIC
> +1:     .long   _GLOBAL_OFFSET_TABLE_ - 0b - PC_OFS
> +.Lmemcpy_neon:
> +       .long   C_SYMBOL_NAME(__memcpy_neon)(GOT)
> +.Lmemcpy_vfp:
> +       .long   C_SYMBOL_NAME(__memcpy_vfp)(GOT)
> +.Lmemcpy_arm:
> +       .long   C_SYMBOL_NAME(__memcpy_arm)(GOT)
> +# else
> +.Lmemcpy_neon:
> +       .long   C_SYMBOL_NAME(__memcpy_neon)
> +.Lmemcpy_vfp:
> +       .long   C_SYMBOL_NAME(__memcpy_vfp)
> +.Lmemcpy_arm:
> +       .long   C_SYMBOL_NAME(__memcpy_arm)
> +# endif
> +
> +END(memcpy)
> +
> +libc_hidden_builtin_def (memcpy)
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +#undef weak_alias
> +#define weak_alias(x, y)
> +#undef libc_hidden_def
> +#define libc_hidden_def(name)
> +
> +#define memcpy __memcpy_arm
> +
> +#endif
> +
> +#include "memcpy_impl.S"
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
> new file mode 100644
> index 0000000..2c466d25
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
> @@ -0,0 +1,600 @@
> +/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.
> +
> +   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
> +   of VFP or NEON when built with the appropriate flags.
> +
> +   Assumptions:
> +
> +    ARMv6 (ARMv7-a if using Neon)
> +    ARM state
> +    Unaligned accesses
> +    LDRD/STRD support unaligned word accesses
> +
> + */
> +
> +#include <sysdep.h>
> +
> +       .syntax unified
> +       /* This implementation requires ARM state.  */
> +       .arm
> +
> +#ifdef MEMCPY_NEON
> +
> +       .fpu    neon
> +       .arch   armv7-a
> +# define FRAME_SIZE    4
> +# define USE_VFP
> +# define USE_NEON
> +
> +#elif defined (MEMCPY_VFP)
> +
> +       .arch   armv6
> +       .fpu    vfpv2
> +# define FRAME_SIZE    32
> +# define USE_VFP
> +
> +#else
> +       .arch   armv6
> +# define FRAME_SIZE    32
> +
> +#endif
> +
> +/* Old versions of GAS incorrectly implement the NEON align semantics.  */
> +#ifdef BROKEN_ASM_NEON_ALIGN
> +#define ALIGN(addr, align) addr,:align
> +#else
> +#define ALIGN(addr, align) addr:align
> +#endif
> +
> +#define PC_OFFSET      8       /* PC pipeline compensation.  */
> +#define INSN_SIZE      4
> +
> +/* Call parameters.  */
> +#define dstin  r0
> +#define src    r1
> +#define count  r2
> +
> +/* Locals.  */
> +#define tmp1   r3
> +#define dst    ip
> +#define tmp2   r10
> +
> +#ifndef USE_NEON
> +/* For bulk copies using GP registers.  */
> +#define        A_l     r2              /* Call-clobbered.  */
> +#define        A_h     r3              /* Call-clobbered.  */
> +#define        B_l     r4
> +#define        B_h     r5
> +#define        C_l     r6
> +#define        C_h     r7
> +#define        D_l     r8
> +#define        D_h     r9
> +#endif
> +
> +/* Number of lines ahead to pre-fetch data.  If you change this the code
> +   below will need adjustment to compensate.  */
> +
> +#define prefetch_lines 5
> +
> +#ifdef USE_VFP
> +       .macro  cpy_line_vfp vreg, base
> +       vstr    \vreg, [dst, #\base]
> +       vldr    \vreg, [src, #\base]
> +       vstr    d0, [dst, #\base + 8]
> +       vldr    d0, [src, #\base + 8]
> +       vstr    d1, [dst, #\base + 16]
> +       vldr    d1, [src, #\base + 16]
> +       vstr    d2, [dst, #\base + 24]
> +       vldr    d2, [src, #\base + 24]
> +       vstr    \vreg, [dst, #\base + 32]
> +       vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
> +       vstr    d0, [dst, #\base + 40]
> +       vldr    d0, [src, #\base + 40]
> +       vstr    d1, [dst, #\base + 48]
> +       vldr    d1, [src, #\base + 48]
> +       vstr    d2, [dst, #\base + 56]
> +       vldr    d2, [src, #\base + 56]
> +       .endm
> +
> +       .macro  cpy_tail_vfp vreg, base
> +       vstr    \vreg, [dst, #\base]
> +       vldr    \vreg, [src, #\base]
> +       vstr    d0, [dst, #\base + 8]
> +       vldr    d0, [src, #\base + 8]
> +       vstr    d1, [dst, #\base + 16]
> +       vldr    d1, [src, #\base + 16]
> +       vstr    d2, [dst, #\base + 24]
> +       vldr    d2, [src, #\base + 24]
> +       vstr    \vreg, [dst, #\base + 32]
> +       vstr    d0, [dst, #\base + 40]
> +       vldr    d0, [src, #\base + 40]
> +       vstr    d1, [dst, #\base + 48]
> +       vldr    d1, [src, #\base + 48]
> +       vstr    d2, [dst, #\base + 56]
> +       vldr    d2, [src, #\base + 56]
> +       .endm
> +#endif
> +
> +       .p2align 6
> +ENTRY(memcpy)
> +
> +       mov     dst, dstin      /* Preserve dstin, we need to return it.  */
> +       cmp     count, #64
> +       bge     .Lcpy_not_short
> +       /* Deal with small copies quickly by dropping straight into the
> +          exit block.  */
> +
> +.Ltail63unaligned:
> +#ifdef USE_NEON
> +       and     tmp1, count, #0x38
> +       rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> +       add     pc, pc, tmp1
> +       vld1.8  {d0}, [src]!    /* 14 words to go.  */
> +       vst1.8  {d0}, [dst]!
> +       vld1.8  {d0}, [src]!    /* 12 words to go.  */
> +       vst1.8  {d0}, [dst]!
> +       vld1.8  {d0}, [src]!    /* 10 words to go.  */
> +       vst1.8  {d0}, [dst]!
> +       vld1.8  {d0}, [src]!    /* 8 words to go.  */
> +       vst1.8  {d0}, [dst]!
> +       vld1.8  {d0}, [src]!    /* 6 words to go.  */
> +       vst1.8  {d0}, [dst]!
> +       vld1.8  {d0}, [src]!    /* 4 words to go.  */
> +       vst1.8  {d0}, [dst]!
> +       vld1.8  {d0}, [src]!    /* 2 words to go.  */
> +       vst1.8  {d0}, [dst]!
> +
> +       tst     count, #4
> +       ldrne   tmp1, [src], #4
> +       strne   tmp1, [dst], #4
> +#else
> +       /* Copy up to 15 full words of data.  May not be aligned.  */
> +       /* Cannot use VFP for unaligned data.  */
> +       and     tmp1, count, #0x3c
> +       add     dst, dst, tmp1
> +       add     src, src, tmp1
> +       rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
> +       /* Jump directly into the sequence below at the correct offset.  */
> +       add     pc, pc, tmp1, lsl #1
> +
> +       ldr     tmp1, [src, #-60]       /* 15 words to go.  */
> +       str     tmp1, [dst, #-60]
> +
> +       ldr     tmp1, [src, #-56]       /* 14 words to go.  */
> +       str     tmp1, [dst, #-56]
> +       ldr     tmp1, [src, #-52]
> +       str     tmp1, [dst, #-52]
> +
> +       ldr     tmp1, [src, #-48]       /* 12 words to go.  */
> +       str     tmp1, [dst, #-48]
> +       ldr     tmp1, [src, #-44]
> +       str     tmp1, [dst, #-44]
> +
> +       ldr     tmp1, [src, #-40]       /* 10 words to go.  */
> +       str     tmp1, [dst, #-40]
> +       ldr     tmp1, [src, #-36]
> +       str     tmp1, [dst, #-36]
> +
> +       ldr     tmp1, [src, #-32]       /* 8 words to go.  */
> +       str     tmp1, [dst, #-32]
> +       ldr     tmp1, [src, #-28]
> +       str     tmp1, [dst, #-28]
> +
> +       ldr     tmp1, [src, #-24]       /* 6 words to go.  */
> +       str     tmp1, [dst, #-24]
> +       ldr     tmp1, [src, #-20]
> +       str     tmp1, [dst, #-20]
> +
> +       ldr     tmp1, [src, #-16]       /* 4 words to go.  */
> +       str     tmp1, [dst, #-16]
> +       ldr     tmp1, [src, #-12]
> +       str     tmp1, [dst, #-12]
> +
> +       ldr     tmp1, [src, #-8]        /* 2 words to go.  */
> +       str     tmp1, [dst, #-8]
> +       ldr     tmp1, [src, #-4]
> +       str     tmp1, [dst, #-4]
> +#endif
> +
> +       lsls    count, count, #31
> +       ldrhcs  tmp1, [src], #2
> +       ldrbne  src, [src]              /* Src is dead, use as a scratch.  */
> +       strhcs  tmp1, [dst], #2
> +       strbne  src, [dst]
> +       bx      lr
> +
> +.Lcpy_not_short:
> +       /* At least 64 bytes to copy, but don't know the alignment yet.  */
> +       str     tmp2, [sp, #-FRAME_SIZE]!
> +       and     tmp2, src, #3
> +       and     tmp1, dst, #3
> +       cmp     tmp1, tmp2
> +       bne     .Lcpy_notaligned
> +
> +#ifdef USE_VFP
> +       /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
> +          that the FP pipeline is much better at streaming loads and
> +          stores.  This is outside the critical loop.  */
> +       vmov.f32        s0, s0
> +#endif
> +
> +       /* SRC and DST have the same mutual 32-bit alignment, but we may
> +          still need to pre-copy some bytes to get to natural alignment.
> +          We bring DST into full 64-bit alignment.  */
> +       lsls    tmp2, dst, #29
> +       beq     1f
> +       rsbs    tmp2, tmp2, #0
> +       sub     count, count, tmp2, lsr #29
> +       ldrmi   tmp1, [src], #4
> +       strmi   tmp1, [dst], #4
> +       lsls    tmp2, tmp2, #2
> +       ldrhcs  tmp1, [src], #2
> +       ldrbne  tmp2, [src], #1
> +       strhcs  tmp1, [dst], #2
> +       strbne  tmp2, [dst], #1
> +
> +1:
> +       subs    tmp2, count, #64        /* Use tmp2 for count.  */
> +       blt     .Ltail63aligned
> +
> +       cmp     tmp2, #512
> +       bge     .Lcpy_body_long
> +
> +.Lcpy_body_medium:                     /* Count in tmp2.  */
> +#ifdef USE_VFP
> +1:
> +       vldr    d0, [src, #0]
> +       subs    tmp2, tmp2, #64
> +       vldr    d1, [src, #8]
> +       vstr    d0, [dst, #0]
> +       vldr    d0, [src, #16]
> +       vstr    d1, [dst, #8]
> +       vldr    d1, [src, #24]
> +       vstr    d0, [dst, #16]
> +       vldr    d0, [src, #32]
> +       vstr    d1, [dst, #24]
> +       vldr    d1, [src, #40]
> +       vstr    d0, [dst, #32]
> +       vldr    d0, [src, #48]
> +       vstr    d1, [dst, #40]
> +       vldr    d1, [src, #56]
> +       vstr    d0, [dst, #48]
> +       add     src, src, #64
> +       vstr    d1, [dst, #56]
> +       add     dst, dst, #64
> +       bge     1b
> +       tst     tmp2, #0x3f
> +       beq     .Ldone
> +
> +.Ltail63aligned:                       /* Count in tmp2.  */
> +       and     tmp1, tmp2, #0x38
> +       add     dst, dst, tmp1
> +       add     src, src, tmp1
> +       rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> +       add     pc, pc, tmp1
> +
> +       vldr    d0, [src, #-56] /* 14 words to go.  */
> +       vstr    d0, [dst, #-56]
> +       vldr    d0, [src, #-48] /* 12 words to go.  */
> +       vstr    d0, [dst, #-48]
> +       vldr    d0, [src, #-40] /* 10 words to go.  */
> +       vstr    d0, [dst, #-40]
> +       vldr    d0, [src, #-32] /* 8 words to go.  */
> +       vstr    d0, [dst, #-32]
> +       vldr    d0, [src, #-24] /* 6 words to go.  */
> +       vstr    d0, [dst, #-24]
> +       vldr    d0, [src, #-16] /* 4 words to go.  */
> +       vstr    d0, [dst, #-16]
> +       vldr    d0, [src, #-8]  /* 2 words to go.  */
> +       vstr    d0, [dst, #-8]
> +#else
> +       sub     src, src, #8
> +       sub     dst, dst, #8
> +1:
> +       ldrd    A_l, A_h, [src, #8]
> +       strd    A_l, A_h, [dst, #8]
> +       ldrd    A_l, A_h, [src, #16]
> +       strd    A_l, A_h, [dst, #16]
> +       ldrd    A_l, A_h, [src, #24]
> +       strd    A_l, A_h, [dst, #24]
> +       ldrd    A_l, A_h, [src, #32]
> +       strd    A_l, A_h, [dst, #32]
> +       ldrd    A_l, A_h, [src, #40]
> +       strd    A_l, A_h, [dst, #40]
> +       ldrd    A_l, A_h, [src, #48]
> +       strd    A_l, A_h, [dst, #48]
> +       ldrd    A_l, A_h, [src, #56]
> +       strd    A_l, A_h, [dst, #56]
> +       ldrd    A_l, A_h, [src, #64]!
> +       strd    A_l, A_h, [dst, #64]!
> +       subs    tmp2, tmp2, #64
> +       bge     1b
> +       tst     tmp2, #0x3f
> +       bne     1f
> +       ldr     tmp2,[sp], #FRAME_SIZE
> +       bx      lr
> +1:
> +       add     src, src, #8
> +       add     dst, dst, #8
> +
> +.Ltail63aligned:                       /* Count in tmp2.  */
> +       /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
> +          we know that the src and dest are 32-bit aligned so we can use
> +          LDRD/STRD to improve efficiency.  */
> +       /* TMP2 is now negative, but we don't care about that.  The bottom
> +          six bits still tell us how many bytes are left to copy.  */
> +
> +       and     tmp1, tmp2, #0x38
> +       add     dst, dst, tmp1
> +       add     src, src, tmp1
> +       rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> +       add     pc, pc, tmp1
> +       ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
> +       strd    A_l, A_h, [dst, #-56]
> +       ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
> +       strd    A_l, A_h, [dst, #-48]
> +       ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
> +       strd    A_l, A_h, [dst, #-40]
> +       ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
> +       strd    A_l, A_h, [dst, #-32]
> +       ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
> +       strd    A_l, A_h, [dst, #-24]
> +       ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
> +       strd    A_l, A_h, [dst, #-16]
> +       ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
> +       strd    A_l, A_h, [dst, #-8]
> +
> +#endif
> +       tst     tmp2, #4
> +       ldrne   tmp1, [src], #4
> +       strne   tmp1, [dst], #4
> +       lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead. */
> +       ldrhcs  tmp1, [src], #2
> +       ldrbne  tmp2, [src]
> +       strhcs  tmp1, [dst], #2
> +       strbne  tmp2, [dst]
> +
> +.Ldone:
> +       ldr     tmp2, [sp], #FRAME_SIZE
> +       bx      lr
> +
> +.Lcpy_body_long:                       /* Count in tmp2.  */
> +
> +       /* Long copy.  We know that there's at least (prefetch_lines * 64)
> +          bytes to go.  */
> +#ifdef USE_VFP
> +       /* Don't use PLD.  Instead, read some data in advance of the current
> +          copy position into a register.  This should act like a PLD
> +          operation but we won't have to repeat the transfer.  */
> +
> +       vldr    d3, [src, #0]
> +       vldr    d4, [src, #64]
> +       vldr    d5, [src, #128]
> +       vldr    d6, [src, #192]
> +       vldr    d7, [src, #256]
> +
> +       vldr    d0, [src, #8]
> +       vldr    d1, [src, #16]
> +       vldr    d2, [src, #24]
> +       add     src, src, #32
> +
> +       subs    tmp2, tmp2, #prefetch_lines * 64 * 2
> +       blt     2f
> +1:
> +       cpy_line_vfp    d3, 0
> +       cpy_line_vfp    d4, 64
> +       cpy_line_vfp    d5, 128
> +       add     dst, dst, #3 * 64
> +       add     src, src, #3 * 64
> +       cpy_line_vfp    d6, 0
> +       cpy_line_vfp    d7, 64
> +       add     dst, dst, #2 * 64
> +       add     src, src, #2 * 64
> +       subs    tmp2, tmp2, #prefetch_lines * 64
> +       bge     1b
> +
> +2:
> +       cpy_tail_vfp    d3, 0
> +       cpy_tail_vfp    d4, 64
> +       cpy_tail_vfp    d5, 128
> +       add     src, src, #3 * 64
> +       add     dst, dst, #3 * 64
> +       cpy_tail_vfp    d6, 0
> +       vstr    d7, [dst, #64]
> +       vldr    d7, [src, #64]
> +       vstr    d0, [dst, #64 + 8]
> +       vldr    d0, [src, #64 + 8]
> +       vstr    d1, [dst, #64 + 16]
> +       vldr    d1, [src, #64 + 16]
> +       vstr    d2, [dst, #64 + 24]
> +       vldr    d2, [src, #64 + 24]
> +       vstr    d7, [dst, #64 + 32]
> +       add     src, src, #96
> +       vstr    d0, [dst, #64 + 40]
> +       vstr    d1, [dst, #64 + 48]
> +       vstr    d2, [dst, #64 + 56]
> +       add     dst, dst, #128
> +       add     tmp2, tmp2, #prefetch_lines * 64
> +       b       .Lcpy_body_medium
> +#else
> +       /* Long copy.  Use an SMS style loop to maximize the I/O
> +          bandwidth of the core.  We don't have enough spare registers
> +          to synthesise prefetching, so use PLD operations.  */
> +       /* Pre-bias src and dst.  */
> +       sub     src, src, #8
> +       sub     dst, dst, #8
> +       pld     [src, #8]
> +       pld     [src, #72]
> +       subs    tmp2, tmp2, #64
> +       pld     [src, #136]
> +       ldrd    A_l, A_h, [src, #8]
> +       strd    B_l, B_h, [sp, #8]
> +       ldrd    B_l, B_h, [src, #16]
> +       strd    C_l, C_h, [sp, #16]
> +       ldrd    C_l, C_h, [src, #24]
> +       strd    D_l, D_h, [sp, #24]
> +       pld     [src, #200]
> +       ldrd    D_l, D_h, [src, #32]!
> +       b       1f
> +       .p2align        6
> +2:
> +       pld     [src, #232]
> +       strd    A_l, A_h, [dst, #40]
> +       ldrd    A_l, A_h, [src, #40]
> +       strd    B_l, B_h, [dst, #48]
> +       ldrd    B_l, B_h, [src, #48]
> +       strd    C_l, C_h, [dst, #56]
> +       ldrd    C_l, C_h, [src, #56]
> +       strd    D_l, D_h, [dst, #64]!
> +       ldrd    D_l, D_h, [src, #64]!
> +       subs    tmp2, tmp2, #64
> +1:
> +       strd    A_l, A_h, [dst, #8]
> +       ldrd    A_l, A_h, [src, #8]
> +       strd    B_l, B_h, [dst, #16]
> +       ldrd    B_l, B_h, [src, #16]
> +       strd    C_l, C_h, [dst, #24]
> +       ldrd    C_l, C_h, [src, #24]
> +       strd    D_l, D_h, [dst, #32]
> +       ldrd    D_l, D_h, [src, #32]
> +       bcs     2b
> +       /* Save the remaining bytes and restore the callee-saved regs.  */
> +       strd    A_l, A_h, [dst, #40]
> +       add     src, src, #40
> +       strd    B_l, B_h, [dst, #48]
> +       ldrd    B_l, B_h, [sp, #8]
> +       strd    C_l, C_h, [dst, #56]
> +       ldrd    C_l, C_h, [sp, #16]
> +       strd    D_l, D_h, [dst, #64]
> +       ldrd    D_l, D_h, [sp, #24]
> +       add     dst, dst, #72
> +       tst     tmp2, #0x3f
> +       bne     .Ltail63aligned
> +       ldr     tmp2, [sp], #FRAME_SIZE
> +       bx      lr
> +#endif
> +
> +.Lcpy_notaligned:
> +       pld     [src]
> +       pld     [src, #64]
> +       /* There's at least 64 bytes to copy, but there is no mutual
> +          alignment.  */
> +       /* Bring DST to 64-bit alignment.  */
> +       lsls    tmp2, dst, #29
> +       pld     [src, #(2 * 64)]
> +       beq     1f
> +       rsbs    tmp2, tmp2, #0
> +       sub     count, count, tmp2, lsr #29
> +       ldrmi   tmp1, [src], #4
> +       strmi   tmp1, [dst], #4
> +       lsls    tmp2, tmp2, #2
> +       ldrbne  tmp1, [src], #1
> +       ldrhcs  tmp2, [src], #2
> +       strbne  tmp1, [dst], #1
> +       strhcs  tmp2, [dst], #2
> +1:
> +       pld     [src, #(3 * 64)]
> +       subs    count, count, #64
> +       ldrmi   tmp2, [sp], #FRAME_SIZE
> +       bmi     .Ltail63unaligned
> +       pld     [src, #(4 * 64)]
> +
> +#ifdef USE_NEON
> +       vld1.8  {d0-d3}, [src]!
> +       vld1.8  {d4-d7}, [src]!
> +       subs    count, count, #64
> +       bmi     2f
> +1:
> +       pld     [src, #(4 * 64)]
> +       vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
> +       vld1.8  {d0-d3}, [src]!
> +       vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
> +       vld1.8  {d4-d7}, [src]!
> +       subs    count, count, #64
> +       bpl     1b
> +2:
> +       vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
> +       vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
> +       ands    count, count, #0x3f
> +#else
> +       /* Use an SMS style loop to maximize the I/O bandwidth.  */
> +       sub     src, src, #4
> +       sub     dst, dst, #8
> +       subs    tmp2, count, #64        /* Use tmp2 for count.  */
> +       ldr     A_l, [src, #4]
> +       ldr     A_h, [src, #8]
> +       strd    B_l, B_h, [sp, #8]
> +       ldr     B_l, [src, #12]
> +       ldr     B_h, [src, #16]
> +       strd    C_l, C_h, [sp, #16]
> +       ldr     C_l, [src, #20]
> +       ldr     C_h, [src, #24]
> +       strd    D_l, D_h, [sp, #24]
> +       ldr     D_l, [src, #28]
> +       ldr     D_h, [src, #32]!
> +       b       1f
> +       .p2align        6
> +2:
> +       pld     [src, #(5 * 64) - (32 - 4)]
> +       strd    A_l, A_h, [dst, #40]
> +       ldr     A_l, [src, #36]
> +       ldr     A_h, [src, #40]
> +       strd    B_l, B_h, [dst, #48]
> +       ldr     B_l, [src, #44]
> +       ldr     B_h, [src, #48]
> +       strd    C_l, C_h, [dst, #56]
> +       ldr     C_l, [src, #52]
> +       ldr     C_h, [src, #56]
> +       strd    D_l, D_h, [dst, #64]!
> +       ldr     D_l, [src, #60]
> +       ldr     D_h, [src, #64]!
> +       subs    tmp2, tmp2, #64
> +1:
> +       strd    A_l, A_h, [dst, #8]
> +       ldr     A_l, [src, #4]
> +       ldr     A_h, [src, #8]
> +       strd    B_l, B_h, [dst, #16]
> +       ldr     B_l, [src, #12]
> +       ldr     B_h, [src, #16]
> +       strd    C_l, C_h, [dst, #24]
> +       ldr     C_l, [src, #20]
> +       ldr     C_h, [src, #24]
> +       strd    D_l, D_h, [dst, #32]
> +       ldr     D_l, [src, #28]
> +       ldr     D_h, [src, #32]
> +       bcs     2b
> +
> +       /* Save the remaining bytes and restore the callee-saved regs.  */
> +       strd    A_l, A_h, [dst, #40]
> +       add     src, src, #36
> +       strd    B_l, B_h, [dst, #48]
> +       ldrd    B_l, B_h, [sp, #8]
> +       strd    C_l, C_h, [dst, #56]
> +       ldrd    C_l, C_h, [sp, #16]
> +       strd    D_l, D_h, [dst, #64]
> +       ldrd    D_l, D_h, [sp, #24]
> +       add     dst, dst, #72
> +       ands    count, tmp2, #0x3f
> +#endif
> +       ldr     tmp2, [sp], #FRAME_SIZE
> +       bne     .Ltail63unaligned
> +       bx      lr
> +
> +END(memcpy)
> +libc_hidden_builtin_def (memcpy)
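
A note on the tail-copy blocks above: the "add pc, pc, tmp1" sequences
compute how many words remain and jump into the middle of an unrolled
copy, so exactly the right number of loads and stores execute with no
loop overhead.  The same dispatch in C is a switch with deliberate
fall-through; here is a sketch of the word-copy part of
.Ltail63unaligned (names illustrative, final 0-3 byte tail omitted):

    #include <stddef.h>
    #include <stdint.h>

    /* Copy the final (count & 0x3c) bytes of a block as whole words by
       entering an unrolled sequence at the right point, mirroring the
       PC-relative jump in the assembly.  */
    static void
    copy_tail_words (uint32_t *dst, const uint32_t *src, size_t count)
    {
      size_t words = (count & 0x3c) / 4;
      dst += words;                 /* Pre-advance, then use negative */
      src += words;                 /* offsets, as the assembly does.  */
      switch (words)
        {
        case 15: dst[-15] = src[-15];  /* fall through */
        case 14: dst[-14] = src[-14];  /* fall through */
        case 13: dst[-13] = src[-13];  /* fall through */
        case 12: dst[-12] = src[-12];  /* fall through */
        case 11: dst[-11] = src[-11];  /* fall through */
        case 10: dst[-10] = src[-10];  /* fall through */
        case 9: dst[-9] = src[-9];     /* fall through */
        case 8: dst[-8] = src[-8];     /* fall through */
        case 7: dst[-7] = src[-7];     /* fall through */
        case 6: dst[-6] = src[-6];     /* fall through */
        case 5: dst[-5] = src[-5];     /* fall through */
        case 4: dst[-4] = src[-4];     /* fall through */
        case 3: dst[-3] = src[-3];     /* fall through */
        case 2: dst[-2] = src[-2];     /* fall through */
        case 1: dst[-1] = src[-1];     /* fall through */
        case 0: break;
        }
    }
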
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
> new file mode 100644
> index 0000000..c0ef1f8
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
> @@ -0,0 +1,3 @@
> +#define MEMCPY_NEON
> +#define memcpy __memcpy_neon
> +#include "memcpy_impl.S"
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
> new file mode 100644
> index 0000000..d21b702
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
> @@ -0,0 +1,3 @@
> +#define MEMCPY_VFP
> +#define memcpy __memcpy_vfp
> +#include "memcpy_impl.S"
> --
> 1.8.1.4
>
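
The memcpy_neon.S and memcpy_vfp.S wrappers show the multiarch build
pattern in miniature: one template assembled three times, each time with
memcpy #defined to a different symbol and with macros selecting the
tuned code paths.  The same pattern in C, with hypothetical file names:

    /* memcpy_template.c -- shared template, compiled once per variant.  */
    #include <stddef.h>

    #ifndef MEMCPY_NAME
    # define MEMCPY_NAME memcpy_generic
    #endif

    void *
    MEMCPY_NAME (void *dstin, const void *src, size_t count)
    {
      /* A real template would select NEON/VFP/plain-ARM paths with
         #ifdefs, as memcpy_impl.S does with MEMCPY_NEON and MEMCPY_VFP;
         a byte loop stands in for them here.  */
      char *d = dstin;
      const char *s = src;
      while (count--)
        *d++ = *s++;
      return dstin;
    }

    /* memcpy_variant_neon.c -- one small wrapper per variant.  */
    #define MEMCPY_NAME memcpy_neon
    #include "memcpy_template.c"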



-- 
Will Newton
Toolchain Working Group, Linaro
Benchmark results: existing memcpy ("before") vs. this patch ("after")
on Cortex-A15, 1000000000 calls per test:
before:8:1000000000:1:19.584701: took 19.584701 s for 1000000000 calls to memcpy of 8 bytes.  ~719.069 MB/s corrected.
after:8:1000000000:1:10.692606: took 10.692606 s for 1000000000 calls to memcpy of 8 bytes.  ~4440.842 MB/s corrected.
before:8:1000000000:2:20.675438: took 20.675438 s for 1000000000 calls to memcpy of 8 bytes.  ~652.038 MB/s corrected.
after:8:1000000000:2:14.395184: took 14.395184 s for 1000000000 calls to memcpy of 8 bytes.  ~1407.486 MB/s corrected.
before:8:1000000000:4:17.108307: took 17.108307 s for 1000000000 calls to memcpy of 8 bytes.  ~937.997 MB/s corrected.
after:8:1000000000:4:11.282288: took 11.282288 s for 1000000000 calls to memcpy of 8 bytes.  ~3306.078 MB/s corrected.
before:8:1000000000:8:17.111425: took 17.111425 s for 1000000000 calls to memcpy of 8 bytes.  ~937.638 MB/s corrected.
after:8:1000000000:8:10.741472: took 10.741472 s for 1000000000 calls to memcpy of 8 bytes.  ~4318.024 MB/s corrected.
before:16:1000000000:1:21.596682: took 21.596682 s for 1000000000 calls to memcpy of 16 bytes.  ~1208.896 MB/s corrected.
after:16:1000000000:1:12.013657: took 12.013657 s for 1000000000 calls to memcpy of 16 bytes.  ~5020.896 MB/s corrected.
before:16:1000000000:2:20.647859: took 20.647859 s for 1000000000 calls to memcpy of 16 bytes.  ~1307.158 MB/s corrected.
after:16:1000000000:2:11.517764: took 11.517764 s for 1000000000 calls to memcpy of 16 bytes.  ~5999.924 MB/s corrected.
before:16:1000000000:4:19.232580: took 19.232580 s for 1000000000 calls to memcpy of 16 bytes.  ~1487.504 MB/s corrected.
after:16:1000000000:4:15.924562: took 15.924562 s for 1000000000 calls to memcpy of 16 bytes.  ~2195.521 MB/s corrected.
before:16:1000000000:8:18.877802: took 18.877802 s for 1000000000 calls to memcpy of 16 bytes.  ~1540.793 MB/s corrected.
after:16:1000000000:8:11.624877: took 11.624877 s for 1000000000 calls to memcpy of 16 bytes.  ~5757.431 MB/s corrected.
before:20:1000000000:1:21.827229: took 21.827229 s for 1000000000 calls to memcpy of 20 bytes.  ~1484.014 MB/s corrected.
after:20:1000000000:1:12.370149: took 12.370149 s for 1000000000 calls to memcpy of 20 bytes.  ~5617.202 MB/s corrected.
before:20:1000000000:2:21.827838: took 21.827838 s for 1000000000 calls to memcpy of 20 bytes.  ~1483.944 MB/s corrected.
after:20:1000000000:2:15.920191: took 15.920191 s for 1000000000 calls to memcpy of 20 bytes.  ~2746.128 MB/s corrected.
before:20:1000000000:4:19.467730: took 19.467730 s for 1000000000 calls to memcpy of 20 bytes.  ~1817.712 MB/s corrected.
after:20:1000000000:4:12.137896: took 12.137896 s for 1000000000 calls to memcpy of 20 bytes.  ~6029.624 MB/s corrected.
before:20:1000000000:8:19.467976: took 19.467976 s for 1000000000 calls to memcpy of 20 bytes.  ~1817.669 MB/s corrected.
after:20:1000000000:8:12.405835: took 12.405835 s for 1000000000 calls to memcpy of 20 bytes.  ~5558.782 MB/s corrected.
before:31:1000000000:1:24.187294: took 24.187294 s for 1000000000 calls to memcpy of 31 bytes.  ~1943.371 MB/s corrected.
after:31:1000000000:1:11.803614: took 11.803614 s for 1000000000 calls to memcpy of 31 bytes.  ~10450.252 MB/s corrected.
before:31:1000000000:2:24.191519: took 24.191519 s for 1000000000 calls to memcpy of 31 bytes.  ~1942.831 MB/s corrected.
after:31:1000000000:2:15.894483: took 15.894483 s for 1000000000 calls to memcpy of 31 bytes.  ~4272.313 MB/s corrected.
before:31:1000000000:4:21.237294: took 21.237294 s for 1000000000 calls to memcpy of 31 bytes.  ~2410.882 MB/s corrected.
after:31:1000000000:4:13.589293: took 13.589293 s for 1000000000 calls to memcpy of 31 bytes.  ~6406.472 MB/s corrected.
before:31:1000000000:8:21.239564: took 21.239564 s for 1000000000 calls to memcpy of 31 bytes.  ~2410.435 MB/s corrected.
after:31:1000000000:8:13.671633: took 13.671633 s for 1000000000 calls to memcpy of 31 bytes.  ~6294.166 MB/s corrected.
before:32:1000000000:1:24.187305: took 24.187305 s for 1000000000 calls to memcpy of 32 bytes.  ~2006.059 MB/s corrected.
after:32:1000000000:1:12.865460: took 12.865460 s for 1000000000 calls to memcpy of 32 bytes.  ~7843.402 MB/s corrected.
before:32:1000000000:2:24.187038: took 24.187038 s for 1000000000 calls to memcpy of 32 bytes.  ~2006.094 MB/s corrected.
after:32:1000000000:2:12.685619: took 12.685619 s for 1000000000 calls to memcpy of 32 bytes.  ~8223.504 MB/s corrected.
before:32:1000000000:4:23.007560: took 23.007560 s for 1000000000 calls to memcpy of 32 bytes.  ~2174.707 MB/s corrected.
after:32:1000000000:4:12.712468: took 12.712468 s for 1000000000 calls to memcpy of 32 bytes.  ~8164.434 MB/s corrected.
before:32:1000000000:8:23.007667: took 23.007667 s for 1000000000 calls to memcpy of 32 bytes.  ~2174.691 MB/s corrected.
after:32:1000000000:8:16.517152: took 16.517152 s for 1000000000 calls to memcpy of 32 bytes.  ~4046.055 MB/s corrected.
before:63:1000000000:1:31.266944: took 31.266944 s for 1000000000 calls to memcpy of 63 bytes.  ~2695.162 MB/s corrected.
after:63:1000000000:1:15.045383: took 15.045383 s for 1000000000 calls to memcpy of 63 bytes.  ~9896.826 MB/s corrected.
before:63:1000000000:2:31.266944: took 31.266944 s for 1000000000 calls to memcpy of 63 bytes.  ~2695.162 MB/s corrected.
after:63:1000000000:2:15.043915: took 15.043915 s for 1000000000 calls to memcpy of 63 bytes.  ~9899.219 MB/s corrected.
before:63:1000000000:4:28.316482: took 28.316482 s for 1000000000 calls to memcpy of 63 bytes.  ~3106.289 MB/s corrected.
after:63:1000000000:4:17.104123: took 17.104123 s for 1000000000 calls to memcpy of 63 bytes.  ~7390.530 MB/s corrected.
before:63:1000000000:8:28.316644: took 28.316644 s for 1000000000 calls to memcpy of 63 bytes.  ~3106.263 MB/s corrected.
after:63:1000000000:8:15.043495: took 15.043495 s for 1000000000 calls to memcpy of 63 bytes.  ~9899.904 MB/s corrected.
before:64:1000000000:1:31.266176: took 31.266176 s for 1000000000 calls to memcpy of 64 bytes.  ~2738.037 MB/s corrected.
after:64:1000000000:1:21.238190: took 21.238190 s for 1000000000 calls to memcpy of 64 bytes.  ~4976.940 MB/s corrected.
before:64:1000000000:2:31.266394: took 31.266394 s for 1000000000 calls to memcpy of 64 bytes.  ~2738.010 MB/s corrected.
after:64:1000000000:2:21.237760: took 21.237760 s for 1000000000 calls to memcpy of 64 bytes.  ~4977.115 MB/s corrected.
before:64:1000000000:4:28.906717: took 28.906717 s for 1000000000 calls to memcpy of 64 bytes.  ~3062.151 MB/s corrected.
after:64:1000000000:4:21.238589: took 21.238589 s for 1000000000 calls to memcpy of 64 bytes.  ~4976.778 MB/s corrected.
before:64:1000000000:8:28.906552: took 28.906552 s for 1000000000 calls to memcpy of 64 bytes.  ~3062.177 MB/s corrected.
after:64:1000000000:8:15.930619: took 15.930619 s for 1000000000 calls to memcpy of 64 bytes.  ~8774.437 MB/s corrected.
before:100:1000000000:1:37.755505: took 37.755505 s for 1000000000 calls to memcpy of 100 bytes.  ~3313.566 MB/s corrected.
after:100:1000000000:1:25.662495: took 25.662495 s for 1000000000 calls to memcpy of 100 bytes.  ~5714.767 MB/s corrected.
before:100:1000000000:2:37.755721: took 37.755721 s for 1000000000 calls to memcpy of 100 bytes.  ~3313.541 MB/s corrected.
after:100:1000000000:2:25.662721: took 25.662721 s for 1000000000 calls to memcpy of 100 bytes.  ~5714.690 MB/s corrected.
before:100:1000000000:4:35.396223: took 35.396223 s for 1000000000 calls to memcpy of 100 bytes.  ~3609.446 MB/s corrected.
after:100:1000000000:4:26.547438: took 26.547438 s for 1000000000 calls to memcpy of 100 bytes.  ~5426.980 MB/s corrected.
before:100:1000000000:8:35.396014: took 35.396014 s for 1000000000 calls to memcpy of 100 bytes.  ~3609.475 MB/s corrected.
after:100:1000000000:8:21.237772: took 21.237772 s for 1000000000 calls to memcpy of 100 bytes.  ~7776.734 MB/s corrected.
before:200:1000000000:1:56.633398: took 56.633398 s for 1000000000 calls to memcpy of 200 bytes.  ~4002.091 MB/s corrected.
after:200:1000000000:1:33.626722: took 33.626722 s for 1000000000 calls to memcpy of 200 bytes.  ~7737.057 MB/s corrected.
before:200:1000000000:2:56.633684: took 56.633684 s for 1000000000 calls to memcpy of 200 bytes.  ~4002.067 MB/s corrected.
after:200:1000000000:2:33.626692: took 33.626692 s for 1000000000 calls to memcpy of 200 bytes.  ~7737.066 MB/s corrected.
before:200:1000000000:4:54.273108: took 54.273108 s for 1000000000 calls to memcpy of 200 bytes.  ~4210.621 MB/s corrected.
after:200:1000000000:4:33.626391: took 33.626391 s for 1000000000 calls to memcpy of 200 bytes.  ~7737.160 MB/s corrected.
before:200:1000000000:8:54.273969: took 54.273969 s for 1000000000 calls to memcpy of 200 bytes.  ~4210.541 MB/s corrected.
after:200:1000000000:8:29.497408: took 29.497408 s for 1000000000 calls to memcpy of 200 bytes.  ~9293.799 MB/s corrected.

