[PATCH] ARM: Add Cortex-A15 optimized NEON and VFP memcpy routines, with IFUNC.
Will Newton
will.newton@linaro.org
Mon Apr 15 10:01:00 GMT 2013
Attached is a set of benchmarks comparing the new code with the existing
memcpy implementation on a Cortex-A15 platform.
On 15 April 2013 10:56, Will Newton <will.newton@linaro.org> wrote:
>
> Add a high performance memcpy routine optimized for Cortex-A15 with
> variants for use in the presence of NEON and VFP hardware, selected
> at runtime using indirect function support.
>
> This was tested on armv7l-unknown-linux-gnueabihf. One new testsuite
> failure was introduced (elf/ifuncmain5picstatic) which was caused by
> a bug in ld. A fix for that ld issue has been submitted here:
>
> http://sourceware.org/ml/binutils/2013-04/msg00143.html
>
> ports/ChangeLog.arm:
>
> 2013-04-15 Will Newton <will.newton@linaro.org>
>
> * sysdeps/arm/armv7/multiarch/Makefile: New file.
> * sysdeps/arm/armv7/multiarch/ifunc-impl-list.c: Likewise.
> * sysdeps/arm/armv7/multiarch/memcpy.S: Likewise.
> * sysdeps/arm/armv7/multiarch/memcpy_impl.S: Likewise.
> * sysdeps/arm/armv7/multiarch/memcpy_neon.S: Likewise.
> * sysdeps/arm/armv7/multiarch/memcpy_vfp.S: Likewise.
>
> Signed-off-by: Will Newton <will.newton@linaro.org>
> ---
> ports/sysdeps/arm/armv7/multiarch/Makefile | 3 +
> .../sysdeps/arm/armv7/multiarch/ifunc-impl-list.c | 46 ++
> ports/sysdeps/arm/armv7/multiarch/memcpy.S | 96 ++++
> ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S | 600 +++++++++++++++++++++
> ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S | 3 +
> ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S | 3 +
> 6 files changed, 751 insertions(+)
> create mode 100644 ports/sysdeps/arm/armv7/multiarch/Makefile
> create mode 100644 ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
> create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy.S
> create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
> create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
> create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
>
> diff --git a/ports/sysdeps/arm/armv7/multiarch/Makefile b/ports/sysdeps/arm/armv7/multiarch/Makefile
> new file mode 100644
> index 0000000..e834cc9
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/Makefile
> @@ -0,0 +1,3 @@
ifeq ($(subdir),string)
# Build the NEON and VFP memcpy variants alongside the default one;
# the IFUNC resolver in memcpy.S picks between them at runtime.
sysdep_routines += memcpy_neon memcpy_vfp
endif
> diff --git a/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
> new file mode 100644
> index 0000000..176288b
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
> @@ -0,0 +1,46 @@
> +/* Enumerate available IFUNC implementations of a function. arm version.
> + Copyright (C) 2013 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <assert.h>
> +#include <string.h>
> +#include <wchar.h>
> +#include <ldsodefs.h>
> +#include <sysdep.h>
> +#include <ifunc-impl-list.h>
> +
> +/* Fill ARRAY of MAX elements with IFUNC implementations for function
> + NAME and return the number of valid entries. */
> +
> +size_t
> +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> + size_t max)
> +{
> + size_t i = 0;
> + int hwcap;
> +
> + hwcap = GLRO(dl_hwcap);
> +
> + IFUNC_IMPL (i, name, memcpy,
> + IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_NEON,
> + __memcpy_neon)
> + IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_VFPv3,
> + __memcpy_vfp)
> + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
> +
> + return i;
> +}
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy.S b/ports/sysdeps/arm/armv7/multiarch/memcpy.S
> new file mode 100644
> index 0000000..a9e2faf
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy.S
> @@ -0,0 +1,96 @@
> +/* Multiple versions of memcpy
> + All versions must be listed in ifunc-impl-list.c.
> + Copyright (C) 2013 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include <rtld-global-offsets.h>
> +#include <arm-features.h>
> +
#if !defined NOT_IN_libc
	.text
ENTRY(memcpy)
	/* IFUNC resolver for memcpy.  The dynamic linker calls this with
	   a1 (r0) = AT_HWCAP and installs the address returned in r0 as
	   the real memcpy.  */
	.type memcpy, %gnu_indirect_function
# ifdef PIC
	/* Load the GOT base into a3 (PC-relative; constant stored at 1f).  */
	ldr	a3, 1f
0:	add	a3, pc, a3
# endif

	/* Prefer the NEON variant when the NEON hwcap bit is set.  */
	tst	a1, #HWCAP_ARM_NEON
	beq	.Lno_neon
# ifdef PIC
	ldr	a4, .Lmemcpy_neon
	ldr	r0, [a3, a4]		/* r0 = GOT[__memcpy_neon].  */
# else
	ldr	r0, .Lmemcpy_neon
# endif
	b	.Lreturn
.Lno_neon:

	/* Otherwise fall back to the VFP variant if VFP is present.
	   NOTE(review): keep this hwcap test in sync with the
	   __memcpy_vfp entry in ifunc-impl-list.c.  */
	tst	a1, #HWCAP_ARM_VFP
	beq	.Lno_vfp
# ifdef PIC
	ldr	a4, .Lmemcpy_vfp
	ldr	r0, [a3, a4]		/* r0 = GOT[__memcpy_vfp].  */
# else
	ldr	r0, .Lmemcpy_vfp
# endif
	b	.Lreturn
.Lno_vfp:
	/* Last resort: the plain ARM implementation.  */
# ifdef PIC
	ldr	a4, .Lmemcpy_arm
	ldr	r0, [a3, a4]		/* r0 = GOT[__memcpy_arm].  */
# else
	ldr	r0, .Lmemcpy_arm
# endif

.Lreturn:
	DO_RET(lr)

# ifdef PIC
	/* GOT base displacement, then GOT offsets of the three variants.  */
1:	.long	_GLOBAL_OFFSET_TABLE_ - 0b - PC_OFS
.Lmemcpy_neon:
	.long	C_SYMBOL_NAME(__memcpy_neon)(GOT)
.Lmemcpy_vfp:
	.long	C_SYMBOL_NAME(__memcpy_vfp)(GOT)
.Lmemcpy_arm:
	.long	C_SYMBOL_NAME(__memcpy_arm)(GOT)
# else
	/* Non-PIC: absolute addresses of the three variants.  */
.Lmemcpy_neon:
	.long	C_SYMBOL_NAME(__memcpy_neon)
.Lmemcpy_vfp:
	.long	C_SYMBOL_NAME(__memcpy_vfp)
.Lmemcpy_arm:
	.long	C_SYMBOL_NAME(__memcpy_arm)
# endif

END(memcpy)

libc_hidden_builtin_def (memcpy)

/* Neutralise the alias/hidden-def macros so that including
   memcpy_impl.S below defines only the generic ARM implementation,
   renamed to __memcpy_arm.  */
#undef libc_hidden_builtin_def
#define libc_hidden_builtin_def(name)
#undef weak_alias
#define weak_alias(x, y)
#undef libc_hidden_def
#define libc_hidden_def(name)

#define memcpy __memcpy_arm

#endif

#include "memcpy_impl.S"
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
> new file mode 100644
> index 0000000..2c466d25
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
> @@ -0,0 +1,600 @@
> +/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
> + Copyright (C) 2013 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>.
> +
> + This memcpy routine is optimised for Cortex-A15 cores and takes advantage
> + of VFP or NEON when built with the appropriate flags.
> +
> + Assumptions:
> +
> + ARMv6 (ARMv7-a if using Neon)
> + ARM state
> + Unaligned accesses
> + LDRD/STRD support unaligned word accesses
> +
> + */
> +
> +#include <sysdep.h>
> +
	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef MEMCPY_NEON

	.fpu	neon
	.arch	armv7-a
/* NEON builds never spill r4-r9 (the A_l..D_h block is not defined),
   so the frame only needs the single slot used for tmp2.  */
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif defined (MEMCPY_VFP)

	.arch	armv6
	.fpu	vfpv2
/* The non-NEON unaligned path spills B/C/D (r4-r9) at sp+8..sp+24,
   so leave room for them as well as tmp2.  */
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  tmp2 (r10) is call-saved and is only used after being
   spilled to the frame in .Lcpy_not_short.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2	/* Call-clobbered.  */
#define	A_h	r3	/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5
> +
#ifdef USE_VFP
	/* Store one 64-byte line at dst+\base whose first doubleword was
	   pre-loaded into \vreg, reloading each register (d0-d2 and \vreg)
	   from src right after its old contents are stored.  The second
	   \vreg load reads prefetch_lines*64 bytes ahead of the store
	   position, acting as a prefetch whose data is actually kept.  */
	.macro cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	/* Read-ahead load: next use of \vreg is prefetch_lines lines on.  */
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	/* As cpy_line_vfp, but for the drain phase: no read-ahead load of
	   \vreg, since there are no further lines to prefetch.  */
	.macro cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif
> +
	.p2align 6
ENTRY(memcpy)

	/* void *memcpy (void *dstin, const void *src, size_t count)

	   Overall structure: copies below 64 bytes drop straight into an
	   unrolled exit sequence (.Ltail63unaligned).  Larger copies spill
	   tmp2 (r10, call-saved) and split on whether src and dst share
	   32-bit alignment: mutually aligned copies take a medium
	   (64..511 byte) or long (>= 512 byte) aligned loop, everything
	   else goes to .Lcpy_notaligned.  */

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
	/* Computed jump into the unrolled sequence below: each skipped
	   8-byte group skips one vld1/vst1 pair.  */
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

	/* Final 0-3 bytes: lsls #31 puts bit 1 of count into C (halfword
	   pending) and bit 0 into N (byte pending).  */
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!	/* tmp2 (r10) is call-saved.  */
	and	tmp2, src, #3
	and	tmp1, dst, #3
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 32-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4		/* N set: copy a word.  */
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2		/* C set: copy a halfword.  */
	ldrbne	tmp2, [src], #1		/* Z clear: copy a byte.  */
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
	/* 64 bytes per iteration; loads and stores alternate through
	   d0/d1 so both are always in flight.  */
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	/* Computed jump into the unrolled doubleword sequence below.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	/* GP-register variant: pre-bias pointers so the loop can use
	   writeback addressing on the final pair.  */
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE	/* Restore r10 and return.  */
	bx	lr
1:
	/* Undo the pre-bias before the tail copy.  */
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 32-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	/* Final 0-7 bytes: a word if bit 2 set, then halfword/byte via
	   the C and N flags as in .Ltail63unaligned.  */
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE	/* Restore r10.  */
	bx	lr

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	/* d3-d7 hold the first doubleword of the next five 64-byte lines;
	   d0-d2 hold the rest of line 0's first half.  */
	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	/* Five lines per iteration, one per pre-loaded d-register.  */
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	/* Drain phase: flush the five lines already read ahead, without
	   issuing further prefetch loads.  */
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	/* B/C/D (r4-r9) are call-saved: spill them into the frame
	   reserved in .Lcpy_not_short.  */
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align 6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b		/* C from the subs above: >= 64 left.  */
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

.Lcpy_notaligned:
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE	/* Too little left: restore r10...  */
	bmi	.Ltail63unaligned	/* ...and finish with the tail code.  */
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	/* NEON loads tolerate the unaligned source; stores go to the
	   64-byte-aligned destination (ALIGN hint).  */
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	/* Word loads from the unaligned-by-words src, doubleword stores
	   to the 64-bit-aligned dst.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]	/* Spill call-saved B/C/D.  */
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align 6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE	/* Restore r10.  */
	bne	.Ltail63unaligned	/* Z from ands: tail bytes remain.  */
	bx	lr

END(memcpy)
libc_hidden_builtin_def (memcpy)
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
> new file mode 100644
> index 0000000..c0ef1f8
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
> @@ -0,0 +1,3 @@
/* NEON variant: build memcpy_impl.S with NEON enabled, under the
   name __memcpy_neon for the IFUNC dispatcher.  */
#define MEMCPY_NEON
#define memcpy __memcpy_neon
#include "memcpy_impl.S"
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
> new file mode 100644
> index 0000000..d21b702
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
> @@ -0,0 +1,3 @@
/* VFP variant: build memcpy_impl.S with VFP (but not NEON) enabled,
   under the name __memcpy_vfp for the IFUNC dispatcher.  */
#define MEMCPY_VFP
#define memcpy __memcpy_vfp
#include "memcpy_impl.S"
> --
> 1.8.1.4
>
--
Will Newton
Toolchain Working Group, Linaro
-------------- next part --------------
before:8:1000000000:1:19.584701: took 19.584701 s for 1000000000 calls to memcpy of 8 bytes. ~719.069 MB/s corrected.
after:8:1000000000:1:10.692606: took 10.692606 s for 1000000000 calls to memcpy of 8 bytes. ~4440.842 MB/s corrected.
before:8:1000000000:2:20.675438: took 20.675438 s for 1000000000 calls to memcpy of 8 bytes. ~652.038 MB/s corrected.
after:8:1000000000:2:14.395184: took 14.395184 s for 1000000000 calls to memcpy of 8 bytes. ~1407.486 MB/s corrected.
before:8:1000000000:4:17.108307: took 17.108307 s for 1000000000 calls to memcpy of 8 bytes. ~937.997 MB/s corrected.
after:8:1000000000:4:11.282288: took 11.282288 s for 1000000000 calls to memcpy of 8 bytes. ~3306.078 MB/s corrected.
before:8:1000000000:8:17.111425: took 17.111425 s for 1000000000 calls to memcpy of 8 bytes. ~937.638 MB/s corrected.
after:8:1000000000:8:10.741472: took 10.741472 s for 1000000000 calls to memcpy of 8 bytes. ~4318.024 MB/s corrected.
before:16:1000000000:1:21.596682: took 21.596682 s for 1000000000 calls to memcpy of 16 bytes. ~1208.896 MB/s corrected.
after:16:1000000000:1:12.013657: took 12.013657 s for 1000000000 calls to memcpy of 16 bytes. ~5020.896 MB/s corrected.
before:16:1000000000:2:20.647859: took 20.647859 s for 1000000000 calls to memcpy of 16 bytes. ~1307.158 MB/s corrected.
after:16:1000000000:2:11.517764: took 11.517764 s for 1000000000 calls to memcpy of 16 bytes. ~5999.924 MB/s corrected.
before:16:1000000000:4:19.232580: took 19.232580 s for 1000000000 calls to memcpy of 16 bytes. ~1487.504 MB/s corrected.
after:16:1000000000:4:15.924562: took 15.924562 s for 1000000000 calls to memcpy of 16 bytes. ~2195.521 MB/s corrected.
before:16:1000000000:8:18.877802: took 18.877802 s for 1000000000 calls to memcpy of 16 bytes. ~1540.793 MB/s corrected.
after:16:1000000000:8:11.624877: took 11.624877 s for 1000000000 calls to memcpy of 16 bytes. ~5757.431 MB/s corrected.
before:20:1000000000:1:21.827229: took 21.827229 s for 1000000000 calls to memcpy of 20 bytes. ~1484.014 MB/s corrected.
after:20:1000000000:1:12.370149: took 12.370149 s for 1000000000 calls to memcpy of 20 bytes. ~5617.202 MB/s corrected.
before:20:1000000000:2:21.827838: took 21.827838 s for 1000000000 calls to memcpy of 20 bytes. ~1483.944 MB/s corrected.
after:20:1000000000:2:15.920191: took 15.920191 s for 1000000000 calls to memcpy of 20 bytes. ~2746.128 MB/s corrected.
before:20:1000000000:4:19.467730: took 19.467730 s for 1000000000 calls to memcpy of 20 bytes. ~1817.712 MB/s corrected.
after:20:1000000000:4:12.137896: took 12.137896 s for 1000000000 calls to memcpy of 20 bytes. ~6029.624 MB/s corrected.
before:20:1000000000:8:19.467976: took 19.467976 s for 1000000000 calls to memcpy of 20 bytes. ~1817.669 MB/s corrected.
after:20:1000000000:8:12.405835: took 12.405835 s for 1000000000 calls to memcpy of 20 bytes. ~5558.782 MB/s corrected.
before:31:1000000000:1:24.187294: took 24.187294 s for 1000000000 calls to memcpy of 31 bytes. ~1943.371 MB/s corrected.
after:31:1000000000:1:11.803614: took 11.803614 s for 1000000000 calls to memcpy of 31 bytes. ~10450.252 MB/s corrected.
before:31:1000000000:2:24.191519: took 24.191519 s for 1000000000 calls to memcpy of 31 bytes. ~1942.831 MB/s corrected.
after:31:1000000000:2:15.894483: took 15.894483 s for 1000000000 calls to memcpy of 31 bytes. ~4272.313 MB/s corrected.
before:31:1000000000:4:21.237294: took 21.237294 s for 1000000000 calls to memcpy of 31 bytes. ~2410.882 MB/s corrected.
after:31:1000000000:4:13.589293: took 13.589293 s for 1000000000 calls to memcpy of 31 bytes. ~6406.472 MB/s corrected.
before:31:1000000000:8:21.239564: took 21.239564 s for 1000000000 calls to memcpy of 31 bytes. ~2410.435 MB/s corrected.
after:31:1000000000:8:13.671633: took 13.671633 s for 1000000000 calls to memcpy of 31 bytes. ~6294.166 MB/s corrected.
before:32:1000000000:1:24.187305: took 24.187305 s for 1000000000 calls to memcpy of 32 bytes. ~2006.059 MB/s corrected.
after:32:1000000000:1:12.865460: took 12.865460 s for 1000000000 calls to memcpy of 32 bytes. ~7843.402 MB/s corrected.
before:32:1000000000:2:24.187038: took 24.187038 s for 1000000000 calls to memcpy of 32 bytes. ~2006.094 MB/s corrected.
after:32:1000000000:2:12.685619: took 12.685619 s for 1000000000 calls to memcpy of 32 bytes. ~8223.504 MB/s corrected.
before:32:1000000000:4:23.007560: took 23.007560 s for 1000000000 calls to memcpy of 32 bytes. ~2174.707 MB/s corrected.
after:32:1000000000:4:12.712468: took 12.712468 s for 1000000000 calls to memcpy of 32 bytes. ~8164.434 MB/s corrected.
before:32:1000000000:8:23.007667: took 23.007667 s for 1000000000 calls to memcpy of 32 bytes. ~2174.691 MB/s corrected.
after:32:1000000000:8:16.517152: took 16.517152 s for 1000000000 calls to memcpy of 32 bytes. ~4046.055 MB/s corrected.
before:63:1000000000:1:31.266944: took 31.266944 s for 1000000000 calls to memcpy of 63 bytes. ~2695.162 MB/s corrected.
after:63:1000000000:1:15.045383: took 15.045383 s for 1000000000 calls to memcpy of 63 bytes. ~9896.826 MB/s corrected.
before:63:1000000000:2:31.266944: took 31.266944 s for 1000000000 calls to memcpy of 63 bytes. ~2695.162 MB/s corrected.
after:63:1000000000:2:15.043915: took 15.043915 s for 1000000000 calls to memcpy of 63 bytes. ~9899.219 MB/s corrected.
before:63:1000000000:4:28.316482: took 28.316482 s for 1000000000 calls to memcpy of 63 bytes. ~3106.289 MB/s corrected.
after:63:1000000000:4:17.104123: took 17.104123 s for 1000000000 calls to memcpy of 63 bytes. ~7390.530 MB/s corrected.
before:63:1000000000:8:28.316644: took 28.316644 s for 1000000000 calls to memcpy of 63 bytes. ~3106.263 MB/s corrected.
after:63:1000000000:8:15.043495: took 15.043495 s for 1000000000 calls to memcpy of 63 bytes. ~9899.904 MB/s corrected.
before:64:1000000000:1:31.266176: took 31.266176 s for 1000000000 calls to memcpy of 64 bytes. ~2738.037 MB/s corrected.
after:64:1000000000:1:21.238190: took 21.238190 s for 1000000000 calls to memcpy of 64 bytes. ~4976.940 MB/s corrected.
before:64:1000000000:2:31.266394: took 31.266394 s for 1000000000 calls to memcpy of 64 bytes. ~2738.010 MB/s corrected.
after:64:1000000000:2:21.237760: took 21.237760 s for 1000000000 calls to memcpy of 64 bytes. ~4977.115 MB/s corrected.
before:64:1000000000:4:28.906717: took 28.906717 s for 1000000000 calls to memcpy of 64 bytes. ~3062.151 MB/s corrected.
after:64:1000000000:4:21.238589: took 21.238589 s for 1000000000 calls to memcpy of 64 bytes. ~4976.778 MB/s corrected.
before:64:1000000000:8:28.906552: took 28.906552 s for 1000000000 calls to memcpy of 64 bytes. ~3062.177 MB/s corrected.
after:64:1000000000:8:15.930619: took 15.930619 s for 1000000000 calls to memcpy of 64 bytes. ~8774.437 MB/s corrected.
before:100:1000000000:1:37.755505: took 37.755505 s for 1000000000 calls to memcpy of 100 bytes. ~3313.566 MB/s corrected.
after:100:1000000000:1:25.662495: took 25.662495 s for 1000000000 calls to memcpy of 100 bytes. ~5714.767 MB/s corrected.
before:100:1000000000:2:37.755721: took 37.755721 s for 1000000000 calls to memcpy of 100 bytes. ~3313.541 MB/s corrected.
after:100:1000000000:2:25.662721: took 25.662721 s for 1000000000 calls to memcpy of 100 bytes. ~5714.690 MB/s corrected.
before:100:1000000000:4:35.396223: took 35.396223 s for 1000000000 calls to memcpy of 100 bytes. ~3609.446 MB/s corrected.
after:100:1000000000:4:26.547438: took 26.547438 s for 1000000000 calls to memcpy of 100 bytes. ~5426.980 MB/s corrected.
before:100:1000000000:8:35.396014: took 35.396014 s for 1000000000 calls to memcpy of 100 bytes. ~3609.475 MB/s corrected.
after:100:1000000000:8:21.237772: took 21.237772 s for 1000000000 calls to memcpy of 100 bytes. ~7776.734 MB/s corrected.
before:200:1000000000:1:56.633398: took 56.633398 s for 1000000000 calls to memcpy of 200 bytes. ~4002.091 MB/s corrected.
after:200:1000000000:1:33.626722: took 33.626722 s for 1000000000 calls to memcpy of 200 bytes. ~7737.057 MB/s corrected.
before:200:1000000000:2:56.633684: took 56.633684 s for 1000000000 calls to memcpy of 200 bytes. ~4002.067 MB/s corrected.
after:200:1000000000:2:33.626692: took 33.626692 s for 1000000000 calls to memcpy of 200 bytes. ~7737.066 MB/s corrected.
before:200:1000000000:4:54.273108: took 54.273108 s for 1000000000 calls to memcpy of 200 bytes. ~4210.621 MB/s corrected.
after:200:1000000000:4:33.626391: took 33.626391 s for 1000000000 calls to memcpy of 200 bytes. ~7737.160 MB/s corrected.
before:200:1000000000:8:54.273969: took 54.273969 s for 1000000000 calls to memcpy of 200 bytes. ~4210.541 MB/s corrected.
after:200:1000000000:8:29.497408: took 29.497408 s for 1000000000 calls to memcpy of 200 bytes. ~9293.799 MB/s corrected.
More information about the Libc-ports
mailing list