[PATCH v2, ARM] Integrate optimized Cortex A15 memcpy.
Jeff Johnston
jjohnstn@redhat.com
Fri Apr 12 21:25:00 GMT 2013
Will,
Please resubmit your patch as an attachment, as my e-mail reader is
mangling it so that it won't apply after saving. You have changed the
license of memcpy.S, so this must be a rewrite. You might as well just
attach the new version of the file rather than a patch.
Thanks,
-- Jeff J.
On 03/28/2013 10:43 AM, Will Newton wrote:
> 2013-03-28 Will Newton <will.newton@linaro.org>
>
> * libc/machine/arm/memcpy-stub.c: Use generic memcpy if unaligned
> access is not enabled or if building for big-endian.
> * libc/machine/arm/memcpy.S: Faster memcpy implementation for
> Cortex A15 cores using NEON and VFP if available.
> ---
> newlib/libc/machine/arm/memcpy-stub.c | 3 +-
> newlib/libc/machine/arm/memcpy.S | 1006 ++++++++++++++++++++-------------
> 2 files changed, 607 insertions(+), 402 deletions(-)
>
> Changes in v2:
> - Compile fix and cleanup of whitespace. Please disregard v1, which was sent in error!
>
> diff --git a/newlib/libc/machine/arm/memcpy-stub.c b/newlib/libc/machine/arm/memcpy-stub.c
> index 536b869..8a09b5c 100644
> --- a/newlib/libc/machine/arm/memcpy-stub.c
> +++ b/newlib/libc/machine/arm/memcpy-stub.c
> @@ -29,7 +29,8 @@
> /* The sole purpose of this file is to include the plain memcpy provided in newlib.
> An optimized version of memcpy is provided in the assembly file memcpy.S in this directory. */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
> - (!(defined (__ARM_ARCH_7A__))))
> + (!(defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
> + defined (__ARMEL__))))
>
> #include "../../string/memcpy.c"
>
> diff --git a/newlib/libc/machine/arm/memcpy.S b/newlib/libc/machine/arm/memcpy.S
> index e408ed0..2eeeca8 100644
> --- a/newlib/libc/machine/arm/memcpy.S
> +++ b/newlib/libc/machine/arm/memcpy.S
> @@ -1,423 +1,627 @@
> -/*
> - * Copyright (c) 2011 ARM Ltd
> - * All rights reserved.
> - *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions
> - * are met:
> - * 1. Redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer.
> - * 2. Redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution.
> - * 3. The name of the company may not be used to endorse or promote
> - * products derived from this software without specific prior written
> - * permission.
> - *
> - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
> - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
> - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
> - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
> - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +/* Copyright (c) 2013, Linaro Limited
> + All rights reserved.
> +
> + Redistribution and use in source and binary forms, with or without
> + modification, are permitted provided that the following conditions
> + are met:
> +
> + * Redistributions of source code must retain the above copyright
> + notice, this list of conditions and the following disclaimer.
> +
> + * Redistributions in binary form must reproduce the above copyright
> + notice, this list of conditions and the following disclaimer in the
> + documentation and/or other materials provided with the distribution.
> +
> + * Neither the name of Linaro Limited nor the names of its
> + contributors may be used to endorse or promote products derived
> + from this software without specific prior written permission.
> +
> + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +
> + This memcpy routine is optimised for Cortex-A cores and takes advantage
> + of VFP or NEON when built with the appropriate flags.
> +
> + Assumptions:
> +
> + ARMv6 (ARMv7-a if using Neon)
> + ARM state
> + Unaligned accesses
> + LDRD/STRD support unaligned word accesses
> + Not tested on big-endian
> +
> */
>
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
> - (!(defined (__ARM_ARCH_7A__))))
> + (!(defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
> + defined (__ARMEL__))))
>
> /* Do nothing here. See memcpy-stub.c in the same directory. */
>
> #else
> - /* Prototype: void *memcpy (void *dst, const void *src, size_t count). */
>
> - /* Use the version of memcpy implemented using LDRD and STRD.
> - This version is tuned for Cortex-A15.
> - This might not be the best for other ARMv7-A CPUs,
> - but there is no predefine to distinguish between
> - different CPUs in the same architecture,
> - and this version is better than the plain memcpy provided in newlib.
> + .syntax unified
> + /* This implementation requires ARM state. */
> + .arm
> +
> +#ifdef __ARM_NEON__
> +
> + .fpu neon
> + .arch armv7-a
> +# define FRAME_SIZE 4
> +# define USE_VFP
> +# define USE_NEON
> +
> +#elif !defined (__SOFTFP__)
> +
> + .arch armv6
> + .fpu vfpv2
> +# define FRAME_SIZE 32
> +# define USE_VFP
> +
> +#else
> + .arch armv6
> +# define FRAME_SIZE 32
> +
> +#endif
> +
> +/* Old versions of GAS incorrectly implement the NEON align semantics. */
> +#ifdef BROKEN_ASM_NEON_ALIGN
> +#define ALIGN(addr, align) addr,:align
> +#else
> +#define ALIGN(addr, align) addr:align
> +#endif
>
> - Therefore, we use this version for all ARMv7-A CPUS. */
> +#define PC_OFFSET 8 /* PC pipeline compensation. */
> +#define INSN_SIZE 4
> +
> +/* Call parameters. */
> +#define dstin r0
> +#define src r1
> +#define count r2
> +
> +/* Locals. */
> +#define tmp1 r3
> +#define dst ip
> +#define tmp2 r10
> +
> +#ifndef USE_NEON
> +/* For bulk copies using GP registers. */
> +#define A_l r2 /* Call-clobbered. */
> +#define A_h r3 /* Call-clobbered. */
> +#define B_l r4
> +#define B_h r5
> +#define C_l r6
> +#define C_h r7
> +#define D_l r8
> +#define D_h r9
> +#endif
>
> - /* To make the same code compile for both ARM and Thumb instruction
> - sets, switch to unified syntax at the beginning of this function.
> - However, by using the same code, we may be missing optimization
> - opportunities. For instance, in LDRD/STRD instructions, the first
> - destination register must be even and the second consecutive in
> - ARM state, but not in Thumb state. */
> +/* Number of lines ahead to pre-fetch data. If you change this the code
> + below will need adjustment to compensate. */
> +
> +#define prefetch_lines 5
> +
> +#ifdef USE_VFP
> + .macro cpy_line_vfp vreg, base
> + vstr \vreg, [dst, #\base]
> + vldr \vreg, [src, #\base]
> + vstr d0, [dst, #\base + 8]
> + vldr d0, [src, #\base + 8]
> + vstr d1, [dst, #\base + 16]
> + vldr d1, [src, #\base + 16]
> + vstr d2, [dst, #\base + 24]
> + vldr d2, [src, #\base + 24]
> + vstr \vreg, [dst, #\base + 32]
> + vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
> + vstr d0, [dst, #\base + 40]
> + vldr d0, [src, #\base + 40]
> + vstr d1, [dst, #\base + 48]
> + vldr d1, [src, #\base + 48]
> + vstr d2, [dst, #\base + 56]
> + vldr d2, [src, #\base + 56]
> + .endm
> +
> + .macro cpy_tail_vfp vreg, base
> + vstr \vreg, [dst, #\base]
> + vldr \vreg, [src, #\base]
> + vstr d0, [dst, #\base + 8]
> + vldr d0, [src, #\base + 8]
> + vstr d1, [dst, #\base + 16]
> + vldr d1, [src, #\base + 16]
> + vstr d2, [dst, #\base + 24]
> + vldr d2, [src, #\base + 24]
> + vstr \vreg, [dst, #\base + 32]
> + vstr d0, [dst, #\base + 40]
> + vldr d0, [src, #\base + 40]
> + vstr d1, [dst, #\base + 48]
> + vldr d1, [src, #\base + 48]
> + vstr d2, [dst, #\base + 56]
> + vldr d2, [src, #\base + 56]
> + .endm
> +#endif
>
> - .syntax unified
> + .macro def_fn f p2align=0
> + .text
> + .p2align \p2align
> + .global \f
> + .type \f, %function
> +\f:
> + .endm
> +
> +def_fn memcpy p2align=6
> +
> + mov dst, dstin /* Preserve dstin, we need to return it. */
> + cmp count, #64
> + bge .Lcpy_not_short
> + /* Deal with small copies quickly by dropping straight into the
> + exit block. */
> +
> +.Ltail63unaligned:
> +#ifdef USE_NEON
> + and tmp1, count, #0x38
> + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> + add pc, pc, tmp1
> + vld1.8 {d0}, [src]! /* 14 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 12 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 10 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 8 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 6 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 4 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 2 words to go. */
> + vst1.8 {d0}, [dst]!
> +
> + tst count, #4
> + ldrne tmp1, [src], #4
> + strne tmp1, [dst], #4
> +#else
> + /* Copy up to 15 full words of data. May not be aligned. */
> + /* Cannot use VFP for unaligned data. */
> + and tmp1, count, #0x3c
> + add dst, dst, tmp1
> + add src, src, tmp1
> + rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
> + /* Jump directly into the sequence below at the correct offset. */
> + add pc, pc, tmp1, lsl #1
> +
> + ldr tmp1, [src, #-60] /* 15 words to go. */
> + str tmp1, [dst, #-60]
> +
> + ldr tmp1, [src, #-56] /* 14 words to go. */
> + str tmp1, [dst, #-56]
> + ldr tmp1, [src, #-52]
> + str tmp1, [dst, #-52]
> +
> + ldr tmp1, [src, #-48] /* 12 words to go. */
> + str tmp1, [dst, #-48]
> + ldr tmp1, [src, #-44]
> + str tmp1, [dst, #-44]
> +
> + ldr tmp1, [src, #-40] /* 10 words to go. */
> + str tmp1, [dst, #-40]
> + ldr tmp1, [src, #-36]
> + str tmp1, [dst, #-36]
> +
> + ldr tmp1, [src, #-32] /* 8 words to go. */
> + str tmp1, [dst, #-32]
> + ldr tmp1, [src, #-28]
> + str tmp1, [dst, #-28]
> +
> + ldr tmp1, [src, #-24] /* 6 words to go. */
> + str tmp1, [dst, #-24]
> + ldr tmp1, [src, #-20]
> + str tmp1, [dst, #-20]
> +
> + ldr tmp1, [src, #-16] /* 4 words to go. */
> + str tmp1, [dst, #-16]
> + ldr tmp1, [src, #-12]
> + str tmp1, [dst, #-12]
> +
> + ldr tmp1, [src, #-8] /* 2 words to go. */
> + str tmp1, [dst, #-8]
> + ldr tmp1, [src, #-4]
> + str tmp1, [dst, #-4]
> +#endif
>
> -#if defined (__thumb__)
> - .thumb
> - .thumb_func
> + lsls count, count, #31
> + ldrhcs tmp1, [src], #2
> + ldrbne src, [src] /* Src is dead, use as a scratch. */
> + strhcs tmp1, [dst], #2
> + strbne src, [dst]
> + bx lr
> +
> +.Lcpy_not_short:
> + /* At least 64 bytes to copy, but don't know the alignment yet. */
> + str tmp2, [sp, #-FRAME_SIZE]!
> + and tmp2, src, #3
> + and tmp1, dst, #3
> + cmp tmp1, tmp2
> + bne .Lcpy_notaligned
> +
> +#ifdef USE_VFP
> + /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
> + that the FP pipeline is much better at streaming loads and
> + stores. This is outside the critical loop. */
> + vmov.f32 s0, s0
> #endif
>
> - .global memcpy
> - .type memcpy, %function
> -memcpy:
> -
> - /* Assumes that n >= 0, and dst, src are valid pointers.
> - If there is at least 8 bytes to copy, use LDRD/STRD.
> - If src and dst are misaligned with different offsets,
> - first copy byte by byte until dst is aligned,
> - and then copy using LDRD/STRD and shift if needed.
> - When less than 8 left, copy a word and then byte by byte. */
> -
> - /* Save registers (r0 holds the return value):
> - optimized push {r0, r4, r5, lr}.
> - To try and improve performance, stack layout changed,
> - i.e., not keeping the stack looking like users expect
> - (highest numbered register at highest address). */
> - push {r0, lr}
> - strd r4, r5, [sp, #-8]!
> -
> - /* TODO: Add debug frame directives.
> - We don't need exception unwind directives, because the code below
> - does not throw any exceptions and does not call any other functions.
> - Generally, newlib functions like this lack debug information for
> - assembler source. */
> -
> - /* Get copying of tiny blocks out of the way first. */
> - /* Is there at least 4 bytes to copy? */
> - subs r2, r2, #4
> - blt copy_less_than_4 /* If n < 4. */
> -
> - /* Check word alignment. */
> - ands ip, r0, #3 /* ip = last 2 bits of dst. */
> - bne dst_not_word_aligned /* If dst is not word-aligned. */
> -
> - /* Get here if dst is word-aligned. */
> - ands ip, r1, #3 /* ip = last 2 bits of src. */
> - bne src_not_word_aligned /* If src is not word-aligned. */
> -word_aligned:
> - /* Get here if source and dst both are word-aligned.
> - The number of bytes remaining to copy is r2+4. */
> -
> - /* Is there is at least 64 bytes to copy? */
> - subs r2, r2, #60
> - blt copy_less_than_64 /* If r2 + 4 < 64. */
> -
> - /* First, align the destination buffer to 8-bytes,
> - to make sure double loads and stores don't cross cache line boundary,
> - as they are then more expensive even if the data is in the cache
> - (require two load/store issue cycles instead of one).
> - If only one of the buffers is not 8-bytes aligned,
> - then it's more important to align dst than src,
> - because there is more penalty for stores
> - than loads that cross cacheline boundary.
> - This check and realignment are only worth doing
> - if there is a lot to copy. */
> -
> - /* Get here if dst is word aligned,
> - i.e., the 2 least significant bits are 0.
> - If dst is not 2w aligned (i.e., the 3rd bit is not set in dst),
> - then copy 1 word (4 bytes). */
> - ands r3, r0, #4
> - beq 11f /* If dst already two-word aligned. */
> - ldr r3, [r1], #4
> - str r3, [r0], #4
> - subs r2, r2, #4
> - blt copy_less_than_64
> -
> -11:
> - /* TODO: Align to cacheline (useful for PLD optimization). */
> -
> - /* Every loop iteration copies 64 bytes. */
> + /* SRC and DST have the same mutual 32-bit alignment, but we may
> + still need to pre-copy some bytes to get to natural alignment.
> + We bring DST into full 64-bit alignment. */
> + lsls tmp2, dst, #29
> + beq 1f
> + rsbs tmp2, tmp2, #0
> + sub count, count, tmp2, lsr #29
> + ldrmi tmp1, [src], #4
> + strmi tmp1, [dst], #4
> + lsls tmp2, tmp2, #2
> + ldrhcs tmp1, [src], #2
> + ldrbne tmp2, [src], #1
> + strhcs tmp1, [dst], #2
> + strbne tmp2, [dst], #1
> +
> 1:
> - .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
> - ldrd r4, r5, [r1, \offset]
> - strd r4, r5, [r0, \offset]
> - .endr
> + subs tmp2, count, #64 /* Use tmp2 for count. */
> + blt .Ltail63aligned
> +
> + cmp tmp2, #512
> + bge .Lcpy_body_long
>
> - add r0, r0, #64
> - add r1, r1, #64
> - subs r2, r2, #64
> - bge 1b /* If there is more to copy. */
> +.Lcpy_body_medium: /* Count in tmp2. */
> +#ifdef USE_VFP
> +1:
> + vldr d0, [src, #0]
> + subs tmp2, tmp2, #64
> + vldr d1, [src, #8]
> + vstr d0, [dst, #0]
> + vldr d0, [src, #16]
> + vstr d1, [dst, #8]
> + vldr d1, [src, #24]
> + vstr d0, [dst, #16]
> + vldr d0, [src, #32]
> + vstr d1, [dst, #24]
> + vldr d1, [src, #40]
> + vstr d0, [dst, #32]
> + vldr d0, [src, #48]
> + vstr d1, [dst, #40]
> + vldr d1, [src, #56]
> + vstr d0, [dst, #48]
> + add src, src, #64
> + vstr d1, [dst, #56]
> + add dst, dst, #64
> + bge 1b
> + tst tmp2, #0x3f
> + beq .Ldone
> +
> +.Ltail63aligned: /* Count in tmp2. */
> + and tmp1, tmp2, #0x38
> + add dst, dst, tmp1
> + add src, src, tmp1
> + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> + add pc, pc, tmp1
> +
> + vldr d0, [src, #-56] /* 14 words to go. */
> + vstr d0, [dst, #-56]
> + vldr d0, [src, #-48] /* 12 words to go. */
> + vstr d0, [dst, #-48]
> + vldr d0, [src, #-40] /* 10 words to go. */
> + vstr d0, [dst, #-40]
> + vldr d0, [src, #-32] /* 8 words to go. */
> + vstr d0, [dst, #-32]
> + vldr d0, [src, #-24] /* 6 words to go. */
> + vstr d0, [dst, #-24]
> + vldr d0, [src, #-16] /* 4 words to go. */
> + vstr d0, [dst, #-16]
> + vldr d0, [src, #-8] /* 2 words to go. */
> + vstr d0, [dst, #-8]
> +#else
> + sub src, src, #8
> + sub dst, dst, #8
> +1:
> + ldrd A_l, A_h, [src, #8]
> + strd A_l, A_h, [dst, #8]
> + ldrd A_l, A_h, [src, #16]
> + strd A_l, A_h, [dst, #16]
> + ldrd A_l, A_h, [src, #24]
> + strd A_l, A_h, [dst, #24]
> + ldrd A_l, A_h, [src, #32]
> + strd A_l, A_h, [dst, #32]
> + ldrd A_l, A_h, [src, #40]
> + strd A_l, A_h, [dst, #40]
> + ldrd A_l, A_h, [src, #48]
> + strd A_l, A_h, [dst, #48]
> + ldrd A_l, A_h, [src, #56]
> + strd A_l, A_h, [dst, #56]
> + ldrd A_l, A_h, [src, #64]!
> + strd A_l, A_h, [dst, #64]!
> + subs tmp2, tmp2, #64
> + bge 1b
> + tst tmp2, #0x3f
> + bne 1f
> + ldr tmp2, [sp], #FRAME_SIZE
> + bx lr
> +1:
> + add src, src, #8
> + add dst, dst, #8
> +
> +.Ltail63aligned: /* Count in tmp2. */
> + /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
> + we know that the src and dest are 32-bit aligned so we can use
> + LDRD/STRD to improve efficiency. */
> + /* TMP2 is now negative, but we don't care about that. The bottom
> + six bits still tell us how many bytes are left to copy. */
> +
> + and tmp1, tmp2, #0x38
> + add dst, dst, tmp1
> + add src, src, tmp1
> + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> + add pc, pc, tmp1
> + ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
> + strd A_l, A_h, [dst, #-56]
> + ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
> + strd A_l, A_h, [dst, #-48]
> + ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
> + strd A_l, A_h, [dst, #-40]
> + ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
> + strd A_l, A_h, [dst, #-32]
> + ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
> + strd A_l, A_h, [dst, #-24]
> + ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
> + strd A_l, A_h, [dst, #-16]
> + ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
> + strd A_l, A_h, [dst, #-8]
>
> -copy_less_than_64:
> +#endif
> + tst tmp2, #4
> + ldrne tmp1, [src], #4
> + strne tmp1, [dst], #4
> + lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
> + ldrhcs tmp1, [src], #2
> + ldrbne tmp2, [src]
> + strhcs tmp1, [dst], #2
> + strbne tmp2, [dst]
> +
> +.Ldone:
> + ldr tmp2, [sp], #FRAME_SIZE
> + bx lr
> +
> +.Lcpy_body_long: /* Count in tmp2. */
> +
> + /* Long copy. We know that there's at least (prefetch_lines * 64)
> + bytes to go. */
> +#ifdef USE_VFP
> + /* Don't use PLD. Instead, read some data in advance of the current
> + copy position into a register. This should act like a PLD
> + operation but we won't have to repeat the transfer. */
> +
> + vldr d3, [src, #0]
> + vldr d4, [src, #64]
> + vldr d5, [src, #128]
> + vldr d6, [src, #192]
> + vldr d7, [src, #256]
> +
> + vldr d0, [src, #8]
> + vldr d1, [src, #16]
> + vldr d2, [src, #24]
> + add src, src, #32
> +
> + subs tmp2, tmp2, #prefetch_lines * 64 * 2
> + blt 2f
> +1:
> + cpy_line_vfp d3, 0
> + cpy_line_vfp d4, 64
> + cpy_line_vfp d5, 128
> + add dst, dst, #3 * 64
> + add src, src, #3 * 64
> + cpy_line_vfp d6, 0
> + cpy_line_vfp d7, 64
> + add dst, dst, #2 * 64
> + add src, src, #2 * 64
> + subs tmp2, tmp2, #prefetch_lines * 64
> + bge 1b
>
> - /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
> - Restore the count if there is more than 7 bytes to copy. */
> - adds r2, r2, #56
> - blt copy_less_than_8
> +2:
> + cpy_tail_vfp d3, 0
> + cpy_tail_vfp d4, 64
> + cpy_tail_vfp d5, 128
> + add src, src, #3 * 64
> + add dst, dst, #3 * 64
> + cpy_tail_vfp d6, 0
> + vstr d7, [dst, #64]
> + vldr d7, [src, #64]
> + vstr d0, [dst, #64 + 8]
> + vldr d0, [src, #64 + 8]
> + vstr d1, [dst, #64 + 16]
> + vldr d1, [src, #64 + 16]
> + vstr d2, [dst, #64 + 24]
> + vldr d2, [src, #64 + 24]
> + vstr d7, [dst, #64 + 32]
> + add src, src, #96
> + vstr d0, [dst, #64 + 40]
> + vstr d1, [dst, #64 + 48]
> + vstr d2, [dst, #64 + 56]
> + add dst, dst, #128
> + add tmp2, tmp2, #prefetch_lines * 64
> + b .Lcpy_body_medium
> +#else
> + /* Long copy. Use an SMS style loop to maximize the I/O
> + bandwidth of the core. We don't have enough spare registers
> + to synthesise prefetching, so use PLD operations. */
> + /* Pre-bias src and dst. */
> + sub src, src, #8
> + sub dst, dst, #8
> + pld [src, #8]
> + pld [src, #72]
> + subs tmp2, tmp2, #64
> + pld [src, #136]
> + ldrd A_l, A_h, [src, #8]
> + strd B_l, B_h, [sp, #8]
> + ldrd B_l, B_h, [src, #16]
> + strd C_l, C_h, [sp, #16]
> + ldrd C_l, C_h, [src, #24]
> + strd D_l, D_h, [sp, #24]
> + pld [src, #200]
> + ldrd D_l, D_h, [src, #32]!
> + b 1f
> + .p2align 6
> +2:
> + pld [src, #232]
> + strd A_l, A_h, [dst, #40]
> + ldrd A_l, A_h, [src, #40]
> + strd B_l, B_h, [dst, #48]
> + ldrd B_l, B_h, [src, #48]
> + strd C_l, C_h, [dst, #56]
> + ldrd C_l, C_h, [src, #56]
> + strd D_l, D_h, [dst, #64]!
> + ldrd D_l, D_h, [src, #64]!
> + subs tmp2, tmp2, #64
> +1:
> + strd A_l, A_h, [dst, #8]
> + ldrd A_l, A_h, [src, #8]
> + strd B_l, B_h, [dst, #16]
> + ldrd B_l, B_h, [src, #16]
> + strd C_l, C_h, [dst, #24]
> + ldrd C_l, C_h, [src, #24]
> + strd D_l, D_h, [dst, #32]
> + ldrd D_l, D_h, [src, #32]
> + bcs 2b
> + /* Save the remaining bytes and restore the callee-saved regs. */
> + strd A_l, A_h, [dst, #40]
> + add src, src, #40
> + strd B_l, B_h, [dst, #48]
> + ldrd B_l, B_h, [sp, #8]
> + strd C_l, C_h, [dst, #56]
> + ldrd C_l, C_h, [sp, #16]
> + strd D_l, D_h, [dst, #64]
> + ldrd D_l, D_h, [sp, #24]
> + add dst, dst, #72
> + tst tmp2, #0x3f
> + bne .Ltail63aligned
> + ldr tmp2, [sp], #FRAME_SIZE
> + bx lr
> +#endif
>
> - /* Copy 8 bytes at a time. */
> +.Lcpy_notaligned:
> + pld [src]
> + pld [src, #64]
> + /* There's at least 64 bytes to copy, but there is no mutual
> + alignment. */
> + /* Bring DST to 64-bit alignment. */
> + lsls tmp2, dst, #29
> + pld [src, #(2 * 64)]
> + beq 1f
> + rsbs tmp2, tmp2, #0
> + sub count, count, tmp2, lsr #29
> + ldrmi tmp1, [src], #4
> + strmi tmp1, [dst], #4
> + lsls tmp2, tmp2, #2
> + ldrbne tmp1, [src], #1
> + ldrhcs tmp2, [src], #2
> + strbne tmp1, [dst], #1
> + strhcs tmp2, [dst], #2
> +1:
> + pld [src, #(3 * 64)]
> + subs count, count, #64
> + ldrmi tmp2, [sp], #FRAME_SIZE
> + bmi .Ltail63unaligned
> + pld [src, #(4 * 64)]
> +
> +#ifdef USE_NEON
> + vld1.8 {d0-d3}, [src]!
> + vld1.8 {d4-d7}, [src]!
> + subs count, count, #64
> + bmi 2f
> +1:
> + pld [src, #(4 * 64)]
> + vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
> + vld1.8 {d0-d3}, [src]!
> + vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
> + vld1.8 {d4-d7}, [src]!
> + subs count, count, #64
> + bpl 1b
> 2:
> - ldrd r4, r5, [r1], #8
> - strd r4, r5, [r0], #8
> - subs r2, r2, #8
> - bge 2b /* If there is more to copy. */
> -
> -copy_less_than_8:
> -
> - /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
> - Check if there is more to copy. */
> - cmn r2, #8
> - beq return /* If r2 + 8 == 0. */
> -
> - /* Restore the count if there is more than 3 bytes to copy. */
> - adds r2, r2, #4
> - blt copy_less_than_4
> -
> - /* Copy 4 bytes. */
> - ldr r3, [r1], #4
> - str r3, [r0], #4
> -
> -copy_less_than_4:
> - /* Get here if less than 4 bytes to copy, -4 <= r2 < 0. */
> -
> - /* Restore the count, check if there is more to copy. */
> - adds r2, r2, #4
> - beq return /* If r2 == 0. */
> -
> - /* Get here with r2 is in {1,2,3}={01,10,11}. */
> - /* Logical shift left r2, insert 0s, update flags. */
> - lsls r2, r2, #31
> -
> - /* Copy byte by byte.
> - Condition ne means the last bit of r2 is 0.
> - Condition cs means the second to last bit of r2 is set,
> - i.e., r2 is 1 or 3. */
> - itt ne
> - ldrbne r3, [r1], #1
> - strbne r3, [r0], #1
> -
> - itttt cs
> - ldrbcs r4, [r1], #1
> - ldrbcs r5, [r1]
> - strbcs r4, [r0], #1
> - strbcs r5, [r0]
> -
> -return:
> - /* Restore registers: optimized pop {r0, r4, r5, pc} */
> - ldrd r4, r5, [sp], #8
> - pop {r0, pc} /* This is the only return point of memcpy. */
> -
> -#ifndef __ARM_FEATURE_UNALIGNED
> -
> - /* The following assembly macro implements misaligned copy in software.
> - Assumes that dst is word aligned, src is at offset "pull" bits from
> - word, push = 32 - pull, and the number of bytes that remain to copy
> - is r2 + 4, r2 >= 0. */
> -
> - /* In the code below, r2 is the number of bytes that remain to be
> - written. The number of bytes read is always larger, because we have
> - partial words in the shift queue. */
> -
> - .macro miscopy pull push shiftleft shiftright
> -
> - /* Align src to the previous word boundary. */
> - bic r1, r1, #3
> -
> - /* Initialize the shift queue. */
> - ldr r5, [r1], #4 /* Load a word from source. */
> -
> - subs r2, r2, #4
> - blt 6f /* Go to misaligned copy of less than 8 bytes. */
> -
> - /* Get here if there is more than 8 bytes to copy.
> - The number of bytes to copy is r2+8, r2 >= 0. */
> -
> - /* Save registers: push { r6, r7 }.
> - We need additional registers for LDRD and STRD, because in ARM state
> - the first destination register must be even and the second
> - consecutive. */
> - strd r6, r7, [sp, #-8]!
> -
> - subs r2, r2, #56
> - blt 4f /* Go to misaligned copy of less than 64 bytes. */
> -
> -3:
> - /* Get here if there is more than 64 bytes to copy.
> - The number of bytes to copy is r2+64, r2 >= 0. */
> -
> - /* Copy 64 bytes in every iteration.
> - Use a partial word from the shift queue. */
> - .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
> - mov r6, r5, \shiftleft #\pull
> - ldrd r4, r5, [r1, \offset]
> - orr r6, r6, r4, \shiftright #\push
> - mov r7, r4, \shiftleft #\pull
> - orr r7, r7, r5, \shiftright #\push
> - strd r6, r7, [r0, \offset]
> - .endr
> -
> - add r1, r1, #64
> - add r0, r0, #64
> - subs r2, r2, #64
> - bge 3b
> -
> -4:
> - /* Get here if there is less than 64 bytes to copy (-64 <= r2 < 0)
> - and they are misaligned. */
> -
> - /* Restore the count if there is more than 7 bytes to copy. */
> - adds r2, r2, #56
> -
> - /* If less than 8 bytes to copy,
> - restore registers saved for this loop: optimized poplt { r6, r7 }. */
> - itt lt
> - ldrdlt r6, r7, [sp], #8
> - blt 6f /* Go to misaligned copy of less than 8 bytes. */
> -
> -5:
> - /* Copy 8 bytes at a time.
> - Use a partial word from the shift queue. */
> - mov r6, r5, \shiftleft #\pull
> - ldrd r4, r5, [r1], #8
> - orr r6, r6, r4, \shiftright #\push
> - mov r7, r4, \shiftleft #\pull
> - orr r7, r7, r5, \shiftright #\push
> - strd r6, r7, [r0], #8
> -
> - subs r2, r2, #8
> - bge 5b /* If there is more to copy. */
> -
> - /* Restore registers saved for this loop: optimized pop { r6, r7 }. */
> - ldrd r6, r7, [sp], #8
> -
> -6:
> - /* Get here if there less than 8 bytes to copy (-8 <= r2 < 0)
> - and they are misaligned. */
> -
> - /* Check if there is more to copy. */
> - cmn r2, #8
> - beq return
> -
> - /* Check if there is less than 4 bytes to copy. */
> - cmn r2, #4
> -
> - itt lt
> - /* Restore src offset from word-align. */
> - sublt r1, r1, #(\push / 8)
> - blt copy_less_than_4
> -
> - /* Use a partial word from the shift queue. */
> - mov r3, r5, \shiftleft #\pull
> - /* Load a word from src, but without writeback
> - (this word is not fully written to dst). */
> - ldr r5, [r1]
> -
> - /* Restore src offset from word-align. */
> - add r1, r1, #(\pull / 8)
> -
> - /* Shift bytes to create one dst word and store it. */
> - orr r3, r3, r5, \shiftright #\push
> - str r3, [r0], #4
> -
> - /* Use single byte copying of the remaining bytes. */
> - b copy_less_than_4
> -
> - .endm
> -
> -#endif /* not __ARM_FEATURE_UNALIGNED */
> -
> -dst_not_word_aligned:
> -
> - /* Get here when dst is not aligned and ip has the last 2 bits of dst,
> - i.e., ip is the offset of dst from word.
> - The number of bytes that remains to copy is r2 + 4,
> - i.e., there are at least 4 bytes to copy.
> - Write a partial word (0 to 3 bytes), such that dst becomes
> - word-aligned. */
> -
> - /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
> - then there are (4 - ip) bytes to fill up to align dst to the next
> - word. */
> - rsb ip, ip, #4 /* ip = #4 - ip. */
> - cmp ip, #2
> -
> - /* Copy byte by byte with conditionals. */
> - itt gt
> - ldrbgt r3, [r1], #1
> - strbgt r3, [r0], #1
> -
> - itt ge
> - ldrbge r4, [r1], #1
> - strbge r4, [r0], #1
> -
> - ldrb lr, [r1], #1
> - strb lr, [r0], #1
> -
> - /* Update the count.
> - ip holds the number of bytes we have just copied. */
> - subs r2, r2, ip /* r2 = r2 - ip. */
> - blt copy_less_than_4 /* If r2 < ip. */
> -
> - /* Get here if there are more than 4 bytes to copy.
> - Check if src is aligned. If beforehand src and dst were not word
> - aligned but congruent (same offset), then now they are both
> - word-aligned, and we can copy the rest efficiently (without
> - shifting). */
> - ands ip, r1, #3 /* ip = last 2 bits of src. */
> - beq word_aligned /* If r1 is word-aligned. */
> -
> -src_not_word_aligned:
> - /* Get here when src is not word-aligned, but dst is word-aligned.
> - The number of bytes that remains to copy is r2+4. */
> -
> -#ifdef __ARM_FEATURE_UNALIGNED
> - /* Copy word by word using LDR when alignment can be done in hardware,
> - i.e., SCTLR.A is set, supporting unaligned access in LDR and STR. */
> - subs r2, r2, #60
> - blt 8f
> -
> -7:
> - /* Copy 64 bytes in every loop iteration. */
> - .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
> - ldr r3, [r1, \offset]
> - str r3, [r0, \offset]
> - .endr
> -
> - add r0, r0, #64
> - add r1, r1, #64
> - subs r2, r2, #64
> - bge 7b
> -
> -8:
> - /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
> - Check if there is more than 3 bytes to copy. */
> - adds r2, r2, #60
> - blt copy_less_than_4
> -
> -9:
> - /* Get here if there is less than 64 but at least 4 bytes to copy,
> - where the number of bytes to copy is r2+4. */
> - ldr r3, [r1], #4
> - str r3, [r0], #4
> - subs r2, r2, #4
> - bge 9b
> -
> - b copy_less_than_4
> -
> -#else /* not __ARM_FEATURE_UNALIGNED */
> -
> - /* ip has last 2 bits of src,
> - i.e., ip is the offset of src from word, and ip > 0.
> - Compute shifts needed to copy from src to dst. */
> - cmp ip, #2
> - beq miscopy_16_16 /* If ip == 2. */
> - bge miscopy_24_8 /* If ip == 3. */
> -
> - /* Get here if ip == 1. */
> -
> - /* Endian independent macros for shifting bytes within registers. */
> -
> -#ifndef __ARMEB__
> -miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl
> -miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
> -miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl
> -#else /* not __ARMEB__ */
> -miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr
> -miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
> -miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr
> -#endif /* not __ARMEB__ */
> -
> -#endif /* not __ARM_FEATURE_UNALIGNED */
> + vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
> + vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
> + ands count, count, #0x3f
> +#else
> + /* Use an SMS style loop to maximize the I/O bandwidth. */
> + sub src, src, #4
> + sub dst, dst, #8
> + subs tmp2, count, #64 /* Use tmp2 for count. */
> + ldr A_l, [src, #4]
> + ldr A_h, [src, #8]
> + strd B_l, B_h, [sp, #8]
> + ldr B_l, [src, #12]
> + ldr B_h, [src, #16]
> + strd C_l, C_h, [sp, #16]
> + ldr C_l, [src, #20]
> + ldr C_h, [src, #24]
> + strd D_l, D_h, [sp, #24]
> + ldr D_l, [src, #28]
> + ldr D_h, [src, #32]!
> + b 1f
> + .p2align 6
> +2:
> + pld [src, #(5 * 64) - (32 - 4)]
> + strd A_l, A_h, [dst, #40]
> + ldr A_l, [src, #36]
> + ldr A_h, [src, #40]
> + strd B_l, B_h, [dst, #48]
> + ldr B_l, [src, #44]
> + ldr B_h, [src, #48]
> + strd C_l, C_h, [dst, #56]
> + ldr C_l, [src, #52]
> + ldr C_h, [src, #56]
> + strd D_l, D_h, [dst, #64]!
> + ldr D_l, [src, #60]
> + ldr D_h, [src, #64]!
> + subs tmp2, tmp2, #64
> +1:
> + strd A_l, A_h, [dst, #8]
> + ldr A_l, [src, #4]
> + ldr A_h, [src, #8]
> + strd B_l, B_h, [dst, #16]
> + ldr B_l, [src, #12]
> + ldr B_h, [src, #16]
> + strd C_l, C_h, [dst, #24]
> + ldr C_l, [src, #20]
> + ldr C_h, [src, #24]
> + strd D_l, D_h, [dst, #32]
> + ldr D_l, [src, #28]
> + ldr D_h, [src, #32]
> + bcs 2b
> +
> + /* Save the remaining bytes and restore the callee-saved regs. */
> + strd A_l, A_h, [dst, #40]
> + add src, src, #36
> + strd B_l, B_h, [dst, #48]
> + ldrd B_l, B_h, [sp, #8]
> + strd C_l, C_h, [dst, #56]
> + ldrd C_l, C_h, [sp, #16]
> + strd D_l, D_h, [dst, #64]
> + ldrd D_l, D_h, [sp, #24]
> + add dst, dst, #72
> + ands count, tmp2, #0x3f
> +#endif
> + ldr tmp2, [sp], #FRAME_SIZE
> + bne .Ltail63unaligned
> + bx lr
> +
> + .size memcpy, . - memcpy
>
> #endif /* memcpy */
>
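
A note on the tail handling in the new code, since it reads oddly at first:
the "add pc, pc, tmp1" sequences at .Ltail63unaligned and .Ltail63aligned
compute a branch into the middle of an unrolled run of load/store pairs
(the PC_OFFSET/INSN_SIZE terms compensate for the ARM PC reading 8 bytes
ahead), so the residual 0-63 bytes are copied without a loop. A rough C
analogue is the classic fall-through switch; this is a hypothetical sketch
for illustration only, and tail63_copy is not part of the patch:

#include <stddef.h>
#include <string.h>

static void tail63_copy (unsigned char *dst, const unsigned char *src,
                         size_t count /* 0..63 */)
{
  size_t dwords = count >> 3;   /* number of full 8-byte chunks */
  dst += dwords * 8;
  src += dwords * 8;
  switch (dwords)               /* deliberate fall-through */
    {
    case 7: memcpy (dst - 56, src - 56, 8); /* FALLTHRU */
    case 6: memcpy (dst - 48, src - 48, 8); /* FALLTHRU */
    case 5: memcpy (dst - 40, src - 40, 8); /* FALLTHRU */
    case 4: memcpy (dst - 32, src - 32, 8); /* FALLTHRU */
    case 3: memcpy (dst - 24, src - 24, 8); /* FALLTHRU */
    case 2: memcpy (dst - 16, src - 16, 8); /* FALLTHRU */
    case 1: memcpy (dst - 8,  src - 8,  8); /* FALLTHRU */
    case 0: break;
    }
  /* Mop up the last word, halfword and byte, as the conditional
     ldr/ldrh/ldrb sequence does. */
  if (count & 4) { memcpy (dst, src, 4); dst += 4; src += 4; }
  if (count & 2) { memcpy (dst, src, 2); dst += 2; src += 2; }
  if (count & 1) *dst = *src;
}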
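
The "lsls tmp2, dst, #29" idiom in .Lcpy_not_short and .Lcpy_notaligned is
similarly flag-driven: shifting dst left by 29 moves its low three bits into
the condition flags, so one word, one halfword and one byte can be copied
conditionally to bring dst to 64-bit alignment. In C terms (again a
hypothetical sketch; pre_align_dst8 is not part of the patch):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static size_t pre_align_dst8 (unsigned char **dp, const unsigned char **sp)
{
  /* Bytes needed to reach the next 8-byte boundary: (8 - (dst & 7)) & 7. */
  size_t pre = (size_t) (-(uintptr_t) *dp & 7);
  /* One optional word, one optional halfword, one optional byte,
     mirroring the ldrmi/ldrhcs/ldrbne sequence. */
  if (pre & 4) { memcpy (*dp, *sp, 4); *dp += 4; *sp += 4; }
  if (pre & 2) { memcpy (*dp, *sp, 2); *dp += 2; *sp += 2; }
  if (pre & 1) { **dp = **sp; *dp += 1; *sp += 1; }
  return pre;   /* the caller subtracts this from count */
}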
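
Finally, the long-copy loops are software-pipelined ("SMS style" in the
comments): each iteration stores the block loaded by the previous iteration
while loading the next one, so loads and stores stay in flight together and
the loads act as a prefetch. Very roughly, and assuming 8-byte-aligned
pointers (pipelined_copy is illustrative only, not the patch's code):

#include <stddef.h>
#include <stdint.h>

static void pipelined_copy (uint64_t *dst, const uint64_t *src, size_t n)
{
  /* n is a count of 8-byte units, a multiple of 4 and at least 4. */
  /* Prime the pipeline: load the first block before storing anything. */
  uint64_t a = src[0], b = src[1], c = src[2], d = src[3];
  size_t i;
  for (i = 4; i + 4 <= n; i += 4)
    {
      /* Store the previous block while loading the next one. */
      dst[i - 4] = a;  a = src[i];
      dst[i - 3] = b;  b = src[i + 1];
      dst[i - 2] = c;  c = src[i + 2];
      dst[i - 1] = d;  d = src[i + 3];
    }
  /* Drain: store the final block. */
  dst[i - 4] = a;
  dst[i - 3] = b;
  dst[i - 2] = c;
  dst[i - 1] = d;
}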