[PATCH][AArch64] Tune memcpy

pinskia@gmail.com pinskia@gmail.com
Fri Nov 6 14:13:00 GMT 2015



> On Nov 6, 2015, at 3:01 AM, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> 
> This patch further tunes memcpy on AArch64 - avoid one branch for sizes 1-3,
> add a prefetch and improve small copies that are exact powers of 2.
> 
> OK for commit?
> 
> ChangeLog:
> 2015-11-06  Wilco Dijkstra  <wdijkstr@arm.com>
> 
>    * newlib/libc/machine/aarch64/memcpy.S (memcpy):
>    Further tuning for performance.
> 
> ---
> newlib/libc/machine/aarch64/memcpy.S | 53
> ++++++++++++++++++++----------------
> 1 file changed, 29 insertions(+), 24 deletions(-)
> 
> diff --git a/newlib/libc/machine/aarch64/memcpy.S
> b/newlib/libc/machine/aarch64/memcpy.S
> index c109684..b53d125 100644
> --- a/newlib/libc/machine/aarch64/memcpy.S
> +++ b/newlib/libc/machine/aarch64/memcpy.S
> @@ -73,6 +73,7 @@
> #define A_h    x7
> #define A_hw    w7
> #define B_l    x8
> +#define B_lw    w8
> #define B_h    x9
> #define C_l    x10
> #define C_h    x11
> @@ -104,21 +105,40 @@
> */
> 
> def_fn memcpy p2align=6
> +    prfm    PLDL1KEEP, [src]

Why use PLDL1KEEP rather than PLDL1STRM (the streaming hint) for the prefetches?

Thanks,
Andrew


>    add    srcend, src, count
>    add    dstend, dstin, count
> +    cmp    count, 16
> +    b.ls    L(copy16)
>    cmp    count, 96
>    b.hi    L(copy_long)
> -    cmp    count, 16
> -    b.hs    L(copy_medium)
> 
> +    /* Medium copies: 17..96 bytes.  */
> +    sub    tmp1, count, 1
> +    ldp    A_l, A_h, [src]
> +    tbnz    tmp1, 6, L(copy96)
> +    ldp    D_l, D_h, [srcend, -16]
> +    tbz    tmp1, 5, 1f
> +    ldp    B_l, B_h, [src, 16]
> +    ldp    C_l, C_h, [srcend, -32]
> +    stp    B_l, B_h, [dstin, 16]
> +    stp    C_l, C_h, [dstend, -32]
> +1:
> +    stp    A_l, A_h, [dstin]
> +    stp    D_l, D_h, [dstend, -16]
> +    ret
> +
> +    .p2align 4
>    /* Small copies: 0..16 bytes.  */
> L(copy16):
> -    tbz    count, 3, 1f
> +    cmp    count, 8
> +    b.lo    1f
>    ldr    A_l, [src]
>    ldr    A_h, [srcend, -8]
>    str    A_l, [dstin]
>    str    A_h, [dstend, -8]
>    ret
> +    .p2align 4
> 1:
>    tbz    count, 2, 1f
>    ldr    A_lw, [src]
> @@ -126,33 +146,18 @@ L(copy16):
>    str    A_lw, [dstin]
>    str    A_hw, [dstend, -4]
>    ret
> -    .p2align 4
> 1:
>    cbz    count, 2f
> +    lsr    tmp1, count, 1
>    ldrb    A_lw, [src]
> -    tbz    count, 1, 1f
> -    ldrh    A_hw, [srcend, -2]
> -    strh    A_hw, [dstend, -2]
> -1:    strb    A_lw, [dstin]
> +    ldrb    A_hw, [srcend, -1]
> +    ldrb    B_lw, [src, tmp1]
> +    strb    A_lw, [dstin]
> +    strb    B_lw, [dstin, tmp1]
> +    strb    A_hw, [dstend, -1]
> 2:    ret
> 
>    .p2align 4
> -    /* Medium copies: 17..96 bytes.     */
> -L(copy_medium):
> -    ldp    A_l, A_h, [src]
> -    tbnz    count, 6, L(copy96)
> -    ldp    D_l, D_h, [srcend, -16]
> -    tbz    count, 5, 1f
> -    ldp    B_l, B_h, [src, 16]
> -    ldp    C_l, C_h, [srcend, -32]
> -    stp    B_l, B_h, [dstin, 16]
> -    stp    C_l, C_h, [dstend, -32]
> -1:
> -    stp    A_l, A_h, [dstin]
> -    stp    D_l, D_h, [dstend, -16]
> -    ret
> -
> -    .p2align 4
>    /* Copy 64..96 bytes.  Copy 64 bytes from the start and
>       32 bytes from the end.  */
> L(copy96):
> -- 
> 1.9.1
> 
> 
> 
>    
> <Tune-memcpy.patch>



More information about the Newlib mailing list