This is the mail archive of the
newlib@sourceware.org
mailing list for the newlib project.
Re: [PATCH][AArch64] Tune memcpy
- From: Richard Earnshaw <Richard dot Earnshaw at foss dot arm dot com>
- To: Wilco Dijkstra <Wilco dot Dijkstra at arm dot com>, newlib at sourceware dot org
- Date: Fri, 6 Nov 2015 13:33:06 +0000
- Subject: Re: [PATCH][AArch64] Tune memcpy
- Authentication-results: sourceware.org; auth=none
- References: <000001d117fc$6fafe190$4f0fa4b0$ at arm dot com>
On 05/11/15 19:01, Wilco Dijkstra wrote:
> This patch further tunes memcpy on AArch64 - avoid one branch for sizes 1-3,
> add a prefetch and improve small copies that are exact powers of 2.
>
> OK for commit?
>
> ChangeLog:
> 2015-11-06 Wilco Dijkstra <wdijkstr@arm.com>
>
> * newlib/libc/machine/aarch64/memcpy.S (memcpy):
> Further tuning for performance.
>
> ---
> newlib/libc/machine/aarch64/memcpy.S | 53
> ++++++++++++++++++++----------------
> 1 file changed, 29 insertions(+), 24 deletions(-)
>
> diff --git a/newlib/libc/machine/aarch64/memcpy.S
> b/newlib/libc/machine/aarch64/memcpy.S
> index c109684..b53d125 100644
> --- a/newlib/libc/machine/aarch64/memcpy.S
> +++ b/newlib/libc/machine/aarch64/memcpy.S
> @@ -73,6 +73,7 @@
> #define A_h x7
> #define A_hw w7
> #define B_l x8
> +#define B_lw w8
> #define B_h x9
> #define C_l x10
> #define C_h x11
> @@ -104,21 +105,40 @@
> */
>
> def_fn memcpy p2align=6
> + prfm PLDL1KEEP, [src]
> add srcend, src, count
> add dstend, dstin, count
> + cmp count, 16
> + b.ls L(copy16)
> cmp count, 96
> b.hi L(copy_long)
> - cmp count, 16
> - b.hs L(copy_medium)
>
> + /* Medium copies: 17..96 bytes. */
> + sub tmp1, count, 1
> + ldp A_l, A_h, [src]
> + tbnz tmp1, 6, L(copy96)
> + ldp D_l, D_h, [srcend, -16]
> + tbz tmp1, 5, 1f
> + ldp B_l, B_h, [src, 16]
> + ldp C_l, C_h, [srcend, -32]
> + stp B_l, B_h, [dstin, 16]
> + stp C_l, C_h, [dstend, -32]
> +1:
> + stp A_l, A_h, [dstin]
> + stp D_l, D_h, [dstend, -16]
> + ret
> +
> + .p2align 4
> /* Small copies: 0..16 bytes. */
> L(copy16):
> - tbz count, 3, 1f
> + cmp count, 8
> + b.lo 1f
> ldr A_l, [src]
> ldr A_h, [srcend, -8]
> str A_l, [dstin]
> str A_h, [dstend, -8]
> ret
> + .p2align 4
> 1:
> tbz count, 2, 1f
> ldr A_lw, [src]
> @@ -126,33 +146,18 @@ L(copy16):
> str A_lw, [dstin]
> str A_hw, [dstend, -4]
> ret
> - .p2align 4
> 1:
> cbz count, 2f
> + lsr tmp1, count, 1
> ldrb A_lw, [src]
> - tbz count, 1, 1f
> - ldrh A_hw, [srcend, -2]
> - strh A_hw, [dstend, -2]
> -1: strb A_lw, [dstin]
> + ldrb A_hw, [srcend, -1]
> + ldrb B_lw, [src, tmp1]
> + strb A_lw, [dstin]
> + strb B_lw, [dstin, tmp1]
> + strb A_hw, [dstend, -1]
> 2: ret
>
> .p2align 4
> - /* Medium copies: 17..96 bytes. */
> -L(copy_medium):
> - ldp A_l, A_h, [src]
> - tbnz count, 6, L(copy96)
> - ldp D_l, D_h, [srcend, -16]
> - tbz count, 5, 1f
> - ldp B_l, B_h, [src, 16]
> - ldp C_l, C_h, [srcend, -32]
> - stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstend, -32]
> -1:
> - stp A_l, A_h, [dstin]
> - stp D_l, D_h, [dstend, -16]
> - ret
> -
> - .p2align 4
> /* Copy 64..96 bytes. Copy 64 bytes from the start and
> 32 bytes from the end. */
> L(copy96):
>
>
> Tune-memcpy.patch
>
>
>
>
> ---
> newlib/libc/machine/aarch64/memcpy.S | 53 ++++++++++++++++++++----------------
> 1 file changed, 29 insertions(+), 24 deletions(-)
>
> diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S
> index c109684..b53d125 100644
> --- a/newlib/libc/machine/aarch64/memcpy.S
> +++ b/newlib/libc/machine/aarch64/memcpy.S
> @@ -73,6 +73,7 @@
> #define A_h x7
> #define A_hw w7
> #define B_l x8
> +#define B_lw w8
> #define B_h x9
> #define C_l x10
> #define C_h x11
> @@ -104,21 +105,40 @@
> */
>
> def_fn memcpy p2align=6
> + prfm PLDL1KEEP, [src]
> add srcend, src, count
> add dstend, dstin, count
> + cmp count, 16
> + b.ls L(copy16)
> cmp count, 96
> b.hi L(copy_long)
> - cmp count, 16
> - b.hs L(copy_medium)
>
> + /* Medium copies: 17..96 bytes. */
> + sub tmp1, count, 1
> + ldp A_l, A_h, [src]
> + tbnz tmp1, 6, L(copy96)
> + ldp D_l, D_h, [srcend, -16]
> + tbz tmp1, 5, 1f
> + ldp B_l, B_h, [src, 16]
> + ldp C_l, C_h, [srcend, -32]
> + stp B_l, B_h, [dstin, 16]
> + stp C_l, C_h, [dstend, -32]
> +1:
> + stp A_l, A_h, [dstin]
> + stp D_l, D_h, [dstend, -16]
> + ret
> +
> + .p2align 4
> /* Small copies: 0..16 bytes. */
> L(copy16):
> - tbz count, 3, 1f
> + cmp count, 8
> + b.lo 1f
> ldr A_l, [src]
> ldr A_h, [srcend, -8]
> str A_l, [dstin]
> str A_h, [dstend, -8]
> ret
> + .p2align 4
> 1:
> tbz count, 2, 1f
> ldr A_lw, [src]
> @@ -126,33 +146,18 @@ L(copy16):
> str A_lw, [dstin]
> str A_hw, [dstend, -4]
> ret
> - .p2align 4
> 1:
> cbz count, 2f
> + lsr tmp1, count, 1
> ldrb A_lw, [src]
> - tbz count, 1, 1f
> - ldrh A_hw, [srcend, -2]
> - strh A_hw, [dstend, -2]
> -1: strb A_lw, [dstin]
> + ldrb A_hw, [srcend, -1]
> + ldrb B_lw, [src, tmp1]
> + strb A_lw, [dstin]
> + strb B_lw, [dstin, tmp1]
> + strb A_hw, [dstend, -1]
> 2: ret
I think the above is worthy of a comment highlighting that src, src+tmp1
and srcend-1 all refer to the same byte when count==1 (tmp1 is zero then),
so the overlapping loads and stores are harmless; similarly for the dst
addresses.
OK with that change.
R.
>
> .p2align 4
> - /* Medium copies: 17..96 bytes. */
> -L(copy_medium):
> - ldp A_l, A_h, [src]
> - tbnz count, 6, L(copy96)
> - ldp D_l, D_h, [srcend, -16]
> - tbz count, 5, 1f
> - ldp B_l, B_h, [src, 16]
> - ldp C_l, C_h, [srcend, -32]
> - stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstend, -32]
> -1:
> - stp A_l, A_h, [dstin]
> - stp D_l, D_h, [dstend, -16]
> - ret
> -
> - .p2align 4
> /* Copy 64..96 bytes. Copy 64 bytes from the start and
> 32 bytes from the end. */
> L(copy96):
>