This is the mail archive of the
newlib@sourceware.org
mailing list for the newlib project.
RE: [PATCH][AArch64] Tune memcpy
- From: "Wilco Dijkstra" <Wilco dot Dijkstra at arm dot com>
- To: <newlib at sourceware dot org>
- Date: Tue, 10 Nov 2015 13:15:25 -0000
- Subject: RE: [PATCH][AArch64] Tune memcpy
- Authentication-results: sourceware.org; auth=none
- References: <000001d117fc$6fafe190$4f0fa4b0$ at arm dot com> <563CAC12 dot 4030605 at foss dot arm dot com>
ping
> -----Original Message-----
> From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
> Sent: 06 November 2015 14:09
> To: 'Richard Earnshaw'; newlib@sourceware.org
> Subject: RE: [PATCH][AArch64] Tune memcpy
>
> > Richard Earnshaw wrote:
> > On 05/11/15 19:01, Wilco Dijkstra wrote:
> > > This patch further tunes memcpy on AArch64 - avoid one branch for
> > > sizes 1-3, add a prefetch and improve small copies that are exact
> > > powers of 2.
> > >
> > > OK for commit?
> > >
> > > ChangeLog:
> > > 2015-11-06 Wilco Dijkstra <wdijkstr@arm.com>
> > >
> > > * newlib/libc/machine/aarch64/memcpy.S (memcpy):
> > > Further tuning for performance.
>
> > I think the above is worthy of a comment highlighting that src,
> > src+tmp1 and
> > srcend-1 are all the same when count==1; similarly for dst addresses.
> >
> > OK with that change.
>
> I've added a comment, see below. Could someone check this in?
>
> + /* Copy 0..3 bytes. Use a branchless sequence that copies the same
> + byte 3 times if count==1, or the 2nd byte twice if count==2. */
>
>
> ---
> newlib/libc/machine/aarch64/memcpy.S | 56 ++++++++++++++++++++----------------
> 1 file changed, 32 insertions(+), 24 deletions(-)
>
> diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S
> index c109684..463bad0 100644
> --- a/newlib/libc/machine/aarch64/memcpy.S
> +++ b/newlib/libc/machine/aarch64/memcpy.S
> @@ -73,6 +73,7 @@
> #define A_h x7
> #define A_hw w7
> #define B_l x8
> +#define B_lw w8
> #define B_h x9
> #define C_l x10
> #define C_h x11
> @@ -104,21 +105,40 @@
> */
>
> def_fn memcpy p2align=6
> + prfm PLDL1KEEP, [src]
> add srcend, src, count
> add dstend, dstin, count
> + cmp count, 16
> + b.ls L(copy16)
> cmp count, 96
> b.hi L(copy_long)
> - cmp count, 16
> - b.hs L(copy_medium)
>
> + /* Medium copies: 17..96 bytes. */
> + sub tmp1, count, 1
> + ldp A_l, A_h, [src]
> + tbnz tmp1, 6, L(copy96)
> + ldp D_l, D_h, [srcend, -16]
> + tbz tmp1, 5, 1f
> + ldp B_l, B_h, [src, 16]
> + ldp C_l, C_h, [srcend, -32]
> + stp B_l, B_h, [dstin, 16]
> + stp C_l, C_h, [dstend, -32]
> +1:
> + stp A_l, A_h, [dstin]
> + stp D_l, D_h, [dstend, -16]
> + ret
> +
> + .p2align 4
> /* Small copies: 0..16 bytes. */
> L(copy16):
> - tbz count, 3, 1f
> + cmp count, 8
> + b.lo 1f
> ldr A_l, [src]
> ldr A_h, [srcend, -8]
> str A_l, [dstin]
> str A_h, [dstend, -8]
> ret
> + .p2align 4
> 1:
> tbz count, 2, 1f
> ldr A_lw, [src]
> @@ -126,33 +146,21 @@ L(copy16):
> str A_lw, [dstin]
> str A_hw, [dstend, -4]
> ret
> - .p2align 4
> +
> + /* Copy 0..3 bytes. Use a branchless sequence that copies the same
> + byte 3 times if count==1, or the 2nd byte twice if count==2. */
> 1:
> cbz count, 2f
> + lsr tmp1, count, 1
> ldrb A_lw, [src]
> - tbz count, 1, 1f
> - ldrh A_hw, [srcend, -2]
> - strh A_hw, [dstend, -2]
> -1: strb A_lw, [dstin]
> + ldrb A_hw, [srcend, -1]
> + ldrb B_lw, [src, tmp1]
> + strb A_lw, [dstin]
> + strb B_lw, [dstin, tmp1]
> + strb A_hw, [dstend, -1]
> 2: ret
>
> .p2align 4
> - /* Medium copies: 17..96 bytes. */
> -L(copy_medium):
> - ldp A_l, A_h, [src]
> - tbnz count, 6, L(copy96)
> - ldp D_l, D_h, [srcend, -16]
> - tbz count, 5, 1f
> - ldp B_l, B_h, [src, 16]
> - ldp C_l, C_h, [srcend, -32]
> - stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstend, -32]
> -1:
> - stp A_l, A_h, [dstin]
> - stp D_l, D_h, [dstend, -16]
> - ret
> -
> - .p2align 4
> /* Copy 64..96 bytes. Copy 64 bytes from the start and
> 32 bytes from the end. */
> L(copy96):
> --
> 1.9.1