This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH][AArch64] Tune memcpy


On 05/11/15 19:01, Wilco Dijkstra wrote:
> This patch further tunes memcpy on AArch64 - avoid one branch for sizes 1-3,
> add a prefetch and improve small copies that are exact powers of 2.
> 
> OK for commit?
> 
> ChangeLog:
> 2015-11-06  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* newlib/libc/machine/aarch64/memcpy.S (memcpy):
> 	Further tuning for performance.
> 
> ---
>  newlib/libc/machine/aarch64/memcpy.S | 53
> ++++++++++++++++++++----------------
>  1 file changed, 29 insertions(+), 24 deletions(-)
> 
> diff --git a/newlib/libc/machine/aarch64/memcpy.S
> b/newlib/libc/machine/aarch64/memcpy.S
> index c109684..b53d125 100644
> --- a/newlib/libc/machine/aarch64/memcpy.S
> +++ b/newlib/libc/machine/aarch64/memcpy.S
> @@ -73,6 +73,7 @@
>  #define A_h	x7
>  #define A_hw	w7
>  #define B_l	x8
> +#define B_lw	w8
>  #define B_h	x9
>  #define C_l	x10
>  #define C_h	x11
> @@ -104,21 +105,40 @@
>  */
>  
>  def_fn memcpy p2align=6
> +	prfm	PLDL1KEEP, [src]
>  	add	srcend, src, count
>  	add	dstend, dstin, count
> +	cmp	count, 16
> +	b.ls	L(copy16)
>  	cmp	count, 96
>  	b.hi	L(copy_long)
> -	cmp	count, 16
> -	b.hs	L(copy_medium)
>  
> +	/* Medium copies: 17..96 bytes.  */
> +	sub	tmp1, count, 1
> +	ldp	A_l, A_h, [src]
> +	tbnz	tmp1, 6, L(copy96)
> +	ldp	D_l, D_h, [srcend, -16]
> +	tbz	tmp1, 5, 1f
> +	ldp	B_l, B_h, [src, 16]
> +	ldp	C_l, C_h, [srcend, -32]
> +	stp	B_l, B_h, [dstin, 16]
> +	stp	C_l, C_h, [dstend, -32]
> +1:
> +	stp	A_l, A_h, [dstin]
> +	stp	D_l, D_h, [dstend, -16]
> +	ret
> +
> +	.p2align 4
>  	/* Small copies: 0..16 bytes.  */
>  L(copy16):
> -	tbz	count, 3, 1f
> +	cmp	count, 8
> +	b.lo	1f
>  	ldr	A_l, [src]
>  	ldr	A_h, [srcend, -8]
>  	str	A_l, [dstin]
>  	str	A_h, [dstend, -8]
>  	ret
> +	.p2align 4
>  1:
>  	tbz	count, 2, 1f
>  	ldr	A_lw, [src]
> @@ -126,33 +146,18 @@ L(copy16):
>  	str	A_lw, [dstin]
>  	str	A_hw, [dstend, -4]
>  	ret
> -	.p2align 4
>  1:
>  	cbz	count, 2f
> +	lsr	tmp1, count, 1
>  	ldrb	A_lw, [src]
> -	tbz	count, 1, 1f
> -	ldrh	A_hw, [srcend, -2]
> -	strh	A_hw, [dstend, -2]
> -1:	strb	A_lw, [dstin]
> +	ldrb	A_hw, [srcend, -1]
> +	ldrb	B_lw, [src, tmp1]
> +	strb	A_lw, [dstin]
> +	strb	B_lw, [dstin, tmp1]
> +	strb	A_hw, [dstend, -1]
>  2:	ret
>  
>  	.p2align 4
> -	/* Medium copies: 17..96 bytes.	 */
> -L(copy_medium):
> -	ldp	A_l, A_h, [src]
> -	tbnz	count, 6, L(copy96)
> -	ldp	D_l, D_h, [srcend, -16]
> -	tbz	count, 5, 1f
> -	ldp	B_l, B_h, [src, 16]
> -	ldp	C_l, C_h, [srcend, -32]
> -	stp	B_l, B_h, [dstin, 16]
> -	stp	C_l, C_h, [dstend, -32]
> -1:
> -	stp	A_l, A_h, [dstin]
> -	stp	D_l, D_h, [dstend, -16]
> -	ret
> -
> -	.p2align 4
>  	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
>  	   32 bytes from the end.  */
>  L(copy96):
> 
> 
> Tune-memcpy.patch
> 
> 
> 
> 
> ---
>  newlib/libc/machine/aarch64/memcpy.S | 53 ++++++++++++++++++++----------------
>  1 file changed, 29 insertions(+), 24 deletions(-)
> 
> diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S
> index c109684..b53d125 100644
> --- a/newlib/libc/machine/aarch64/memcpy.S
> +++ b/newlib/libc/machine/aarch64/memcpy.S
> @@ -73,6 +73,7 @@
>  #define A_h	x7
>  #define A_hw	w7
>  #define B_l	x8
> +#define B_lw	w8
>  #define B_h	x9
>  #define C_l	x10
>  #define C_h	x11
> @@ -104,21 +105,40 @@
>  */
>  
>  def_fn memcpy p2align=6
> +	prfm	PLDL1KEEP, [src]
>  	add	srcend, src, count
>  	add	dstend, dstin, count
> +	cmp	count, 16
> +	b.ls	L(copy16)
>  	cmp	count, 96
>  	b.hi	L(copy_long)
> -	cmp	count, 16
> -	b.hs	L(copy_medium)
>  
> +	/* Medium copies: 17..96 bytes.  */
> +	sub	tmp1, count, 1
> +	ldp	A_l, A_h, [src]
> +	tbnz	tmp1, 6, L(copy96)
> +	ldp	D_l, D_h, [srcend, -16]
> +	tbz	tmp1, 5, 1f
> +	ldp	B_l, B_h, [src, 16]
> +	ldp	C_l, C_h, [srcend, -32]
> +	stp	B_l, B_h, [dstin, 16]
> +	stp	C_l, C_h, [dstend, -32]
> +1:
> +	stp	A_l, A_h, [dstin]
> +	stp	D_l, D_h, [dstend, -16]
> +	ret
> +
> +	.p2align 4
>  	/* Small copies: 0..16 bytes.  */
>  L(copy16):
> -	tbz	count, 3, 1f
> +	cmp	count, 8
> +	b.lo	1f
>  	ldr	A_l, [src]
>  	ldr	A_h, [srcend, -8]
>  	str	A_l, [dstin]
>  	str	A_h, [dstend, -8]
>  	ret
> +	.p2align 4
>  1:
>  	tbz	count, 2, 1f
>  	ldr	A_lw, [src]
> @@ -126,33 +146,18 @@ L(copy16):
>  	str	A_lw, [dstin]
>  	str	A_hw, [dstend, -4]
>  	ret
> -	.p2align 4
>  1:
>  	cbz	count, 2f
> +	lsr	tmp1, count, 1
>  	ldrb	A_lw, [src]
> -	tbz	count, 1, 1f
> -	ldrh	A_hw, [srcend, -2]
> -	strh	A_hw, [dstend, -2]
> -1:	strb	A_lw, [dstin]
> +	ldrb	A_hw, [srcend, -1]
> +	ldrb	B_lw, [src, tmp1]
> +	strb	A_lw, [dstin]
> +	strb	B_lw, [dstin, tmp1]
> +	strb	A_hw, [dstend, -1]
>  2:	ret

I think the above is worthy of a comment highlighting that src, src+tmp1
and srcend-1 are all the same address when count==1 (tmp1 = count >> 1 is
then 0), so the three byte copies overlap; similarly for the dst addresses.

OK with that change.

R.

>  
>  	.p2align 4
> -	/* Medium copies: 17..96 bytes.	 */
> -L(copy_medium):
> -	ldp	A_l, A_h, [src]
> -	tbnz	count, 6, L(copy96)
> -	ldp	D_l, D_h, [srcend, -16]
> -	tbz	count, 5, 1f
> -	ldp	B_l, B_h, [src, 16]
> -	ldp	C_l, C_h, [srcend, -32]
> -	stp	B_l, B_h, [dstin, 16]
> -	stp	C_l, C_h, [dstend, -32]
> -1:
> -	stp	A_l, A_h, [dstin]
> -	stp	D_l, D_h, [dstend, -16]
> -	ret
> -
> -	.p2align 4
>  	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
>  	   32 bytes from the end.  */
>  L(copy96):
> 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]