Re: [PATCH 2/2] aarch64,falkor: Use vector registers for copy
On 29/06/2018 07:43, Siddhesh Poyarekar wrote:
> Vector registers perform better than scalar register pairs for copying
> data so prefer them instead. This results in a time reduction of over
> 50% (i.e. 2x speed improvement) for some smaller sizes in memcpy-walk.
> Larger sizes show improvements of around 1% to 2%. memcpy-random shows
> a very small improvement, in the range of 1-2%.
>
> * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
> Use vector registers.
LGTM, although I can't really vouch for the performance improvement (I did
not see any difference on an A72).
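
For readers skimming the patch below, the core of the change is replacing
each 64-bit scalar register-pair load/store with a single 128-bit SIMD
load/store.  A minimal illustrative sketch (register choices here are
arbitrary examples, not taken from the patch; x1 = src, x0 = dst):

	/* before: 16 bytes via a scalar register pair */
	ldp	x6, x7, [x1]		/* load 16 bytes from src  */
	stp	x6, x7, [x0]		/* store 16 bytes to dst   */

	/* after: 16 bytes via one SIMD quad register */
	ldr	q0, [x1]		/* load 16 bytes from src  */
	str	q0, [x0]		/* store 16 bytes to dst   */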
>
> ---
> sysdeps/aarch64/multiarch/memcpy_falkor.S | 135 ++++++++++------------
> 1 file changed, 64 insertions(+), 71 deletions(-)
>
> diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
> index eef1fb02e4..f1e905d34d 100644
> --- a/sysdeps/aarch64/multiarch/memcpy_falkor.S
> +++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
> @@ -30,25 +30,19 @@
> #define dst x3
> #define srcend x4
> #define dstend x5
> -#define A_l x6
> -#define A_lw w6
> -#define A_h x7
> -#define A_hw w7
> #define tmp1 x14
> +#define A_x x6
> +#define B_x x7
> +#define A_w w6
> +#define B_w w7
>
> -#define B_l x8
> -#define B_lw w8
> -#define B_h x9
> -#define C_l x10
> -#define C_h x11
> -#define D_l x12
> -#define D_h x13
> -#define E_l dst
> -#define E_h tmp1
> -#define F_l src
> -#define F_h count
> -#define G_l srcend
> -#define G_h x15
> +#define A_q q0
> +#define B_q q1
> +#define C_q q2
> +#define D_q q3
> +#define E_q q4
> +#define F_q q5
> +#define G_q q6
>
> /* Copies are split into 3 main cases:
>
> @@ -68,9 +62,9 @@
> bumping up the small copies up to 32 bytes allows us to do that without
> cost and also allows us to reduce the size of the prep code before loop64.
>
> - All copies are done only via two registers r6 and r7. This is to ensure
> - that all loads hit a single hardware prefetcher which can get correctly
> - trained to prefetch a single stream.
> + The copy loop uses only one register q0. This is to ensure that all loads
> + hit a single hardware prefetcher which can get correctly trained to prefetch
> + a single stream.
>
> The non-temporal stores help optimize cache utilization. */
>
> @@ -91,29 +85,29 @@ ENTRY(__memcpy_falkor)
> add srcend, src, count
> add dstend, dstin, count
> b.ls L(copy32)
> - ldp A_l, A_h, [src]
> + ldr A_q, [src]
> cmp count, 128
> - stp A_l, A_h, [dstin]
> + str A_q, [dstin]
> b.hi L(copy_long)
>
> /* Medium copies: 33..128 bytes. */
> sub tmp1, count, 1
> - ldp A_l, A_h, [src, 16]
> - ldp B_l, B_h, [srcend, -32]
> - ldp C_l, C_h, [srcend, -16]
> + ldr A_q, [src, 16]
> + ldr B_q, [srcend, -32]
> + ldr C_q, [srcend, -16]
> tbz tmp1, 6, 1f
> - ldp D_l, D_h, [src, 32]
> - ldp E_l, E_h, [src, 48]
> - stp D_l, D_h, [dstin, 32]
> - stp E_l, E_h, [dstin, 48]
> - ldp F_l, F_h, [srcend, -64]
> - ldp G_l, G_h, [srcend, -48]
> - stp F_l, F_h, [dstend, -64]
> - stp G_l, G_h, [dstend, -48]
> + ldr D_q, [src, 32]
> + ldr E_q, [src, 48]
> + str D_q, [dstin, 32]
> + str E_q, [dstin, 48]
> + ldr F_q, [srcend, -64]
> + ldr G_q, [srcend, -48]
> + str F_q, [dstend, -64]
> + str G_q, [dstend, -48]
> 1:
> - stp A_l, A_h, [dstin, 16]
> - stp B_l, B_h, [dstend, -32]
> - stp C_l, C_h, [dstend, -16]
> + str A_q, [dstin, 16]
> + str B_q, [dstend, -32]
> + str C_q, [dstend, -16]
> ret
>
> .p2align 4
> @@ -122,44 +116,44 @@ L(copy32):
> /* 16-32 */
> cmp count, 16
> b.lo 1f
> - ldp A_l, A_h, [src]
> - ldp B_l, B_h, [srcend, -16]
> - stp A_l, A_h, [dstin]
> - stp B_l, B_h, [dstend, -16]
> + ldr A_q, [src]
> + ldr B_q, [srcend, -16]
> + str A_q, [dstin]
> + str B_q, [dstend, -16]
> ret
> .p2align 4
> 1:
> /* 8-15 */
> tbz count, 3, 1f
> - ldr A_l, [src]
> - ldr B_l, [srcend, -8]
> - str A_l, [dstin]
> - str B_l, [dstend, -8]
> + ldr A_x, [src]
> + ldr B_x, [srcend, -8]
> + str A_x, [dstin]
> + str B_x, [dstend, -8]
> ret
> .p2align 4
> 1:
> /* 4-7 */
> tbz count, 2, 1f
> - ldr A_lw, [src]
> - ldr B_lw, [srcend, -4]
> - str A_lw, [dstin]
> - str B_lw, [dstend, -4]
> + ldr A_w, [src]
> + ldr B_w, [srcend, -4]
> + str A_w, [dstin]
> + str B_w, [dstend, -4]
> ret
> .p2align 4
> 1:
> /* 2-3 */
> tbz count, 1, 1f
> - ldrh A_lw, [src]
> - ldrh B_lw, [srcend, -2]
> - strh A_lw, [dstin]
> - strh B_lw, [dstend, -2]
> + ldrh A_w, [src]
> + ldrh B_w, [srcend, -2]
> + strh A_w, [dstin]
> + strh B_w, [dstend, -2]
> ret
> .p2align 4
> 1:
> /* 0-1 */
> tbz count, 0, 1f
> - ldrb A_lw, [src]
> - strb A_lw, [dstin]
> + ldrb A_w, [src]
> + strb A_w, [dstin]
> 1:
> ret
>
> @@ -178,30 +172,29 @@ L(copy_long):
> add count, count, tmp1
>
> L(loop64):
> - ldp A_l, A_h, [src, 16]!
> - stnp A_l, A_h, [dst, 16]
> - ldp A_l, A_h, [src, 16]!
> + ldr A_q, [src, 16]!
> + str A_q, [dst, 16]
> + ldr A_q, [src, 16]!
> subs count, count, 64
> - stnp A_l, A_h, [dst, 32]
> - ldp A_l, A_h, [src, 16]!
> - stnp A_l, A_h, [dst, 48]
> - ldp A_l, A_h, [src, 16]!
> - stnp A_l, A_h, [dst, 64]
> - add dst, dst, 64
> + str A_q, [dst, 32]
> + ldr A_q, [src, 16]!
> + str A_q, [dst, 48]
> + ldr A_q, [src, 16]!
> + str A_q, [dst, 64]!
> b.hi L(loop64)
>
> /* Write the last full set of 64 bytes. The remainder is at most 64
> bytes, so it is safe to always copy 64 bytes from the end even if
> there is just 1 byte left. */
> L(last64):
> - ldp A_l, A_h, [srcend, -64]
> - stnp A_l, A_h, [dstend, -64]
> - ldp B_l, B_h, [srcend, -48]
> - stnp B_l, B_h, [dstend, -48]
> - ldp C_l, C_h, [srcend, -32]
> - stnp C_l, C_h, [dstend, -32]
> - ldp D_l, D_h, [srcend, -16]
> - stnp D_l, D_h, [dstend, -16]
> + ldr E_q, [srcend, -64]
> + str E_q, [dstend, -64]
> + ldr D_q, [srcend, -48]
> + str D_q, [dstend, -48]
> + ldr C_q, [srcend, -32]
> + str C_q, [dstend, -32]
> + ldr B_q, [srcend, -16]
> + str B_q, [dstend, -16]
> ret
>
> END (__memcpy_falkor)
>
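As the comment block in the patch notes, copies are split into small,
medium, and large cases, with the head and tail of the buffer copied
unconditionally so that overlapping stores absorb any remainder.  A
distilled, standalone sketch of that trick for the 16-32 byte case
(assuming x0 = dst, x1 = src, x2 = count; illustrative only, not the
actual glibc code):

	add	x4, x1, x2		/* srcend = src + count  */
	add	x5, x0, x2		/* dstend = dst + count  */
	ldr	q0, [x1]		/* first 16 bytes        */
	ldr	q1, [x4, -16]		/* last 16 bytes         */
	str	q0, [x0]
	str	q1, [x5, -16]		/* overlaps the first store when count < 32 */

The same idea is what lets L(last64) copy the final 64 bytes from
srcend/dstend even when only one byte of remainder is left after the
main loop.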