[PATCH v3 2/5] AArch64: Improve A64FX memset
naohirot@fujitsu.com
Mon Aug 2 13:29:21 GMT 2021
Hi Wilco,
Thank you for the patch.
I confirmed that the performance is improved over master, as shown
in the graphs [1][2].
Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
Tested-by: Naohiro Tamura <naohirot@fujitsu.com>
[1] https://drive.google.com/file/d/1RxdIlJa2Wvl8eT5_TRVkvbkyS6bfZObx/view?usp=sharing
[2] https://drive.google.com/file/d/1xCLsa7qweovdQpWtfnNZZwcEi7W7z3Ok/view?usp=sharing
I have one comment and one question below.
> Subject: [PATCH v3 2/5] AArch64: Improve A64FX memset
How about "AArch64: Improve A64FX memset for more than 8MB"?
>
> Improve performance of large memsets. Simplify the alignment code. For zero memsets use DC ZVA,
> which almost doubles performance. For non-zero memsets use the unroll8 loop, which is about 10% faster.
>
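For reference, the new zero-fill path boils down to something like this
minimal C sketch (not the actual assembly; it assumes a 256-byte DC ZVA
block as on A64FX, with alignment and the tail already handled):

#include <stddef.h>

/* Hedged sketch of the DC ZVA zero-fill loop (GCC inline asm, AArch64).  */
static void
zerofill_zva (char *dst, size_t len)
{
  /* dst must be cache-line aligned and len a multiple of 256.  */
  for (; len >= 256; dst += 256, len -= 256)
    /* DC ZVA zeroes a whole cache line without reading it from
       memory first, so no fill data flows through the store pipe.  */
    asm volatile ("dc zva, %0" :: "r" (dst) : "memory");
}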
> ---
>
> diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> index f7fcc7b323e1553f50a2e005b8ccef344a08127d..608e0e2e2ff5259178e2fdadf1eea8816194d879 100644
> --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> @@ -30,10 +30,8 @@
> #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB
> #define CACHE_LINE_SIZE 256
> #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1
> -#define rest x8
> +#define rest x2
> #define vector_length x9
> -#define vl_remainder x10 // vector_length remainder
> -#define cl_remainder x11 // CACHE_LINE_SIZE remainder
>
> #if HAVE_AARCH64_SVE_ASM
> # if IS_IN (libc)
> @@ -41,14 +39,6 @@
>
> .arch armv8.2-a+sve
>
> - .macro dc_zva times
> - dc zva, tmp1
> - add tmp1, tmp1, CACHE_LINE_SIZE
> - .if \times-1
> - dc_zva "(\times-1)"
> - .endif
> - .endm
> -
> .macro st1b_unroll first=0, last=7
> st1b z0.b, p0, [dst, \first, mul vl]
> .if \last-\first
> @@ -187,54 +177,29 @@ L(L1_prefetch): // if rest >= L1_SIZE
> cbnz rest, L(unroll32)
> ret
>
> + // count >= L2_SIZE
> L(L2):
> - // align dst address at vector_length byte boundary
> - sub tmp1, vector_length, 1
> - ands tmp2, dst, tmp1
> - // if vl_remainder == 0
> - b.eq 1f
> - sub vl_remainder, vector_length, tmp2
> - // process remainder until the first vector_length boundary
> - whilelt p2.b, xzr, vl_remainder
> - st1b z0.b, p2, [dst]
> - add dst, dst, vl_remainder
> - sub rest, rest, vl_remainder
> - // align dstin address at CACHE_LINE_SIZE byte boundary
> -1: mov tmp1, CACHE_LINE_SIZE
> - ands tmp2, dst, CACHE_LINE_SIZE - 1
> - // if cl_remainder == 0
> - b.eq L(L2_dc_zva)
> - sub cl_remainder, tmp1, tmp2
> - // process remainder until the first CACHE_LINE_SIZE boundary
> - mov tmp1, xzr // index
> -2: whilelt p2.b, tmp1, cl_remainder
> - st1b z0.b, p2, [dst, tmp1]
> - incb tmp1
> - cmp tmp1, cl_remainder
> - b.lo 2b
> - add dst, dst, cl_remainder
> - sub rest, rest, cl_remainder
> -
> -L(L2_dc_zva):
> - // zero fill
> - mov tmp1, dst
> - dc_zva (ZF_DIST / CACHE_LINE_SIZE) - 1
> - mov zva_len, ZF_DIST
> - add tmp1, zva_len, CACHE_LINE_SIZE * 2
> - // unroll
> - .p2align 3
> -1: st1b_unroll 0, 3
> - add tmp2, dst, zva_len
> - dc zva, tmp2
> - st1b_unroll 4, 7
> - add tmp2, tmp2, CACHE_LINE_SIZE
> - dc zva, tmp2
> - add dst, dst, CACHE_LINE_SIZE * 2
> - sub rest, rest, CACHE_LINE_SIZE * 2
> - cmp rest, tmp1 // ZF_DIST + CACHE_LINE_SIZE * 2
> - b.ge 1b
> - cbnz rest, L(unroll8)
> - ret
> + tst valw, 255
> + b.ne L(unroll8)
> + // align dst to CACHE_LINE_SIZE byte boundary
> + and tmp2, dst, CACHE_LINE_SIZE - 1
> + sub tmp2, tmp2, CACHE_LINE_SIZE
> + st1b z0.b, p0, [dst, 0, mul vl]
> + st1b z0.b, p0, [dst, 1, mul vl]
> + st1b z0.b, p0, [dst, 2, mul vl]
> + st1b z0.b, p0, [dst, 3, mul vl]
> + sub dst, dst, tmp2
> + add count, count, tmp2
> +
> + // clear cachelines using DC ZVA
> + sub count, count, CACHE_LINE_SIZE
> + .p2align 4
> +1: dc zva, dst
With this patch, DC ZVA is used only if the buffer size is more than 8MB
and the fill data is zero. In the case of __memset_generic, DC ZVA is
used if the buffer size is more than 256B and the fill data is zero;
this was implemented by you [3].
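As I understand it, the generic code discovers the DC ZVA block size at
run time from DCZID_EL0. Roughly (a hedged C sketch of the field layout
from the Arm ARM, not the actual glibc code):

/* DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited; bits [3:0]
   hold log2 of the block size in 4-byte words.  */
static inline unsigned
zva_block_size (void)
{
  unsigned long dczid;
  asm ("mrs %0, dczid_el0" : "=r" (dczid));
  if (dczid & 16)
    return 0;                  /* DC ZVA not usable */
  return 4u << (dczid & 15);   /* 64 on many cores, 256 on A64FX */
}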
V3 patch 02 __memset_a64fx (green line) recorded performance very close
to __memset_generic (red line) for zero fill [4].
V3 patch 05 __memset_a64fx (green line) recorded almost the same
performance as __memset_generic (red line) for zero fill [5].
The X axis of graphs [4][5] runs from 256B to 64MB; the graphs were
created with the following command:
$ cat bench-memset-zerofill.out | \
> jq -r 'del(.functions.memset.results[] | select(.char2 != 0))' | \
> plot_strings.py -l -p thru -v -
So the DC ZVA implementations for __memset_generic and __memset_a64fx
each seem appropriate.
But comparing the non-zero fill graph [6] with the zero fill graph [4],
why is DC ZVA effective only above 8MB for __memset_a64fx, even though
it is effective from much smaller sizes for __memset_generic?
I still don't understand the DC ZVA behavior.
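One rough way to probe this on hardware might be a loop like the
following (a hypothetical, untested sketch; the buffer size, iteration
count, and 256-byte block are assumptions, not measured guidance):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

static double
secs (void)
{
  struct timespec ts;
  clock_gettime (CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int
main (int argc, char **argv)
{
  /* Size must be a multiple of 256 for aligned_alloc and DC ZVA.  */
  size_t len = argc > 1 ? strtoul (argv[1], NULL, 0) : (size_t) 8 << 20;
  char *buf = aligned_alloc (256, len);
  double t0 = secs ();
  for (int i = 0; i < 100; i++)
    for (char *p = buf; p < buf + len; p += 256)
      asm volatile ("dc zva, %0" :: "r" (p) : "memory");  /* zero fill */
  double t1 = secs ();
  for (int i = 0; i < 100; i++)
    memset (buf, 1, len);        /* non-zero fill: cannot use DC ZVA */
  double t2 = secs ();
  printf ("%zu bytes: dc zva %.3fs, memset(1) %.3fs\n",
          len, t1 - t0, t2 - t1);
  free (buf);
  return 0;
}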
[3] https://sourceware.org/git/?p=glibc.git;a=commit;f=sysdeps/aarch64/memset.S;h=a8c5a2a9521e105da6e96eaf4029b8e4d595e4f5
[4] https://drive.google.com/file/d/1f0_sTiujCcEZTfxbQ1UZdVwAvbMbP2ii/view?usp=sharing
[5] https://drive.google.com/file/d/1Wyp3GO-9ipcphwqOQOQ9a97EwFz90SPc/view?usp=sharing
[6] https://drive.google.com/file/d/1nZ_lfj6Kz5vFCR35O0q929SceUP-wbih/view?usp=sharing
Thanks.
Naohiro
> + add dst, dst, CACHE_LINE_SIZE
> + subs count, count, CACHE_LINE_SIZE
> + b.hi 1b
> + add count, count, CACHE_LINE_SIZE
> + b L(last)
>
> END (MEMSET)
> libc_hidden_builtin_def (MEMSET)