[PATCH v3 1/5] AArch64: Improve A64FX memset
naohirot@fujitsu.com
naohirot@fujitsu.com
Mon Aug 2 13:53:23 GMT 2021
Hi Wilco,
I have one question below.
> -----Original Message-----
> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Wednesday, July 28, 2021 5:11 PM
> To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Cc: 'GNU C Library' <libc-alpha@sourceware.org>
> Subject: Re: [PATCH v3 1/5] AArch64: Improve A64FX memset
>
> Hi Wilco,
>
> Thanks for the patch.
>
> I confirmed that the performance is improved compared to the master, as
> shown in the graphs [1].
> I have two comments; please find them below.
>
> Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
> Tested-by: Naohiro Tamura <naohirot@fujitsu.com>
>
> [1] https://drive.google.com/file/d/1DfYPMd6RRS0Z_2y3VH3Q4b-r8N6TyW1c/view?usp=sharing
>
> > [PATCH v3 1/5] AArch64: Improve A64FX memset
> >
>
> Would you update the commit title so as not to be the same among 5
> patches?
> Because we need to ask distro to backport these patches.
> If all commit titles are the same, it will increase the room for
> confusion and mistakes.
>
> How about "AArch64: Improve A64FX memset for less than 512B" ?
>
> > Improve performance of small copies by reducing instruction counts and improving
> > alignment. Bench-memset shows 35-45% performance gain for small sizes.
> >
> > ---
> >
> > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644
> > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > @@ -30,7 +30,6 @@
> > #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB
> > #define CACHE_LINE_SIZE 256
> > #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1
> > -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance
>
> This caused compile error.
>
> > #define rest x8
> > #define vector_length x9
> > #define vl_remainder x10 // vector_length remainder
> > @@ -51,78 +50,54 @@
> > .endm
> >
> > .macro st1b_unroll first=0, last=7
> > - st1b z0.b, p0, [dst, #\first, mul vl]
> > + st1b z0.b, p0, [dst, \first, mul vl]
> > .if \last-\first
> > st1b_unroll "(\first+1)", \last
> > .endif
> > .endm
> >
> > - .macro shortcut_for_small_size exit
> > - // if rest <= vector_length * 2
> > - whilelo p0.b, xzr, count
> > - whilelo p1.b, vector_length, count
> > - b.last 1f
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - ret
> > -1: // if rest > vector_length * 8
> > - cmp count, vector_length, lsl 3 // vector_length * 8
> > - b.hi \exit
> > - // if rest <= vector_length * 4
> > - lsl tmp1, vector_length, 1 // vector_length * 2
> > - whilelo p2.b, tmp1, count
> > - incb tmp1
> > - whilelo p3.b, tmp1, count
> > - b.last 1f
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - st1b z0.b, p2, [dstin, #2, mul vl]
> > - st1b z0.b, p3, [dstin, #3, mul vl]
> > - ret
> > -1: // if rest <= vector_length * 8
> > - lsl tmp1, vector_length, 2 // vector_length * 4
> > - whilelo p4.b, tmp1, count
> > - incb tmp1
> > - whilelo p5.b, tmp1, count
> > - b.last 1f
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - st1b z0.b, p2, [dstin, #2, mul vl]
> > - st1b z0.b, p3, [dstin, #3, mul vl]
> > - st1b z0.b, p4, [dstin, #4, mul vl]
> > - st1b z0.b, p5, [dstin, #5, mul vl]
> > - ret
> > -1: lsl tmp1, vector_length, 2 // vector_length * 4
> > - incb tmp1 // vector_length * 5
> > - incb tmp1 // vector_length * 6
> > - whilelo p6.b, tmp1, count
> > - incb tmp1
> > - whilelo p7.b, tmp1, count
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - st1b z0.b, p2, [dstin, #2, mul vl]
> > - st1b z0.b, p3, [dstin, #3, mul vl]
> > - st1b z0.b, p4, [dstin, #4, mul vl]
> > - st1b z0.b, p5, [dstin, #5, mul vl]
> > - st1b z0.b, p6, [dstin, #6, mul vl]
> > - st1b z0.b, p7, [dstin, #7, mul vl]
> > - ret
> > - .endm
> >
> > -ENTRY (MEMSET)
> > +#undef BTI_C
> > +#define BTI_C
We discussed how the BTI_C macro should be defined before; at that time the
conclusion was that it should be a "NOP" rather than empty unless
HAVE_AARCH64_BTI is defined.
Now the above code defines BTI_C as empty unconditionally.
A64FX doesn't support BTI, so this code is OK.
But I'm just interested in the reason why it was changed.
Thanks.
Naohiro
> >
> > +ENTRY (MEMSET)
> > PTR_ARG (0)
> > SIZE_ARG (2)
> >
> > - cbnz count, 1f
> > - ret
> > -1: dup z0.b, valw
> > cntb vector_length
> > - // shortcut for less than vector_length * 8
> > - // gives a free ptrue to p0.b for n >= vector_length
> > - shortcut_for_small_size L(vl_agnostic)
> > - // end of shortcut
> > + dup z0.b, valw
> > + whilelo p0.b, vector_length, count
> > + b.last 1f
> > + whilelo p1.b, xzr, count
> > + st1b z0.b, p1, [dstin, 0, mul vl]
> > + st1b z0.b, p0, [dstin, 1, mul vl]
> > + ret
> > +
> > + // count >= vector_length * 2
> > +1: cmp count, vector_length, lsl 2
> > + add dstend, dstin, count
> > + b.hi 1f
> > + st1b z0.b, p0, [dstin, 0, mul vl]
> > + st1b z0.b, p0, [dstin, 1, mul vl]
> > + st1b z0.b, p0, [dstend, -2, mul vl]
> > + st1b z0.b, p0, [dstend, -1, mul vl]
> > + ret
> > +
> > + // count > vector_length * 4
> > +1: lsl tmp1, vector_length, 3
> > + cmp count, tmp1
> > + b.hi L(vl_agnostic)
> > + st1b z0.b, p0, [dstin, 0, mul vl]
> > + st1b z0.b, p0, [dstin, 1, mul vl]
> > + st1b z0.b, p0, [dstin, 2, mul vl]
> > + st1b z0.b, p0, [dstin, 3, mul vl]
> > + st1b z0.b, p0, [dstend, -4, mul vl]
> > + st1b z0.b, p0, [dstend, -3, mul vl]
> > + st1b z0.b, p0, [dstend, -2, mul vl]
> > + st1b z0.b, p0, [dstend, -1, mul vl]
> > + ret
> >
> > + .p2align 4
> > L(vl_agnostic): // VL Agnostic
> > mov rest, count
> > mov dst, dstin
> >
More information about the Libc-alpha
mailing list