[PATCH v3 1/5] AArch64: Improve A64FX memset
naohirot@fujitsu.com
naohirot@fujitsu.com
Mon Aug 2 13:53:23 GMT 2021
Hi Wilco,
I have one question below.
> -----Original Message-----
> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Wednesday, July 28, 2021 5:11 PM
> To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Cc: 'GNU C Library' <libc-alpha@sourceware.org>
> Subject: Re: [PATCH v3 1/5] AArch64: Improve A64FX memset
>
> Hi Wilco,
>
> Thanks for the patch.
>
> I confirmed that the performance is improved compared to the master, as
> shown in the graphs [1].
> I have two comments; please find them below.
>
> Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
> Tested-by: Naohiro Tamura <naohirot@fujitsu.com>
>
> [1] https://drive.google.com/file/d/1DfYPMd6RRS0Z_2y3VH3Q4b-r8N6TyW1c/view?usp=sharing
>
> > [PATCH v3 1/5] AArch64: Improve A64FX memset
> >
>
> Would you update the commit title so as not to be the same among 5
> patches?
> Because we need to ask distro to backport these patches.
> If all commit titles are the same, it will increase the room for
> confusion and mistakes.
>
> How about "AArch64: Improve A64FX memset for less than 512B" ?
>
> > Improve performance of small copies by reducing instruction counts and improving
> > alignment. Bench-memset shows 35-45% performance gain for small sizes.
> >
> > ---
> >
> > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644
> > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > @@ -30,7 +30,6 @@
> > #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB
> > #define CACHE_LINE_SIZE 256
> > #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1
> > -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance
>
> This caused compile error.
>
> > #define rest x8
> > #define vector_length x9
> > #define vl_remainder x10 // vector_length remainder
> > @@ -51,78 +50,54 @@
> > .endm
> >
> > .macro st1b_unroll first=0, last=7
> > - st1b z0.b, p0, [dst, #\first, mul vl]
> > + st1b z0.b, p0, [dst, \first, mul vl]
> > .if \last-\first
> > st1b_unroll "(\first+1)", \last
> > .endif
> > .endm
> >
> > - .macro shortcut_for_small_size exit
> > - // if rest <= vector_length * 2
> > - whilelo p0.b, xzr, count
> > - whilelo p1.b, vector_length, count
> > - b.last 1f
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - ret
> > -1: // if rest > vector_length * 8
> > - cmp count, vector_length, lsl 3 // vector_length * 8
> > - b.hi \exit
> > - // if rest <= vector_length * 4
> > - lsl tmp1, vector_length, 1 // vector_length * 2
> > - whilelo p2.b, tmp1, count
> > - incb tmp1
> > - whilelo p3.b, tmp1, count
> > - b.last 1f
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - st1b z0.b, p2, [dstin, #2, mul vl]
> > - st1b z0.b, p3, [dstin, #3, mul vl]
> > - ret
> > -1: // if rest <= vector_length * 8
> > - lsl tmp1, vector_length, 2 // vector_length * 4
> > - whilelo p4.b, tmp1, count
> > - incb tmp1
> > - whilelo p5.b, tmp1, count
> > - b.last 1f
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - st1b z0.b, p2, [dstin, #2, mul vl]
> > - st1b z0.b, p3, [dstin, #3, mul vl]
> > - st1b z0.b, p4, [dstin, #4, mul vl]
> > - st1b z0.b, p5, [dstin, #5, mul vl]
> > - ret
> > -1: lsl tmp1, vector_length, 2 // vector_length * 4
> > - incb tmp1 // vector_length * 5
> > - incb tmp1 // vector_length * 6
> > - whilelo p6.b, tmp1, count
> > - incb tmp1
> > - whilelo p7.b, tmp1, count
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - st1b z0.b, p2, [dstin, #2, mul vl]
> > - st1b z0.b, p3, [dstin, #3, mul vl]
> > - st1b z0.b, p4, [dstin, #4, mul vl]
> > - st1b z0.b, p5, [dstin, #5, mul vl]
> > - st1b z0.b, p6, [dstin, #6, mul vl]
> > - st1b z0.b, p7, [dstin, #7, mul vl]
> > - ret
> > - .endm
> >
> > -ENTRY (MEMSET)
> > +#undef BTI_C
> > +#define BTI_C
We discussed how the BTI_C macro should be defined before; at that time the
conclusion was that it should be a "NOP" rather than empty unless
HAVE_AARCH64_BTI is defined.
Now the above code defines BTI_C as empty unconditionally.
A64FX doesn't support BTI, so this code is OK.
But I'm just interested in the reason why it was changed.
Thanks.
Naohiro
> >
> > +ENTRY (MEMSET)
> > PTR_ARG (0)
> > SIZE_ARG (2)
> >
> > - cbnz count, 1f
> > - ret
> > -1: dup z0.b, valw
> > cntb vector_length
> > - // shortcut for less than vector_length * 8
> > - // gives a free ptrue to p0.b for n >= vector_length
> > - shortcut_for_small_size L(vl_agnostic)
> > - // end of shortcut
> > + dup z0.b, valw
> > + whilelo p0.b, vector_length, count
> > + b.last 1f
> > + whilelo p1.b, xzr, count
> > + st1b z0.b, p1, [dstin, 0, mul vl]
> > + st1b z0.b, p0, [dstin, 1, mul vl]
> > + ret
> > +
> > + // count >= vector_length * 2
> > +1: cmp count, vector_length, lsl 2
> > + add dstend, dstin, count
> > + b.hi 1f
> > + st1b z0.b, p0, [dstin, 0, mul vl]
> > + st1b z0.b, p0, [dstin, 1, mul vl]
> > + st1b z0.b, p0, [dstend, -2, mul vl]
> > + st1b z0.b, p0, [dstend, -1, mul vl]
> > + ret
> > +
> > + // count > vector_length * 4
> > +1: lsl tmp1, vector_length, 3
> > + cmp count, tmp1
> > + b.hi L(vl_agnostic)
> > + st1b z0.b, p0, [dstin, 0, mul vl]
> > + st1b z0.b, p0, [dstin, 1, mul vl]
> > + st1b z0.b, p0, [dstin, 2, mul vl]
> > + st1b z0.b, p0, [dstin, 3, mul vl]
> > + st1b z0.b, p0, [dstend, -4, mul vl]
> > + st1b z0.b, p0, [dstend, -3, mul vl]
> > + st1b z0.b, p0, [dstend, -2, mul vl]
> > + st1b z0.b, p0, [dstend, -1, mul vl]
> > + ret
> >
> > + .p2align 4
> > L(vl_agnostic): // VL Agnostic
> > mov rest, count
> > mov dst, dstin
> >
More information about the Libc-alpha
mailing list