[PATCH v3 1/2] Aarch64: Add memcpy for qualcomm's oryon-1 core
Andreas K. Huettel
dilfridge@gentoo.org
Sun Jun 30 11:49:56 GMT 2024
Thank you. Both patches committed.
On Friday, 28 June 2024, 21:08:38 CEST, Adhemerval Zanella Netto wrote:
>
> On 12/06/24 19:53, Andrew Pinski wrote:
> > Qualcomm's new core (oryon-1) has different performance characteristics
> > than other cores. For memcpy, it is faster to use the GPRs to
> > do the copy for large sizes (2x faster). For even larger sizes,
> > it is better to use the nontemporal load/store instructions so
> > we don't pollute the L1/L2 caches.
> >
> > For smaller sizes, the characteristics are very similar to
> > those of other cores.
> > I used the thunderx memcpy as a starting point and expanded from there.
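> >
> > As a rough C sketch of the resulting size tiers (illustration only;
> > the helper names are hypothetical, and the thresholds are the cmp
> > values in memcpy_oryon1.S):
> >
> >   #include <stddef.h>
> >
> >   /* Hypothetical stand-ins for the labelled assembly blocks.  */
> >   extern void *copy_small (void *, const void *, size_t);
> >   extern void *copy_medium (void *, const void *, size_t);
> >   extern void *copy_loop64 (void *, const void *, size_t);
> >   extern void *copy_loop64_nontemp (void *, const void *, size_t);
> >
> >   void *memcpy_oryon1_model (void *dst, const void *src, size_t n)
> >   {
> >     if (n <= 16)
> >       return copy_small (dst, src, n);    /* overlapping loads/stores */
> >     if (n <= 96)
> >       return copy_medium (dst, src, n);   /* fully unrolled ldp/stp */
> >     if (n < 32768)
> >       return copy_loop64 (dst, src, n);   /* 64-byte ldp/stp loop */
> >     /* ldnp/stnp to avoid polluting the L1/L2 caches.  */
> >     return copy_loop64_nontemp (dst, src, n);
> >   }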
> >
> > Changes since v1:
> > * v2: Fix ordering in Makefile.
> > * v3: Fix comment grammar about the ldnp/stnp instructions.
> >
> > Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
>
> LGTM, thanks.
>
> Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
>
> > ---
> >  sysdeps/aarch64/cpu-features.h              |   6 +
> >  sysdeps/aarch64/multiarch/Makefile          |   1 +
> >  sysdeps/aarch64/multiarch/ifunc-impl-list.c |   3 +
> >  sysdeps/aarch64/multiarch/memcpy.c          |   5 +
> >  sysdeps/aarch64/multiarch/memcpy_oryon1.S   | 301 ++++++++++++++++++++
> >  5 files changed, 316 insertions(+)
> > create mode 100644 sysdeps/aarch64/multiarch/memcpy_oryon1.S
> >
> > diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
> > index 31782b66f9..bc8d842238 100644
> > --- a/sysdeps/aarch64/cpu-features.h
> > +++ b/sysdeps/aarch64/cpu-features.h
> > @@ -1,6 +1,7 @@
> > /* Initialize CPU feature data. AArch64 version.
> > This file is part of the GNU C Library.
> > Copyright (C) 2017-2024 Free Software Foundation, Inc.
> > + Copyright The GNU Toolchain Authors.
> >
> > The GNU C Library is free software; you can redistribute it and/or
> > modify it under the terms of the GNU Lesser General Public
> > @@ -56,6 +57,11 @@
> > #define IS_A64FX(midr) (MIDR_IMPLEMENTOR(midr) == 'F' \
> > && MIDR_PARTNUM(midr) == 0x001)
> >
> > +#define IS_ORYON1(midr) (MIDR_IMPLEMENTOR(midr) == 'Q' \
> > + && (MIDR_PARTNUM(midr) == 0x001 \
> > + || (MIDR_PARTNUM(midr) == 0x002 \
> > + && MIDR_VARIANT(midr) == 0)))
> > +
> > struct cpu_features
> > {
> > uint64_t midr_el1;
> > diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> > index e4720b7468..ef5ea9ab8c 100644
> > --- a/sysdeps/aarch64/multiarch/Makefile
> > +++ b/sysdeps/aarch64/multiarch/Makefile
> > @@ -5,6 +5,7 @@ sysdep_routines += \
> > memcpy_a64fx \
> > memcpy_generic \
> > memcpy_mops \
> > + memcpy_oryon1 \
> > memcpy_sve \
> > memcpy_thunderx \
> > memcpy_thunderx2 \
> > diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> > index ecd0f87de6..65c56b9b41 100644
> > --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> > @@ -1,5 +1,6 @@
> > /* Enumerate available IFUNC implementations of a function. AARCH64 version.
> > Copyright (C) 2017-2024 Free Software Foundation, Inc.
> > + Copyright The GNU Toolchain Authors.
> > This file is part of the GNU C Library.
> >
> > The GNU C Library is free software; you can redistribute it and/or
> > @@ -35,6 +36,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > /* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c. */
> > IFUNC_IMPL (i, name, memcpy,
> > IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
> > + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_oryon1)
> > IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
> > #if HAVE_AARCH64_SVE_ASM
> > IFUNC_IMPL_ADD (array, i, memcpy, sve && !bti, __memcpy_a64fx)
> > @@ -44,6 +46,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
> > IFUNC_IMPL (i, name, memmove,
> > IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
> > + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_oryon1)
> > IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
> > #if HAVE_AARCH64_SVE_ASM
> > IFUNC_IMPL_ADD (array, i, memmove, sve && !bti, __memmove_a64fx)
> > diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> > index ce53567dab..15c954778b 100644
> > --- a/sysdeps/aarch64/multiarch/memcpy.c
> > +++ b/sysdeps/aarch64/multiarch/memcpy.c
> > @@ -1,5 +1,6 @@
> > /* Multiple versions of memcpy. AARCH64 version.
> > Copyright (C) 2017-2024 Free Software Foundation, Inc.
> > + Copyright The GNU Toolchain Authors.
> > This file is part of the GNU C Library.
> >
> > The GNU C Library is free software; you can redistribute it and/or
> > @@ -34,6 +35,7 @@ extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
> > extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
> > extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
> > extern __typeof (__redirect_memcpy) __memcpy_mops attribute_hidden;
> > +extern __typeof (__redirect_memcpy) __memcpy_oryon1 attribute_hidden;
> >
> > static inline __typeof (__redirect_memcpy) *
> > select_memcpy_ifunc (void)
> > @@ -50,6 +52,9 @@ select_memcpy_ifunc (void)
> > return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;
> > }
> >
> > + if (IS_ORYON1 (midr))
> > + return __memcpy_oryon1;
> > +
> > if (IS_THUNDERX (midr))
> > return __memcpy_thunderx;
> >
> > diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
> > new file mode 100644
> > index 0000000000..4efc43df28
> > --- /dev/null
> > +++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
> > @@ -0,0 +1,301 @@
> > +/* An oryon-1 core optimized memcpy implementation for AARCH64.
> > + Copyright (C) 2017-2024 Free Software Foundation, Inc.
> > + Copyright The GNU Toolchain Authors.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#include <sysdep.h>
> > +
> > +/* Assumptions:
> > + *
> > + * ARMv8-a, AArch64, unaligned accesses.
> > + *
> > + */
> > +
> > +#define dstin x0
> > +#define src x1
> > +#define count x2
> > +#define dst x3
> > +#define srcend x4
> > +#define dstend x5
> > +#define A_l x6
> > +#define A_lw w6
> > +#define A_h x7
> > +#define A_hw w7
> > +#define B_l x8
> > +#define B_lw w8
> > +#define B_h x9
> > +#define C_l x10
> > +#define C_h x11
> > +#define D_l x12
> > +#define D_h x13
> > +#define E_l src
> > +#define E_h count
> > +#define F_l srcend
> > +#define F_h dst
> > +#define G_l count
> > +#define G_h dst
> > +#define tmp1 x14
> > +
> > +/* Copies are split into 3 main cases: small copies of up to 16 bytes;
> > +   medium copies of 17..96 bytes, which are fully unrolled; and large
> > +   copies of more than 96 bytes, which align the destination and use an
> > +   unrolled loop processing 64 bytes per iteration.
> > + In order to share code with memmove, small and medium copies read all
> > + data before writing, allowing any kind of overlap. So small, medium
> > + and large backwards memmoves are handled by falling through into memcpy.
> > + Overlapping large forward memmoves use a loop that copies backwards.
> > +*/
> > +
> > +ENTRY (__memmove_oryon1)
> > +
> > + PTR_ARG (0)
> > + PTR_ARG (1)
> > + SIZE_ARG (2)
> > +
> > + sub tmp1, dstin, src
> > + cmp count, 96
> > + ccmp tmp1, count, 2, hi
> > + b.lo L(move_long)
> > +
> > + /* Common case falls through into memcpy. */
> > +END (__memmove_oryon1)
> > +
> > +ENTRY (__memcpy_oryon1)
> > +
> > + PTR_ARG (0)
> > + PTR_ARG (1)
> > + SIZE_ARG (2)
> > +
> > + add srcend, src, count
> > + add dstend, dstin, count
> > + cmp count, 16
> > + b.ls L(copy16)
> > + cmp count, 96
> > + b.hi L(copy_long)
> > +
> > + /* Medium copies: 17..96 bytes. */
> > + sub tmp1, count, 1
> > + ldp A_l, A_h, [src]
> > + tbnz tmp1, 6, L(copy96)
> > + ldp D_l, D_h, [srcend, -16]
> > + tbz tmp1, 5, 1f
> > + ldp B_l, B_h, [src, 16]
> > + ldp C_l, C_h, [srcend, -32]
> > + stp B_l, B_h, [dstin, 16]
> > + stp C_l, C_h, [dstend, -32]
> > +1:
> > + stp A_l, A_h, [dstin]
> > + stp D_l, D_h, [dstend, -16]
> > + ret
> > +
> > + .p2align 6
> > + /* Small copies: 0..16 bytes. */
> > +L(copy16):
> > + cmp count, 8
> > + b.lo 1f
> > + ldr A_l, [src]
> > + ldr A_h, [srcend, -8]
> > + str A_l, [dstin]
> > + str A_h, [dstend, -8]
> > + ret
> > + .p2align 6
> > +1:
> > + tbz count, 2, 1f
> > + ldr A_lw, [src]
> > + ldr A_hw, [srcend, -4]
> > + str A_lw, [dstin]
> > + str A_hw, [dstend, -4]
> > + ret
> > +
> > + /* Copy 0..3 bytes. Use a branchless sequence that copies the same
> > + byte 3 times if count==1, or the 2nd byte twice if count==2. */
> > +1:
> > + cbz count, 2f
> > + lsr tmp1, count, 1
> > + ldrb A_lw, [src]
> > + ldrb A_hw, [srcend, -1]
> > + ldrb B_lw, [src, tmp1]
> > + strb A_lw, [dstin]
> > + strb B_lw, [dstin, tmp1]
> > + strb A_hw, [dstend, -1]
> > +2: ret
> > +
> > + .p2align 6
> > + /* Copy 64..96 bytes. Copy 64 bytes from the start and
> > + 32 bytes from the end. */
> > +L(copy96):
> > + ldp B_l, B_h, [src, 16]
> > + ldp C_l, C_h, [src, 32]
> > + ldp D_l, D_h, [src, 48]
> > + ldp E_l, E_h, [srcend, -32]
> > + ldp F_l, F_h, [srcend, -16]
> > + stp A_l, A_h, [dstin]
> > + stp B_l, B_h, [dstin, 16]
> > + stp C_l, C_h, [dstin, 32]
> > + stp D_l, D_h, [dstin, 48]
> > + stp E_l, E_h, [dstend, -32]
> > + stp F_l, F_h, [dstend, -16]
> > + ret
> > +
> > + /* Align DST to 16 byte alignment so that we don't cross cache line
> > + boundaries on both loads and stores. There are at least 96 bytes
> > + to copy, so copy 16 bytes unaligned and then align. The loop
> > + copies 64 bytes per iteration and prefetches one iteration ahead. */
> > +
> > + .p2align 6
> > +L(copy_long):
> > +
> > +	/* On oryon1 cores, large memcpys benefit from using ldnp/stnp.
> > +	   This loop is identical to the one below it except that it uses
> > +	   the ldnp/stnp instructions.  For copies of less than 32768 bytes,
> > +	   the ldnp/stnp instructions do not help and cause a slowdown, so
> > +	   use the ldnp/stnp loop only for the largest sizes.  */
> > +
> > + cmp count, #32768
> > + b.lo L(copy_long_without_nontemp)
> > + and tmp1, dstin, 15
> > + bic dst, dstin, 15
> > + ldnp D_l, D_h, [src]
> > + sub src, src, tmp1
> > + add count, count, tmp1 /* Count is now 16 too large. */
> > + ldnp A_l, A_h, [src, 16]
> > + stnp D_l, D_h, [dstin]
> > + ldnp B_l, B_h, [src, 32]
> > + ldnp C_l, C_h, [src, 48]
> > + ldnp D_l, D_h, [src, 64]
> > + add src, src, #64
> > + subs count, count, 128 + 16 /* Test and readjust count. */
> > +
> > +L(nontemp_loop64):
> > + tbz src, #6, 1f
> > +1:
> > + stnp A_l, A_h, [dst, 16]
> > + ldnp A_l, A_h, [src, 16]
> > + stnp B_l, B_h, [dst, 32]
> > + ldnp B_l, B_h, [src, 32]
> > + stnp C_l, C_h, [dst, 48]
> > + ldnp C_l, C_h, [src, 48]
> > + stnp D_l, D_h, [dst, 64]
> > + ldnp D_l, D_h, [src, 64]
> > + add src, src, #64
> > + add dst, dst, #64
> > + subs count, count, 64
> > + b.hi L(nontemp_loop64)
> > + b L(last64)
> > +
> > +L(copy_long_without_nontemp):
> > +
> > + and tmp1, dstin, 15
> > + bic dst, dstin, 15
> > + ldp D_l, D_h, [src]
> > + sub src, src, tmp1
> > + add count, count, tmp1 /* Count is now 16 too large. */
> > + ldp A_l, A_h, [src, 16]
> > + stp D_l, D_h, [dstin]
> > + ldp B_l, B_h, [src, 32]
> > + ldp C_l, C_h, [src, 48]
> > + ldp D_l, D_h, [src, 64]!
> > + subs count, count, 128 + 16 /* Test and readjust count. */
> > + b.ls L(last64)
> > +L(loop64):
> > + stp A_l, A_h, [dst, 16]
> > + ldp A_l, A_h, [src, 16]
> > + stp B_l, B_h, [dst, 32]
> > + ldp B_l, B_h, [src, 32]
> > + stp C_l, C_h, [dst, 48]
> > + ldp C_l, C_h, [src, 48]
> > + stp D_l, D_h, [dst, 64]!
> > + ldp D_l, D_h, [src, 64]!
> > + subs count, count, 64
> > + b.hi L(loop64)
> > +
> > + /* Write the last full set of 64 bytes. The remainder is at most 64
> > + bytes, so it is safe to always copy 64 bytes from the end even if
> > + there is just 1 byte left. */
> > +L(last64):
> > + ldp E_l, E_h, [srcend, -64]
> > + stp A_l, A_h, [dst, 16]
> > + ldp A_l, A_h, [srcend, -48]
> > + stp B_l, B_h, [dst, 32]
> > + ldp B_l, B_h, [srcend, -32]
> > + stp C_l, C_h, [dst, 48]
> > + ldp C_l, C_h, [srcend, -16]
> > + stp D_l, D_h, [dst, 64]
> > + stp E_l, E_h, [dstend, -64]
> > + stp A_l, A_h, [dstend, -48]
> > + stp B_l, B_h, [dstend, -32]
> > + stp C_l, C_h, [dstend, -16]
> > + ret
> > +
> > + .p2align 6
> > +L(move_long):
> > + cbz tmp1, 3f
> > +
> > + add srcend, src, count
> > + add dstend, dstin, count
> > +
> > + /* Align dstend to 16 byte alignment so that we don't cross cache line
> > + boundaries on both loads and stores. There are at least 96 bytes
> > + to copy, so copy 16 bytes unaligned and then align. The loop
> > + copies 64 bytes per iteration and prefetches one iteration ahead. */
> > +
> > + and tmp1, dstend, 15
> > + ldp D_l, D_h, [srcend, -16]
> > + sub srcend, srcend, tmp1
> > + sub count, count, tmp1
> > + ldp A_l, A_h, [srcend, -16]
> > + stp D_l, D_h, [dstend, -16]
> > + ldp B_l, B_h, [srcend, -32]
> > + ldp C_l, C_h, [srcend, -48]
> > + ldp D_l, D_h, [srcend, -64]!
> > + sub dstend, dstend, tmp1
> > + subs count, count, 128
> > + b.ls 2f
> > +
> > + nop
> > +1:
> > + stp A_l, A_h, [dstend, -16]
> > + ldp A_l, A_h, [srcend, -16]
> > + stp B_l, B_h, [dstend, -32]
> > + ldp B_l, B_h, [srcend, -32]
> > + stp C_l, C_h, [dstend, -48]
> > + ldp C_l, C_h, [srcend, -48]
> > + stp D_l, D_h, [dstend, -64]!
> > + ldp D_l, D_h, [srcend, -64]!
> > + subs count, count, 64
> > + b.hi 1b
> > +
> > + /* Write the last full set of 64 bytes. The remainder is at most 64
> > + bytes, so it is safe to always copy 64 bytes from the start even if
> > + there is just 1 byte left. */
> > +2:
> > + ldp G_l, G_h, [src, 48]
> > + stp A_l, A_h, [dstend, -16]
> > + ldp A_l, A_h, [src, 32]
> > + stp B_l, B_h, [dstend, -32]
> > + ldp B_l, B_h, [src, 16]
> > + stp C_l, C_h, [dstend, -48]
> > + ldp C_l, C_h, [src]
> > + stp D_l, D_h, [dstend, -64]
> > + stp G_l, G_h, [dstin, 48]
> > + stp A_l, A_h, [dstin, 32]
> > + stp B_l, B_h, [dstin, 16]
> > + stp C_l, C_h, [dstin]
> > +3: ret
> > +
> > +END (__memcpy_oryon1)
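> >
> > For reference, a C model of the branchless 0..3 byte tail under
> > L(copy16) (a sketch, not code from the patch): all loads happen
> > before any store, so overlapping memmove inputs are safe; for
> > count == 1 all three stores hit byte 0, and for count == 2 the
> > middle store simply rewrites byte 1.
> >
> >   #include <stddef.h>
> >
> >   static void copy_0_3 (unsigned char *d, const unsigned char *s,
> >                         size_t count)
> >   {
> >     if (count == 0)                 /* cbz count, 2f */
> >       return;
> >     size_t mid = count >> 1;        /* lsr tmp1, count, 1 */
> >     unsigned char a = s[0];         /* ldrb A_lw, [src] */
> >     unsigned char c = s[count - 1]; /* ldrb A_hw, [srcend, -1] */
> >     unsigned char b = s[mid];       /* ldrb B_lw, [src, tmp1] */
> >     d[0] = a;                       /* strb A_lw, [dstin] */
> >     d[mid] = b;                     /* strb B_lw, [dstin, tmp1] */
> >     d[count - 1] = c;               /* strb A_hw, [dstend, -1] */
> >   }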
>
--
Andreas K. Hüttel
dilfridge@gentoo.org
Gentoo Linux developer
(council, comrel, toolchain, base-system, perl, libreoffice)
https://wiki.gentoo.org/wiki/User:Dilfridge