[PATCH v3 1/2] Aarch64: Add memcpy for qualcomm's oryon-1 core
Andreas K. Huettel
dilfridge@gentoo.org
Sun Jun 30 11:49:56 GMT 2024
Thank you. Both patches committed.
On Friday, 28 June 2024, 21:08:38 CEST, Adhemerval Zanella Netto wrote:
>
> On 12/06/24 19:53, Andrew Pinski wrote:
> > Qualcomm's new core (oryon-1) has different performance characteristics
> > than other cores. For memcpy, it is faster to use the GPRs to
> > do the copy for large sizes (2x faster). For even larger sizes,
> > it is better to use the nontemporal load/store instructions so
> > we don't pollute the L1/L2 caches.
> >
> > For smaller sizes, the characteristics are very similar to
> > those of other cores.
> > I used the thunderx memcpy as a starting point and expanded from there.
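> >
> > As a rough C sketch of the resulting size tiers (illustration only;
> > the helper names are hypothetical, and the thresholds are the cmp
> > values in memcpy_oryon1.S):
> >
> >   #include <stddef.h>
> >
> >   /* Hypothetical stand-ins for the labelled assembly blocks.  */
> >   extern void *copy_small (void *, const void *, size_t);
> >   extern void *copy_medium (void *, const void *, size_t);
> >   extern void *copy_loop64 (void *, const void *, size_t);
> >   extern void *copy_loop64_nontemp (void *, const void *, size_t);
> >
> >   void *memcpy_oryon1_model (void *dst, const void *src, size_t n)
> >   {
> >     if (n <= 16)
> >       return copy_small (dst, src, n);    /* overlapping loads/stores */
> >     if (n <= 96)
> >       return copy_medium (dst, src, n);   /* fully unrolled ldp/stp */
> >     if (n < 32768)
> >       return copy_loop64 (dst, src, n);   /* 64-byte ldp/stp loop */
> >     /* ldnp/stnp to avoid polluting the L1/L2 caches.  */
> >     return copy_loop64_nontemp (dst, src, n);
> >   }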
> >
> > Changes since v1:
> > * v2: Fix ordering in Makefile.
> > * v3: Fix comment grammar about the ldnp/stnp instructions.
> >
> > Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
>
> LGTM, thanks.
>
> Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
>
> > ---
> >  sysdeps/aarch64/cpu-features.h              |   6 +
> >  sysdeps/aarch64/multiarch/Makefile          |   1 +
> >  sysdeps/aarch64/multiarch/ifunc-impl-list.c |   3 +
> >  sysdeps/aarch64/multiarch/memcpy.c          |   5 +
> >  sysdeps/aarch64/multiarch/memcpy_oryon1.S   | 301 ++++++++++++++++++++
> >  5 files changed, 316 insertions(+)
> > create mode 100644 sysdeps/aarch64/multiarch/memcpy_oryon1.S
> >
> > diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
> > index 31782b66f9..bc8d842238 100644
> > --- a/sysdeps/aarch64/cpu-features.h
> > +++ b/sysdeps/aarch64/cpu-features.h
> > @@ -1,6 +1,7 @@
> > /* Initialize CPU feature data. AArch64 version.
> > This file is part of the GNU C Library.
> > Copyright (C) 2017-2024 Free Software Foundation, Inc.
> > + Copyright The GNU Toolchain Authors.
> >
> > The GNU C Library is free software; you can redistribute it and/or
> > modify it under the terms of the GNU Lesser General Public
> > @@ -56,6 +57,11 @@
> > #define IS_A64FX(midr) (MIDR_IMPLEMENTOR(midr) == 'F' \
> > && MIDR_PARTNUM(midr) == 0x001)
> >
> > +#define IS_ORYON1(midr) (MIDR_IMPLEMENTOR(midr) == 'Q' \
> > + && (MIDR_PARTNUM(midr) == 0x001 \
> > + || (MIDR_PARTNUM(midr) == 0x002 \
> > + && MIDR_VARIANT(midr) == 0)))
> > +
> > struct cpu_features
> > {
> > uint64_t midr_el1;
> > diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> > index e4720b7468..ef5ea9ab8c 100644
> > --- a/sysdeps/aarch64/multiarch/Makefile
> > +++ b/sysdeps/aarch64/multiarch/Makefile
> > @@ -5,6 +5,7 @@ sysdep_routines += \
> > memcpy_a64fx \
> > memcpy_generic \
> > memcpy_mops \
> > + memcpy_oryon1 \
> > memcpy_sve \
> > memcpy_thunderx \
> > memcpy_thunderx2 \
> > diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> > index ecd0f87de6..65c56b9b41 100644
> > --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> > @@ -1,5 +1,6 @@
> > /* Enumerate available IFUNC implementations of a function. AARCH64 version.
> > Copyright (C) 2017-2024 Free Software Foundation, Inc.
> > + Copyright The GNU Toolchain Authors.
> > This file is part of the GNU C Library.
> >
> > The GNU C Library is free software; you can redistribute it and/or
> > @@ -35,6 +36,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > /* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c. */
> > IFUNC_IMPL (i, name, memcpy,
> > IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
> > + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_oryon1)
> > IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
> > #if HAVE_AARCH64_SVE_ASM
> > IFUNC_IMPL_ADD (array, i, memcpy, sve && !bti, __memcpy_a64fx)
> > @@ -44,6 +46,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
> > IFUNC_IMPL (i, name, memmove,
> > IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
> > + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_oryon1)
> > IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
> > #if HAVE_AARCH64_SVE_ASM
> > IFUNC_IMPL_ADD (array, i, memmove, sve && !bti, __memmove_a64fx)
> > diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> > index ce53567dab..15c954778b 100644
> > --- a/sysdeps/aarch64/multiarch/memcpy.c
> > +++ b/sysdeps/aarch64/multiarch/memcpy.c
> > @@ -1,5 +1,6 @@
> > /* Multiple versions of memcpy. AARCH64 version.
> > Copyright (C) 2017-2024 Free Software Foundation, Inc.
> > + Copyright The GNU Toolchain Authors.
> > This file is part of the GNU C Library.
> >
> > The GNU C Library is free software; you can redistribute it and/or
> > @@ -34,6 +35,7 @@ extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
> > extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
> > extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
> > extern __typeof (__redirect_memcpy) __memcpy_mops attribute_hidden;
> > +extern __typeof (__redirect_memcpy) __memcpy_oryon1 attribute_hidden;
> >
> > static inline __typeof (__redirect_memcpy) *
> > select_memcpy_ifunc (void)
> > @@ -50,6 +52,9 @@ select_memcpy_ifunc (void)
> > return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;
> > }
> >
> > + if (IS_ORYON1 (midr))
> > + return __memcpy_oryon1;
> > +
> > if (IS_THUNDERX (midr))
> > return __memcpy_thunderx;
> >
> > diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
> > new file mode 100644
> > index 0000000000..4efc43df28
> > --- /dev/null
> > +++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
> > @@ -0,0 +1,301 @@
> > +/* An oryon-1 core optimized memcpy implementation for AARCH64.
> > + Copyright (C) 2017-2024 Free Software Foundation, Inc.
> > + Copyright The GNU Toolchain Authors.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#include <sysdep.h>
> > +
> > +/* Assumptions:
> > + *
> > + * ARMv8-a, AArch64, unaligned accesses.
> > + *
> > + */
> > +
> > +#define dstin x0
> > +#define src x1
> > +#define count x2
> > +#define dst x3
> > +#define srcend x4
> > +#define dstend x5
> > +#define A_l x6
> > +#define A_lw w6
> > +#define A_h x7
> > +#define A_hw w7
> > +#define B_l x8
> > +#define B_lw w8
> > +#define B_h x9
> > +#define C_l x10
> > +#define C_h x11
> > +#define D_l x12
> > +#define D_h x13
> > +#define E_l src
> > +#define E_h count
> > +#define F_l srcend
> > +#define F_h dst
> > +#define G_l count
> > +#define G_h dst
> > +#define tmp1 x14
> > +
> > +/* Copies are split into 3 main cases: small copies of up to 16 bytes;
> > +   medium copies of 17..96 bytes, which are fully unrolled; and large
> > +   copies of more than 96 bytes, which align the destination and use an
> > +   unrolled loop processing 64 bytes per iteration.
> > + In order to share code with memmove, small and medium copies read all
> > + data before writing, allowing any kind of overlap. So small, medium
> > + and large backwards memmoves are handled by falling through into memcpy.
> > + Overlapping large forward memmoves use a loop that copies backwards.
> > +*/
> > +
> > +ENTRY (__memmove_oryon1)
> > +
> > + PTR_ARG (0)
> > + PTR_ARG (1)
> > + SIZE_ARG (2)
> > +
> > + sub tmp1, dstin, src
> > + cmp count, 96
> > + ccmp tmp1, count, 2, hi
> > + b.lo L(move_long)
> > +
> > + /* Common case falls through into memcpy. */
> > +END (__memmove_oryon1)
> > +
> > +ENTRY (__memcpy_oryon1)
> > +
> > + PTR_ARG (0)
> > + PTR_ARG (1)
> > + SIZE_ARG (2)
> > +
> > + add srcend, src, count
> > + add dstend, dstin, count
> > + cmp count, 16
> > + b.ls L(copy16)
> > + cmp count, 96
> > + b.hi L(copy_long)
> > +
> > + /* Medium copies: 17..96 bytes. */
> > + sub tmp1, count, 1
> > + ldp A_l, A_h, [src]
> > + tbnz tmp1, 6, L(copy96)
> > + ldp D_l, D_h, [srcend, -16]
> > + tbz tmp1, 5, 1f
> > + ldp B_l, B_h, [src, 16]
> > + ldp C_l, C_h, [srcend, -32]
> > + stp B_l, B_h, [dstin, 16]
> > + stp C_l, C_h, [dstend, -32]
> > +1:
> > + stp A_l, A_h, [dstin]
> > + stp D_l, D_h, [dstend, -16]
> > + ret
> > +
> > + .p2align 6
> > + /* Small copies: 0..16 bytes. */
> > +L(copy16):
> > + cmp count, 8
> > + b.lo 1f
> > + ldr A_l, [src]
> > + ldr A_h, [srcend, -8]
> > + str A_l, [dstin]
> > + str A_h, [dstend, -8]
> > + ret
> > + .p2align 6
> > +1:
> > + tbz count, 2, 1f
> > + ldr A_lw, [src]
> > + ldr A_hw, [srcend, -4]
> > + str A_lw, [dstin]
> > + str A_hw, [dstend, -4]
> > + ret
> > +
> > + /* Copy 0..3 bytes. Use a branchless sequence that copies the same
> > + byte 3 times if count==1, or the 2nd byte twice if count==2. */
> > +1:
> > + cbz count, 2f
> > + lsr tmp1, count, 1
> > + ldrb A_lw, [src]
> > + ldrb A_hw, [srcend, -1]
> > + ldrb B_lw, [src, tmp1]
> > + strb A_lw, [dstin]
> > + strb B_lw, [dstin, tmp1]
> > + strb A_hw, [dstend, -1]
> > +2: ret
> > +
> > + .p2align 6
> > + /* Copy 64..96 bytes. Copy 64 bytes from the start and
> > + 32 bytes from the end. */
> > +L(copy96):
> > + ldp B_l, B_h, [src, 16]
> > + ldp C_l, C_h, [src, 32]
> > + ldp D_l, D_h, [src, 48]
> > + ldp E_l, E_h, [srcend, -32]
> > + ldp F_l, F_h, [srcend, -16]
> > + stp A_l, A_h, [dstin]
> > + stp B_l, B_h, [dstin, 16]
> > + stp C_l, C_h, [dstin, 32]
> > + stp D_l, D_h, [dstin, 48]
> > + stp E_l, E_h, [dstend, -32]
> > + stp F_l, F_h, [dstend, -16]
> > + ret
> > +
> > + /* Align DST to 16 byte alignment so that we don't cross cache line
> > + boundaries on both loads and stores. There are at least 96 bytes
> > + to copy, so copy 16 bytes unaligned and then align. The loop
> > + copies 64 bytes per iteration and prefetches one iteration ahead. */
> > +
> > + .p2align 6
> > +L(copy_long):
> > +
> > +	/* On oryon1 cores, large memcpys benefit from using ldnp/stnp.
> > +	   This loop is identical to the one below it except that it uses
> > +	   the ldnp/stnp instructions.  For copies of less than 32768 bytes,
> > +	   the ldnp/stnp instructions do not help and cause a slowdown, so
> > +	   use the ldnp/stnp loop only for the largest sizes.  */
> > +
> > + cmp count, #32768
> > + b.lo L(copy_long_without_nontemp)
> > + and tmp1, dstin, 15
> > + bic dst, dstin, 15
> > + ldnp D_l, D_h, [src]
> > + sub src, src, tmp1
> > + add count, count, tmp1 /* Count is now 16 too large. */
> > + ldnp A_l, A_h, [src, 16]
> > + stnp D_l, D_h, [dstin]
> > + ldnp B_l, B_h, [src, 32]
> > + ldnp C_l, C_h, [src, 48]
> > + ldnp D_l, D_h, [src, 64]
> > + add src, src, #64
> > + subs count, count, 128 + 16 /* Test and readjust count. */
> > +
> > +L(nontemp_loop64):
> > + tbz src, #6, 1f
> > +1:
> > + stnp A_l, A_h, [dst, 16]
> > + ldnp A_l, A_h, [src, 16]
> > + stnp B_l, B_h, [dst, 32]
> > + ldnp B_l, B_h, [src, 32]
> > + stnp C_l, C_h, [dst, 48]
> > + ldnp C_l, C_h, [src, 48]
> > + stnp D_l, D_h, [dst, 64]
> > + ldnp D_l, D_h, [src, 64]
> > + add src, src, #64
> > + add dst, dst, #64
> > + subs count, count, 64
> > + b.hi L(nontemp_loop64)
> > + b L(last64)
> > +
> > +L(copy_long_without_nontemp):
> > +
> > + and tmp1, dstin, 15
> > + bic dst, dstin, 15
> > + ldp D_l, D_h, [src]
> > + sub src, src, tmp1
> > + add count, count, tmp1 /* Count is now 16 too large. */
> > + ldp A_l, A_h, [src, 16]
> > + stp D_l, D_h, [dstin]
> > + ldp B_l, B_h, [src, 32]
> > + ldp C_l, C_h, [src, 48]
> > + ldp D_l, D_h, [src, 64]!
> > + subs count, count, 128 + 16 /* Test and readjust count. */
> > + b.ls L(last64)
> > +L(loop64):
> > + stp A_l, A_h, [dst, 16]
> > + ldp A_l, A_h, [src, 16]
> > + stp B_l, B_h, [dst, 32]
> > + ldp B_l, B_h, [src, 32]
> > + stp C_l, C_h, [dst, 48]
> > + ldp C_l, C_h, [src, 48]
> > + stp D_l, D_h, [dst, 64]!
> > + ldp D_l, D_h, [src, 64]!
> > + subs count, count, 64
> > + b.hi L(loop64)
> > +
> > + /* Write the last full set of 64 bytes. The remainder is at most 64
> > + bytes, so it is safe to always copy 64 bytes from the end even if
> > + there is just 1 byte left. */
> > +L(last64):
> > + ldp E_l, E_h, [srcend, -64]
> > + stp A_l, A_h, [dst, 16]
> > + ldp A_l, A_h, [srcend, -48]
> > + stp B_l, B_h, [dst, 32]
> > + ldp B_l, B_h, [srcend, -32]
> > + stp C_l, C_h, [dst, 48]
> > + ldp C_l, C_h, [srcend, -16]
> > + stp D_l, D_h, [dst, 64]
> > + stp E_l, E_h, [dstend, -64]
> > + stp A_l, A_h, [dstend, -48]
> > + stp B_l, B_h, [dstend, -32]
> > + stp C_l, C_h, [dstend, -16]
> > + ret
> > +
> > + .p2align 6
> > +L(move_long):
> > + cbz tmp1, 3f
> > +
> > + add srcend, src, count
> > + add dstend, dstin, count
> > +
> > + /* Align dstend to 16 byte alignment so that we don't cross cache line
> > + boundaries on both loads and stores. There are at least 96 bytes
> > + to copy, so copy 16 bytes unaligned and then align. The loop
> > + copies 64 bytes per iteration and prefetches one iteration ahead. */
> > +
> > + and tmp1, dstend, 15
> > + ldp D_l, D_h, [srcend, -16]
> > + sub srcend, srcend, tmp1
> > + sub count, count, tmp1
> > + ldp A_l, A_h, [srcend, -16]
> > + stp D_l, D_h, [dstend, -16]
> > + ldp B_l, B_h, [srcend, -32]
> > + ldp C_l, C_h, [srcend, -48]
> > + ldp D_l, D_h, [srcend, -64]!
> > + sub dstend, dstend, tmp1
> > + subs count, count, 128
> > + b.ls 2f
> > +
> > + nop
> > +1:
> > + stp A_l, A_h, [dstend, -16]
> > + ldp A_l, A_h, [srcend, -16]
> > + stp B_l, B_h, [dstend, -32]
> > + ldp B_l, B_h, [srcend, -32]
> > + stp C_l, C_h, [dstend, -48]
> > + ldp C_l, C_h, [srcend, -48]
> > + stp D_l, D_h, [dstend, -64]!
> > + ldp D_l, D_h, [srcend, -64]!
> > + subs count, count, 64
> > + b.hi 1b
> > +
> > + /* Write the last full set of 64 bytes. The remainder is at most 64
> > + bytes, so it is safe to always copy 64 bytes from the start even if
> > + there is just 1 byte left. */
> > +2:
> > + ldp G_l, G_h, [src, 48]
> > + stp A_l, A_h, [dstend, -16]
> > + ldp A_l, A_h, [src, 32]
> > + stp B_l, B_h, [dstend, -32]
> > + ldp B_l, B_h, [src, 16]
> > + stp C_l, C_h, [dstend, -48]
> > + ldp C_l, C_h, [src]
> > + stp D_l, D_h, [dstend, -64]
> > + stp G_l, G_h, [dstin, 48]
> > + stp A_l, A_h, [dstin, 32]
> > + stp B_l, B_h, [dstin, 16]
> > + stp C_l, C_h, [dstin]
> > +3: ret
> > +
> > +END (__memcpy_oryon1)
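> >
> > For reference, a C model of the branchless 0..3 byte tail under
> > L(copy16) (a sketch, not code from the patch): all loads happen
> > before any store, so overlapping memmove inputs are safe; for
> > count == 1 all three stores hit byte 0, and for count == 2 the
> > middle store simply rewrites byte 1.
> >
> >   #include <stddef.h>
> >
> >   static void copy_0_3 (unsigned char *d, const unsigned char *s,
> >                         size_t count)
> >   {
> >     if (count == 0)                 /* cbz count, 2f */
> >       return;
> >     size_t mid = count >> 1;        /* lsr tmp1, count, 1 */
> >     unsigned char a = s[0];         /* ldrb A_lw, [src] */
> >     unsigned char c = s[count - 1]; /* ldrb A_hw, [srcend, -1] */
> >     unsigned char b = s[mid];       /* ldrb B_lw, [src, tmp1] */
> >     d[0] = a;                       /* strb A_lw, [dstin] */
> >     d[mid] = b;                     /* strb B_lw, [dstin, tmp1] */
> >     d[count - 1] = c;               /* strb A_hw, [dstend, -1] */
> >   }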
>
--
Andreas K. Hüttel
dilfridge@gentoo.org
Gentoo Linux developer
(council, comrel, toolchain, base-system, perl, libreoffice)
https://wiki.gentoo.org/wiki/User:Dilfridge