[PATCH] Remove thunderx{,2} memcpy
Andrew Pinski
quic_apinski@quicinc.com
Mon Nov 18 18:39:22 GMT 2024
ThunderX1 and ThunderX2 have been retired for a few years now.
So let's remove the thunderx{,2} specific versions of memcpy and memmove.
The performance gain from them was for medium and large sizes,
while the generic (aarch64) memcpy handles those sizes only slightly worse.
Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
sysdeps/aarch64/cpu-features.h | 8 -
sysdeps/aarch64/multiarch/Makefile | 2 -
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/aarch64/multiarch/memcpy.c | 8 -
sysdeps/aarch64/multiarch/memcpy_thunderx.S | 305 -------------
sysdeps/aarch64/multiarch/memcpy_thunderx2.S | 457 -------------------
sysdeps/aarch64/multiarch/memmove.c | 8 -
7 files changed, 792 deletions(-)
delete mode 100644 sysdeps/aarch64/multiarch/memcpy_thunderx.S
delete mode 100644 sysdeps/aarch64/multiarch/memcpy_thunderx2.S
diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
index bc8d842238..6c0a3fe4e1 100644
--- a/sysdeps/aarch64/cpu-features.h
+++ b/sysdeps/aarch64/cpu-features.h
@@ -40,14 +40,6 @@
#define MIDR_IMPLEMENTOR(midr) \
(((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT)
-#define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \
- && MIDR_PARTNUM(midr) == 0x0a1)
-
-#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B' \
- && MIDR_PARTNUM(midr) == 0x516)
-#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \
- && MIDR_PARTNUM(midr) == 0xaf)
-
#define IS_EMAG(midr) (MIDR_IMPLEMENTOR(midr) == 'P' \
&& MIDR_PARTNUM(midr) == 0x000)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 3e251cc234..772b16a358 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -7,8 +7,6 @@ sysdep_routines += \
memcpy_mops \
memcpy_oryon1 \
memcpy_sve \
- memcpy_thunderx \
- memcpy_thunderx2 \
memmove_mops \
memset_a64fx \
memset_emag \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index b2fda541f9..4a981e931d 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -35,9 +35,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c. */
IFUNC_IMPL (i, name, memcpy,
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_oryon1)
- IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
#if HAVE_AARCH64_SVE_ASM
IFUNC_IMPL_ADD (array, i, memcpy, sve && !bti, __memcpy_a64fx)
IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
@@ -45,9 +43,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, mops, __memcpy_mops)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_oryon1)
- IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
#if HAVE_AARCH64_SVE_ASM
IFUNC_IMPL_ADD (array, i, memmove, sve && !bti, __memmove_a64fx)
IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 15c954778b..9251297ee6 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -30,8 +30,6 @@
extern __typeof (__redirect_memcpy) __libc_memcpy;
extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
-extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
-extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_mops attribute_hidden;
@@ -55,12 +53,6 @@ select_memcpy_ifunc (void)
if (IS_ORYON1 (midr))
return __memcpy_oryon1;
- if (IS_THUNDERX (midr))
- return __memcpy_thunderx;
-
- if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
- return __memcpy_thunderx2;
-
return __memcpy_generic;
}
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
deleted file mode 100644
index 5d8438a82e..0000000000
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ /dev/null
@@ -1,305 +0,0 @@
-/* A Thunderx Optimized memcpy implementation for AARCH64.
- Copyright (C) 2017-2024 Free Software Foundation, Inc.
-
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-/* The actual code in this memcpy and memmove should be identical to the
- generic version except for the code under '#ifdef THUNDERX'. This is
- to make is easier to keep this version and the generic version in sync
- for changes that are not specific to thunderx. */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses.
- *
- */
-
-#define dstin x0
-#define src x1
-#define count x2
-#define dst x3
-#define srcend x4
-#define dstend x5
-#define A_l x6
-#define A_lw w6
-#define A_h x7
-#define A_hw w7
-#define B_l x8
-#define B_lw w8
-#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
-#define G_l count
-#define G_h dst
-#define tmp1 x14
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use an unrolled loop
- processing 64 bytes per iteration.
- In order to share code with memmove, small and medium copies read all
- data before writing, allowing any kind of overlap. So small, medium
- and large backwards memmoves are handled by falling through into memcpy.
- Overlapping large forward memmoves use a loop that copies backwards.
-*/
-
-ENTRY (__memmove_thunderx)
-
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
- sub tmp1, dstin, src
- cmp count, 96
- ccmp tmp1, count, 2, hi
- b.lo L(move_long)
-
- /* Common case falls through into memcpy. */
-END (__memmove_thunderx)
-
-ENTRY (__memcpy_thunderx)
-
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
- prfm PLDL1KEEP, [src]
- add srcend, src, count
- add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 96
- b.hi L(copy_long)
-
- /* Medium copies: 17..96 bytes. */
- sub tmp1, count, 1
- ldp A_l, A_h, [src]
- tbnz tmp1, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz tmp1, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
- /* Small copies: 0..16 bytes. */
-L(copy16):
- cmp count, 8
- b.lo 1f
- ldr A_l, [src]
- ldr A_h, [srcend, -8]
- str A_l, [dstin]
- str A_h, [dstend, -8]
- ret
- .p2align 4
-1:
- tbz count, 2, 1f
- ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
- str A_lw, [dstin]
- str A_hw, [dstend, -4]
- ret
-
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
- ldrb B_lw, [src, tmp1]
- strb A_lw, [dstin]
- strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
-
- .p2align 4
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
- 32 bytes from the end. */
-L(copy96):
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [src, 32]
- ldp D_l, D_h, [src, 48]
- ldp E_l, E_h, [srcend, -32]
- ldp F_l, F_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
- ret
-
- /* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
- .p2align 4
-L(copy_long):
-
- /* On thunderx, large memcpy's are helped by software prefetching.
- This loop is identical to the one below it but with prefetching
- instructions included. For loops that are less than 32768 bytes,
- the prefetching does not help and slow the code down so we only
- use the prefetching loop for the largest memcpys. */
-
- cmp count, #32768
- b.lo L(copy_long_without_prefetch)
- and tmp1, dstin, 15
- bic dst, dstin, 15
- ldp D_l, D_h, [src]
- sub src, src, tmp1
- prfm pldl1strm, [src, 384]
- add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
- subs count, count, 128 + 16 /* Test and readjust count. */
-
-L(prefetch_loop64):
- tbz src, #6, 1f
- prfm pldl1strm, [src, 512]
-1:
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
- ldp D_l, D_h, [src, 64]!
- subs count, count, 64
- b.hi L(prefetch_loop64)
- b L(last64)
-
-L(copy_long_without_prefetch):
-
- and tmp1, dstin, 15
- bic dst, dstin, 15
- ldp D_l, D_h, [src]
- sub src, src, tmp1
- add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
- subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(last64)
-L(loop64):
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
- ldp D_l, D_h, [src, 64]!
- subs count, count, 64
- b.hi L(loop64)
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-L(last64):
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
- ret
-
- .p2align 4
-L(move_long):
- cbz tmp1, 3f
-
- add srcend, src, count
- add dstend, dstin, count
-
- /* Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
- and tmp1, dstend, 15
- ldp D_l, D_h, [srcend, -16]
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldp A_l, A_h, [srcend, -16]
- stp D_l, D_h, [dstend, -16]
- ldp B_l, B_h, [srcend, -32]
- ldp C_l, C_h, [srcend, -48]
- ldp D_l, D_h, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls 2f
-
- nop
-1:
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [srcend, -16]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [srcend, -48]
- stp D_l, D_h, [dstend, -64]!
- ldp D_l, D_h, [srcend, -64]!
- subs count, count, 64
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
- ldp G_l, G_h, [src, 48]
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [src, 32]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [src, 16]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [src]
- stp D_l, D_h, [dstend, -64]
- stp G_l, G_h, [dstin, 48]
- stp A_l, A_h, [dstin, 32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin]
-3: ret
-
-END (__memcpy_thunderx)
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
deleted file mode 100644
index a3d79aafcd..0000000000
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ /dev/null
@@ -1,457 +0,0 @@
-/* A Thunderx2 Optimized memcpy implementation for AARCH64.
- Copyright (C) 2018-2024 Free Software Foundation, Inc.
-
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses.
- *
- */
-
-#define dstin x0
-#define src x1
-#define count x2
-#define dst x3
-#define srcend x4
-#define dstend x5
-#define tmp2 x6
-#define tmp3 x7
-#define tmp3w w7
-#define A_l x6
-#define A_lw w6
-#define A_h x7
-#define A_hw w7
-#define B_l x8
-#define B_lw w8
-#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
-#define G_l count
-#define G_h dst
-#define tmp1 x14
-
-#define A_q q0
-#define B_q q1
-#define C_q q2
-#define D_q q3
-#define E_q q4
-#define F_q q5
-#define G_q q6
-#define H_q q7
-#define I_q q16
-#define J_q q17
-
-#define A_v v0
-#define B_v v1
-#define C_v v2
-#define D_v v3
-#define E_v v4
-#define F_v v5
-#define G_v v6
-#define H_v v7
-#define I_v v16
-#define J_v v17
-
-/* Overlapping large forward memmoves use a loop that copies backwards.
- Otherwise memcpy is used. Small moves branch to memcopy16 directly.
- The longer memcpy cases fall through to the memcpy head.
-*/
-
-ENTRY (__memmove_thunderx2)
-
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
- add srcend, src, count
- cmp count, 16
- b.ls L(memcopy16)
- sub tmp1, dstin, src
- cmp count, 96
- ccmp tmp1, count, 2, hi
- b.lo L(move_long)
-
-END (__memmove_thunderx2)
-
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use load-and-merge
- approach in the case src and dst addresses are unaligned not evenly,
- so that, actual loads and stores are always aligned.
- Large copies use the loops processing 64 bytes per iteration for
- unaligned case and 128 bytes per iteration for aligned ones.
-*/
-
-#define MEMCPY_PREFETCH_LDR 640
-
-ENTRY (__memcpy_thunderx2)
-
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
- add srcend, src, count
- cmp count, 16
- b.ls L(memcopy16)
- ldr A_q, [src], #16
- add dstend, dstin, count
- and tmp1, src, 15
- cmp count, 96
- b.hi L(memcopy_long)
-
- /* Medium copies: 17..96 bytes. */
- ldr E_q, [srcend, -16]
- cmp count, 64
- b.gt L(memcpy_copy96)
- cmp count, 48
- b.le L(bytes_17_to_48)
- /* 49..64 bytes */
- ldp B_q, C_q, [src]
- str E_q, [dstend, -16]
- stp A_q, B_q, [dstin]
- str C_q, [dstin, 32]
- ret
-
-L(bytes_17_to_48):
- /* 17..48 bytes*/
- cmp count, 32
- b.gt L(bytes_32_to_48)
- /* 17..32 bytes*/
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
-L(bytes_32_to_48):
- /* 32..48 */
- ldr B_q, [src]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- str B_q, [dstin, 16]
- ret
-
- .p2align 4
- /* Small copies: 0..16 bytes. */
-L(memcopy16):
- cmp count, 8
- b.lo L(bytes_0_to_8)
- ldr A_l, [src]
- ldr A_h, [srcend, -8]
- add dstend, dstin, count
- str A_l, [dstin]
- str A_h, [dstend, -8]
- ret
- .p2align 4
-
-L(bytes_0_to_8):
- tbz count, 2, L(bytes_0_to_3)
- ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
- add dstend, dstin, count
- str A_lw, [dstin]
- str A_hw, [dstend, -4]
- ret
-
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-L(bytes_0_to_3):
- cbz count, 1f
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
- add dstend, dstin, count
- ldrb B_lw, [src, tmp1]
- strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
- strb A_lw, [dstin]
-1:
- ret
-
- .p2align 4
-
-L(memcpy_copy96):
- /* Copying 65..96 bytes. A_q (first 16 bytes) and
- E_q(last 16 bytes) are already loaded. The size
- is large enough to benefit from aligned loads */
- bic src, src, 15
- ldp B_q, C_q, [src]
- /* Loaded 64 bytes, second 16-bytes chunk can be
- overlapping with the first chunk by tmp1 bytes.
- Stored 16 bytes. */
- sub dst, dstin, tmp1
- add count, count, tmp1
- /* The range of count being [65..96] becomes [65..111]
- after tmp [0..15] gets added to it,
- count now is <bytes-left-to-load>+48 */
- cmp count, 80
- b.gt L(copy96_medium)
- ldr D_q, [src, 32]
- stp B_q, C_q, [dst, 16]
- str D_q, [dst, 48]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
- .p2align 4
-L(copy96_medium):
- ldp D_q, G_q, [src, 32]
- cmp count, 96
- b.gt L(copy96_large)
- stp B_q, C_q, [dst, 16]
- stp D_q, G_q, [dst, 48]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
-L(copy96_large):
- ldr F_q, [src, 64]
- str B_q, [dst, 16]
- stp C_q, D_q, [dst, 32]
- stp G_q, F_q, [dst, 64]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
- .p2align 4
-L(memcopy_long):
- bic src, src, 15
- ldp B_q, C_q, [src], #32
- sub dst, dstin, tmp1
- add count, count, tmp1
- add dst, dst, 16
- and tmp1, dst, 15
- ldp D_q, E_q, [src], #32
- str A_q, [dstin]
-
- /* Already loaded 64+16 bytes. Check if at
- least 64 more bytes left */
- subs count, count, 64+64+16
- b.lt L(loop128_exit0)
- cmp count, MEMCPY_PREFETCH_LDR + 64 + 32
- b.lt L(loop128)
- cbnz tmp1, L(dst_unaligned)
- sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32
-
- .p2align 4
-
-L(loop128_prefetch):
- prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
- ldp F_q, G_q, [src], #32
- stp B_q, C_q, [dst], #32
- ldp H_q, I_q, [src], #32
- prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
- ldp B_q, C_q, [src], #32
- stp D_q, E_q, [dst], #32
- ldp D_q, E_q, [src], #32
- stp F_q, G_q, [dst], #32
- stp H_q, I_q, [dst], #32
- subs count, count, 128
- b.ge L(loop128_prefetch)
-
- add count, count, MEMCPY_PREFETCH_LDR + 64 + 32
- .p2align 4
-L(loop128):
- ldp F_q, G_q, [src], #32
- ldp H_q, I_q, [src], #32
- stp B_q, C_q, [dst], #32
- stp D_q, E_q, [dst], #32
- subs count, count, 64
- b.lt L(loop128_exit1)
- ldp B_q, C_q, [src], #32
- ldp D_q, E_q, [src], #32
- stp F_q, G_q, [dst], #32
- stp H_q, I_q, [dst], #32
- subs count, count, 64
- b.ge L(loop128)
-L(loop128_exit0):
- ldp F_q, G_q, [srcend, -64]
- ldp H_q, I_q, [srcend, -32]
- stp B_q, C_q, [dst], #32
- stp D_q, E_q, [dst]
- stp F_q, G_q, [dstend, -64]
- stp H_q, I_q, [dstend, -32]
- ret
-L(loop128_exit1):
- ldp B_q, C_q, [srcend, -64]
- ldp D_q, E_q, [srcend, -32]
- stp F_q, G_q, [dst], #32
- stp H_q, I_q, [dst]
- stp B_q, C_q, [dstend, -64]
- stp D_q, E_q, [dstend, -32]
- ret
-
-L(dst_unaligned_tail):
- ldp C_q, D_q, [srcend, -64]
- ldp E_q, F_q, [srcend, -32]
- stp A_q, B_q, [dst], #32
- stp H_q, I_q, [dst], #16
- str G_q, [dst, tmp1]
- stp C_q, D_q, [dstend, -64]
- stp E_q, F_q, [dstend, -32]
- ret
-
-L(dst_unaligned):
- /* For the unaligned store case the code loads two
- aligned chunks and then merges them using ext
- instruction. This can be up to 30% faster than
- the the simple unaligned store access.
-
- Current state: tmp1 = dst % 16; C_q, D_q, E_q
- contains data yet to be stored. src and dst points
- to next-to-be-processed data. A_q, B_q contains
- data already stored before, count = bytes left to
- be load decremented by 64.
-
- The control is passed here if at least 64 bytes left
- to be loaded. The code does two aligned loads and then
- extracts (16-tmp1) bytes from the first register and
- tmp1 bytes from the next register forming the value
- for the aligned store.
-
- As ext instruction can only have it's index encoded
- as immediate. 15 code chunks process each possible
- index value. Computed goto is used to reach the
- required code. */
-
- /* Store the 16 bytes to dst and align dst for further
- operations, several bytes will be stored at this
- address once more */
-
- ldp F_q, G_q, [src], #32
- stp B_q, C_q, [dst], #32
- bic dst, dst, 15
- sub count, count, 32
- adrp tmp2, L(ext_table)
- add tmp2, tmp2, :lo12:L(ext_table)
- add tmp2, tmp2, tmp1, LSL #2
- ldr tmp3w, [tmp2]
- add tmp2, tmp2, tmp3w, SXTW
- br tmp2
-
-.p2align 4
- /* to make the loop in each chunk 16-bytes aligned */
- nop
-#define EXT_CHUNK(shft) \
-L(ext_size_ ## shft):;\
- ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
- ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\
- ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-1:;\
- stp A_q, B_q, [dst], #32;\
- prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
- ldp C_q, D_q, [src], #32;\
- ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- stp H_q, I_q, [dst], #32;\
- ext A_v.16b, G_v.16b, C_v.16b, 16-shft;\
- ext B_v.16b, C_v.16b, D_v.16b, 16-shft;\
- ldp F_q, G_q, [src], #32;\
- ext H_v.16b, D_v.16b, F_v.16b, 16-shft;\
- subs count, count, 64;\
- b.ge 1b;\
-2:;\
- ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- b L(dst_unaligned_tail);
-
-EXT_CHUNK(1)
-EXT_CHUNK(2)
-EXT_CHUNK(3)
-EXT_CHUNK(4)
-EXT_CHUNK(5)
-EXT_CHUNK(6)
-EXT_CHUNK(7)
-EXT_CHUNK(8)
-EXT_CHUNK(9)
-EXT_CHUNK(10)
-EXT_CHUNK(11)
-EXT_CHUNK(12)
-EXT_CHUNK(13)
-EXT_CHUNK(14)
-EXT_CHUNK(15)
-
-L(move_long):
- .p2align 4
-1:
- cbz tmp1, 3f
-
- add srcend, src, count
- add dstend, dstin, count
-
- and tmp1, srcend, 15
- ldr D_q, [srcend, -16]
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldp A_q, B_q, [srcend, -32]
- str D_q, [dstend, -16]
- ldp C_q, D_q, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls 2f
-
- .p2align 4
-1:
- subs count, count, 64
- stp A_q, B_q, [dstend, -32]
- ldp A_q, B_q, [srcend, -32]
- stp C_q, D_q, [dstend, -64]!
- ldp C_q, D_q, [srcend, -64]!
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
- ldp E_q, F_q, [src, 32]
- ldp G_q, H_q, [src]
- stp A_q, B_q, [dstend, -32]
- stp C_q, D_q, [dstend, -64]
- stp E_q, F_q, [dstin, 32]
- stp G_q, H_q, [dstin]
-3: ret
-
-
-END (__memcpy_thunderx2)
- .section .rodata
- .p2align 4
-
-L(ext_table):
- /* The first entry is for the alignment of 0 and is never
- actually used (could be any value). */
- .word 0
- .word L(ext_size_1) -.
- .word L(ext_size_2) -.
- .word L(ext_size_3) -.
- .word L(ext_size_4) -.
- .word L(ext_size_5) -.
- .word L(ext_size_6) -.
- .word L(ext_size_7) -.
- .word L(ext_size_8) -.
- .word L(ext_size_9) -.
- .word L(ext_size_10) -.
- .word L(ext_size_11) -.
- .word L(ext_size_12) -.
- .word L(ext_size_13) -.
- .word L(ext_size_14) -.
- .word L(ext_size_15) -.
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index fe95037be3..106011acec 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,8 +29,6 @@
extern __typeof (__redirect_memmove) __libc_memmove;
extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_sve attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_mops attribute_hidden;
@@ -50,12 +48,6 @@ select_memmove_ifunc (void)
return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
}
- if (IS_THUNDERX (midr))
- return __memmove_thunderx;
-
- if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
- return __memmove_thunderx2;
-
return __memmove_generic;
}
--
2.43.0
More information about the Libc-alpha
mailing list