[PATCH] aarch64: thunderx2 memcpy implementation cleanup and streamlining
- From: Anton Youdkevitch <anton dot youdkevitch at bell-sw dot com>
- To: libc-alpha at sourceware dot org
- Date: Wed, 20 Mar 2019 17:31:42 +0300
- Subject: [PATCH] aarch64: thunderx2 memcpy implementation cleanup and streamlining
Replaced named labels with numeric ones to reduce clutter.
Rewrote the branching scheme in the load-and-merge chunk so that
the most probable case is the fall-through path.
ChangeLog:
* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: Cleanup.
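For reference, GNU as numeric local labels may be defined any number of
times: a branch to 1f targets the nearest following "1:", and 1b the
nearest preceding one, so the labels carry no names to keep track of.
A minimal sketch of the pattern the patch switches to (illustrative
only, not code from the patch):

	cmp	count, 8
	b.lo	1f		/* forward branch to the nearest following "1:" */
	/* ... handle 8..16 bytes ... */
	ret
1:
	tbz	count, 2, 1f	/* "1:" may be redefined; this targets the next one */
	/* ... handle 4..7 bytes ... */
	ret
1:
	ret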
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index b2215c1..b55dd52 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -352,7 +352,7 @@ L(bytes_32_to_48):
/* Small copies: 0..16 bytes. */
L(memcopy16):
cmp count, 8
- b.lo L(bytes_0_to_8)
+ b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
add dstend, dstin, count
@@ -361,8 +361,8 @@ L(memcopy16):
ret
.p2align 4
-L(bytes_0_to_8):
- tbz count, 2, L(bytes_0_to_3)
+1:
+ tbz count, 2, 1f
ldr A_lw, [src]
ldr A_hw, [srcend, -4]
add dstend, dstin, count
@@ -372,8 +372,8 @@ L(bytes_0_to_8):
/* Copy 0..3 bytes. Use a branchless sequence that copies the same
byte 3 times if count==1, or the 2nd byte twice if count==2. */
-L(bytes_0_to_3):
- cbz count, L(end)
+1:
+ cbz count, 1f
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb A_hw, [srcend, -1]
@@ -382,7 +382,8 @@ L(bytes_0_to_3):
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb A_hw, [dstend, -1]
-L(end): ret
+1:
+ ret
.p2align 4
@@ -404,7 +405,7 @@ L(memcpy_copy96):
after tmp [0..15] gets added to it,
count now is <bytes-left-to-load>+48 */
cmp count, 80
- b.gt L(copy96_medium)
+ b.gt 1f
ldr D_q, [src, 32]
stp B_q, C_q, [dst, 16]
str E_q, [dstend, -16]
@@ -412,17 +413,17 @@ L(memcpy_copy96):
ret
.p2align 4
-L(copy96_medium):
+1:
ldp D_q, A_q, [src, 32]
str B_q, [dst, 16]
cmp count, 96
- b.gt L(copy96_large)
+ b.gt 1f
str E_q, [dstend, -16]
stp C_q, D_q, [dst, 32]
str A_q, [dst, 64]
ret
-L(copy96_large):
+1:
ldr F_q, [src, 64]
stp C_q, D_q, [dst, 32]
str E_q, [dstend, -16]
@@ -557,17 +558,9 @@ L(ext_size_ ## shft):;\
ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\
subs count, count, 32;\
- b.ge 2f;\
+ b.lt 2f;\
1:;\
stp A_q, B_q, [dst], #32;\
- ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
- ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- stp H_q, I_q, [dst], #16;\
- add dst, dst, tmp1;\
- str G_q, [dst], #16;\
- b L(copy_long_check32);\
-2:;\
- stp A_q, B_q, [dst], #32;\
prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
ldp D_q, J_q, [src], #32;\
ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
@@ -579,8 +572,15 @@ L(ext_size_ ## shft):;\
ext B_v.16b, D_v.16b, J_v.16b, 16-shft;\
mov E_v.16b, J_v.16b;\
subs count, count, 64;\
- b.ge 2b;\
- b 1b;\
+ b.ge 1b;\
+2:;\
+ stp A_q, B_q, [dst], #32;\
+ ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+ ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+ stp H_q, I_q, [dst], #16;\
+ add dst, dst, tmp1;\
+ str G_q, [dst], #16;\
+ b L(copy_long_check32);\
EXT_CHUNK(1)
EXT_CHUNK(2)
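The net effect of the EXT_CHUNK reordering is that the common case falls
straight through into the 64-byte loop and iterates on a single backward
branch, while the tail is reached by one forward branch on entry or by
falling out of the loop. Schematically (heavily simplified; the real
macro ext-merges vector registers and issues prefetches):

	subs	count, count, 32
	b.lt	2f			/* rare case: go straight to the tail */
1:	/* hot loop: store 32 bytes, load and merge the next chunk */
	subs	count, count, 64
	b.ge	1b			/* common case: one taken branch per iteration */
2:	/* tail: flush the remaining merged data */
	b	L(copy_long_check32)

Previously the loop was entered with a taken forward branch (b.ge 2f)
and exited through an unconditional b 1b to the tail, so the rewrite
removes two taken branches from the common path.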