This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH v3] aarch64: thunderx2 memcpy branches reordering
- From: Anton Youdkevitch <anton dot youdkevitch at bell-sw dot com>
- To: Wilco Dijkstra <Wilco dot Dijkstra at arm dot com>, libc-alpha at sourceware dot org
- Date: Fri, 22 Mar 2019 23:14:54 +0300
- Subject: [PATCH v3] aarch64: thunderx2 memcpy branches reordering
The "ext" chunk changes are:
1. Always taken conditional branch at the beginning is
removed.
2. Epilogue code is placed after the end of the loop to
reduce the number of branches.
3. The redundant "mov" instruction inside the loop is
removed.
4. Invariant code in the loop epilogue is no more
repeated for each chunk.
Running "make check" showed no regressions.
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index b2215c1..f53bc2a 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -382,7 +382,8 @@ L(bytes_0_to_3):
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb A_hw, [dstend, -1]
-L(end): ret
+L(end):
+ ret
.p2align 4
@@ -544,6 +545,7 @@ L(dst_unaligned):
str C_q, [dst], #16
ldp F_q, G_q, [src], #32
bic dst, dst, 15
+ subs count, count, 32
adrp tmp2, L(ext_table)
add tmp2, tmp2, :lo12:L(ext_table)
add tmp2, tmp2, tmp1, LSL #2
@@ -556,31 +558,24 @@ L(dst_unaligned):
L(ext_size_ ## shft):;\
ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\
- subs count, count, 32;\
- b.ge 2f;\
1:;\
stp A_q, B_q, [dst], #32;\
- ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
- ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- stp H_q, I_q, [dst], #16;\
- add dst, dst, tmp1;\
- str G_q, [dst], #16;\
- b L(copy_long_check32);\
-2:;\
- stp A_q, B_q, [dst], #32;\
prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
- ldp D_q, J_q, [src], #32;\
+ ldp C_q, D_q, [src], #32;\
ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- mov C_v.16b, G_v.16b;\
stp H_q, I_q, [dst], #32;\
+ ext A_v.16b, G_v.16b, C_v.16b, 16-shft;\
ldp F_q, G_q, [src], #32;\
- ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
- ext B_v.16b, D_v.16b, J_v.16b, 16-shft;\
- mov E_v.16b, J_v.16b;\
+ ext B_v.16b, C_v.16b, D_v.16b, 16-shft;\
+ mov E_v.16b, D_v.16b;\
subs count, count, 64;\
- b.ge 2b;\
- b 1b;\
+ b.ge 1b;\
+2:;\
+ stp A_q, B_q, [dst], #32;\
+ ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+ ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+ b L(ext_tail);
EXT_CHUNK(1)
EXT_CHUNK(2)
@@ -598,6 +593,13 @@ EXT_CHUNK(13)
EXT_CHUNK(14)
EXT_CHUNK(15)
+L(ext_tail):
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+
+
END (MEMCPY)
.section .rodata
.p2align 4