This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH v3] aarch64: thunderx2 memcpy branches reordering
- From: Anton Youdkevitch <anton dot youdkevitch at bell-sw dot com>
- To: Wilco Dijkstra <Wilco dot Dijkstra at arm dot com>, libc-alpha at sourceware dot org
- Date: Fri, 22 Mar 2019 23:14:54 +0300
- Subject: [PATCH v3] aarch64: thunderx2 memcpy branches reordering
The "ext" chunk changes are:
1. Always taken conditional branch at the beginning is
removed.
2. Epilogue code is placed after the end of the loop to
reduce the number of branches.
3. The redundant "mov" instruction inside the loop is
removed.
4. Invariant code in the loop epilogue is no more
repeated for each chunk.
Running "make check" showed no regressions.
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index b2215c1..f53bc2a 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -382,7 +382,8 @@ L(bytes_0_to_3):
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb A_hw, [dstend, -1]
-L(end): ret
+L(end):
+ ret
.p2align 4
@@ -544,6 +545,7 @@ L(dst_unaligned):
str C_q, [dst], #16
ldp F_q, G_q, [src], #32
bic dst, dst, 15
+ subs count, count, 32
adrp tmp2, L(ext_table)
add tmp2, tmp2, :lo12:L(ext_table)
add tmp2, tmp2, tmp1, LSL #2
@@ -556,31 +558,24 @@ L(dst_unaligned):
L(ext_size_ ## shft):;\
ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\
- subs count, count, 32;\
- b.ge 2f;\
1:;\
stp A_q, B_q, [dst], #32;\
- ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
- ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- stp H_q, I_q, [dst], #16;\
- add dst, dst, tmp1;\
- str G_q, [dst], #16;\
- b L(copy_long_check32);\
-2:;\
- stp A_q, B_q, [dst], #32;\
prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
- ldp D_q, J_q, [src], #32;\
+ ldp C_q, D_q, [src], #32;\
ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- mov C_v.16b, G_v.16b;\
stp H_q, I_q, [dst], #32;\
+ ext A_v.16b, G_v.16b, C_v.16b, 16-shft;\
ldp F_q, G_q, [src], #32;\
- ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
- ext B_v.16b, D_v.16b, J_v.16b, 16-shft;\
- mov E_v.16b, J_v.16b;\
+ ext B_v.16b, C_v.16b, D_v.16b, 16-shft;\
+ mov E_v.16b, D_v.16b;\
subs count, count, 64;\
- b.ge 2b;\
- b 1b;\
+ b.ge 1b;\
+2:;\
+ stp A_q, B_q, [dst], #32;\
+ ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+ ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+ b L(ext_tail);
EXT_CHUNK(1)
EXT_CHUNK(2)
@@ -598,6 +593,13 @@ EXT_CHUNK(13)
EXT_CHUNK(14)
EXT_CHUNK(15)
+L(ext_tail):
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+
+
END (MEMCPY)
.section .rodata
.p2align 4