This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] aarch64: thunderx2 memcpy implementation cleanup and streamlining


Replaced named labels with numeric ones to reduce clutter.
Rewrote the branching scheme in the load-and-merge chunk so that
the most probable case falls through instead of branching.

ChangeLog:

	* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: Cleanup.

diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index b2215c1..b55dd52 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -352,7 +352,7 @@ L(bytes_32_to_48):
 	/* Small copies: 0..16 bytes.  */
 L(memcopy16):
 	cmp     count, 8
-	b.lo    L(bytes_0_to_8)
+	b.lo    1f
 	ldr     A_l, [src]
 	ldr     A_h, [srcend, -8]
 	add     dstend, dstin, count
@@ -361,8 +361,8 @@ L(memcopy16):
 	ret
 	.p2align 4
 
-L(bytes_0_to_8):
-	tbz     count, 2, L(bytes_0_to_3)
+1:
+	tbz     count, 2, 1f
 	ldr     A_lw, [src]
 	ldr     A_hw, [srcend, -4]
 	add     dstend, dstin, count
@@ -372,8 +372,8 @@ L(bytes_0_to_8):
 
 	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
 	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-L(bytes_0_to_3):
-	cbz     count, L(end)
+1:
+	cbz     count, 1f
 	lsr     tmp1, count, 1
 	ldrb    A_lw, [src]
 	ldrb    A_hw, [srcend, -1]
@@ -382,7 +382,8 @@ L(bytes_0_to_3):
 	strb    A_lw, [dstin]
 	strb    B_lw, [dstin, tmp1]
 	strb    A_hw, [dstend, -1]
-L(end): ret
+1:
+	ret
 
 	.p2align 4
 
@@ -404,7 +405,7 @@ L(memcpy_copy96):
 	   after tmp [0..15] gets added to it,
 	   count now is <bytes-left-to-load>+48 */
 	cmp     count, 80
-	b.gt    L(copy96_medium)
+	b.gt    1f
 	ldr     D_q, [src, 32]
 	stp     B_q, C_q, [dst, 16]
 	str     E_q, [dstend, -16]
@@ -412,17 +413,17 @@ L(memcpy_copy96):
 	ret
 
 	.p2align 4
-L(copy96_medium):
+1:
 	ldp     D_q, A_q, [src, 32]
 	str     B_q, [dst, 16]
 	cmp     count, 96
-	b.gt    L(copy96_large)
+	b.gt    1f
 	str     E_q, [dstend, -16]
 	stp     C_q, D_q, [dst, 32]
 	str     A_q, [dst, 64]
 	ret
 
-L(copy96_large):
+1:
 	ldr     F_q, [src, 64]
 	stp     C_q, D_q, [dst, 32]
 	str     E_q, [dstend, -16]
@@ -557,17 +558,9 @@ L(ext_size_ ## shft):;\
 	ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ext     B_v.16b, D_v.16b, E_v.16b, 16-shft;\
 	subs    count, count, 32;\
-	b.ge    2f;\
+	b.lt    2f;\
 1:;\
 	stp     A_q, B_q, [dst], #32;\
-	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	stp     H_q, I_q, [dst], #16;\
-	add     dst, dst, tmp1;\
-	str     G_q, [dst], #16;\
-	b       L(copy_long_check32);\
-2:;\
-	stp     A_q, B_q, [dst], #32;\
 	prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
 	ldp     D_q, J_q, [src], #32;\
 	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
@@ -579,8 +572,15 @@ L(ext_size_ ## shft):;\
 	ext     B_v.16b, D_v.16b, J_v.16b, 16-shft;\
 	mov     E_v.16b, J_v.16b;\
 	subs    count, count, 64;\
-	b.ge    2b;\
-	b	1b;\
+	b.ge    1b;\
+2:;\
+	stp     A_q, B_q, [dst], #32;\
+	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+	stp     H_q, I_q, [dst], #16;\
+	add     dst, dst, tmp1;\
+	str     G_q, [dst], #16;\
+	b       L(copy_long_check32);\
 
 EXT_CHUNK(1)
 EXT_CHUNK(2)

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]