This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch master updated. glibc-2.29.9000-183-g94e358f


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  94e358f6d490650c714edb1ffc3a52f56ffe086e (commit)
      from  f82ed45d7f77838bc8cff4c0a4ff33e76bb18a35 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=94e358f6d490650c714edb1ffc3a52f56ffe086e

commit 94e358f6d490650c714edb1ffc3a52f56ffe086e
Author: Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
Date:   Fri Apr 5 13:59:54 2019 -0700

    aarch64: thunderx2 memcpy implementation cleanup and streamlining
    
    Here is the updated patch for improving the long unaligned
    code path (the one using "ext" instruction).
    
    1. Always taken conditional branch at the beginning is
    removed.
    
    2. Epilogue code is placed after the end of the loop to
    reduce the number of branches.
    
    3. The redundant "mov" instructions inside the loop are
    gone due to the changed order of the registers in the "ext"
    instructions inside the loop,  the prologue has additional
    "ext" instruction.
    
    4.Updating count in the prologue was hoisted out as
    it is the same update for each prologue.
    
    5. Invariant code of the loop epilogue was hoisted out.
    
    6. As the current size of the ext chunk is exactly 16
    instructions long "nop" was added at the beginning
    of the code sequence so that the loop entry for all the
    chunks be aligned.
    
    	* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: Cleanup branching
    	and remove redundant code.

diff --git a/ChangeLog b/ChangeLog
index b00c783..5ad8875 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2019-04-05  Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
+
+	* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: Cleanup branching
+	and remove redundant code.
+
 2019-04-04  Adhemerval Zanella  <adhemerval.zanella@linaro.org>
 
 	* sysdeps/powerpc/Makefile [$(subdir) == wcsmbs] (CFLAGS-wcsrchr.c):
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index b2215c1..45e9a29 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -382,7 +382,8 @@ L(bytes_0_to_3):
 	strb    A_lw, [dstin]
 	strb    B_lw, [dstin, tmp1]
 	strb    A_hw, [dstend, -1]
-L(end): ret
+L(end):
+	ret
 
 	.p2align 4
 
@@ -544,43 +545,35 @@ L(dst_unaligned):
 	str     C_q, [dst], #16
 	ldp     F_q, G_q, [src], #32
 	bic	dst, dst, 15
+	subs    count, count, 32
 	adrp	tmp2, L(ext_table)
 	add	tmp2, tmp2, :lo12:L(ext_table)
 	add	tmp2, tmp2, tmp1, LSL #2
 	ldr	tmp3w, [tmp2]
 	add	tmp2, tmp2, tmp3w, SXTW
 	br	tmp2
-
-#define EXT_CHUNK(shft) \
 .p2align 4 ;\
+	nop
+#define EXT_CHUNK(shft) \
 L(ext_size_ ## shft):;\
 	ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ext     B_v.16b, D_v.16b, E_v.16b, 16-shft;\
-	subs    count, count, 32;\
-	b.ge    2f;\
-1:;\
-	stp     A_q, B_q, [dst], #32;\
 	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	stp     H_q, I_q, [dst], #16;\
-	add     dst, dst, tmp1;\
-	str     G_q, [dst], #16;\
-	b       L(copy_long_check32);\
-2:;\
+1:;\
 	stp     A_q, B_q, [dst], #32;\
 	prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
-	ldp     D_q, J_q, [src], #32;\
-	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+	ldp     C_q, D_q, [src], #32;\
 	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	mov     C_v.16b, G_v.16b;\
 	stp     H_q, I_q, [dst], #32;\
+	ext     A_v.16b, G_v.16b, C_v.16b, 16-shft;\
+	ext     B_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ldp     F_q, G_q, [src], #32;\
-	ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
-	ext     B_v.16b, D_v.16b, J_v.16b, 16-shft;\
-	mov     E_v.16b, J_v.16b;\
+	ext     H_v.16b, D_v.16b, F_v.16b, 16-shft;\
 	subs    count, count, 64;\
-	b.ge    2b;\
-	b	1b;\
+	b.ge    1b;\
+2:;\
+	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+	b	L(ext_tail);
 
 EXT_CHUNK(1)
 EXT_CHUNK(2)
@@ -598,6 +591,14 @@ EXT_CHUNK(13)
 EXT_CHUNK(14)
 EXT_CHUNK(15)
 
+L(ext_tail):
+	stp     A_q, B_q, [dst], #32
+	stp     H_q, I_q, [dst], #16
+	add     dst, dst, tmp1
+	str     G_q, [dst], #16
+	b       L(copy_long_check32)
+
+
 END (MEMCPY)
 	.section	.rodata
 	.p2align	4

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                    |    5 +++
 sysdeps/aarch64/multiarch/memcpy_thunderx2.S |   43 +++++++++++++------------
 2 files changed, 27 insertions(+), 21 deletions(-)


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]