This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch master updated. glibc-2.29.9000-183-g94e358f
- From: sje at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 5 Apr 2019 21:02:08 -0000
- Subject: GNU C Library master sources branch master updated. glibc-2.29.9000-183-g94e358f
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 94e358f6d490650c714edb1ffc3a52f56ffe086e (commit)
from f82ed45d7f77838bc8cff4c0a4ff33e76bb18a35 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=94e358f6d490650c714edb1ffc3a52f56ffe086e
commit 94e358f6d490650c714edb1ffc3a52f56ffe086e
Author: Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
Date: Fri Apr 5 13:59:54 2019 -0700
aarch64: thunderx2 memcpy implementation cleanup and streamlining
Here is the updated patch for improving the long unaligned
code path (the one using "ext" instruction).
1. The always-taken conditional branch at the beginning is
removed.
2. Epilogue code is placed after the end of the loop to
reduce the number of branches.
3. The redundant "mov" instructions inside the loop are
gone due to the changed order of the registers in the "ext"
instructions inside the loop; the prologue has an additional
"ext" instruction.
4. Updating count in the prologue was hoisted out as
it is the same update for each prologue.
5. Invariant code of the loop epilogue was hoisted out.
6. As the current size of the ext chunk is exactly 16
instructions long, a "nop" was added at the beginning
of the code sequence so that the loop entry for all the
chunks is aligned.
* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: Cleanup branching
and remove redundant code.
diff --git a/ChangeLog b/ChangeLog
index b00c783..5ad8875 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2019-04-05 Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
+
+ * sysdeps/aarch64/multiarch/memcpy_thunderx2.S: Cleanup branching
+ and remove redundant code.
+
2019-04-04 Adhemerval Zanella <adhemerval.zanella@linaro.org>
* sysdeps/powerpc/Makefile [$(subdir) == wcsmbs] (CFLAGS-wcsrchr.c):
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index b2215c1..45e9a29 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -382,7 +382,8 @@ L(bytes_0_to_3):
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb A_hw, [dstend, -1]
-L(end): ret
+L(end):
+ ret
.p2align 4
@@ -544,43 +545,35 @@ L(dst_unaligned):
str C_q, [dst], #16
ldp F_q, G_q, [src], #32
bic dst, dst, 15
+ subs count, count, 32
adrp tmp2, L(ext_table)
add tmp2, tmp2, :lo12:L(ext_table)
add tmp2, tmp2, tmp1, LSL #2
ldr tmp3w, [tmp2]
add tmp2, tmp2, tmp3w, SXTW
br tmp2
-
-#define EXT_CHUNK(shft) \
.p2align 4 ;\
+ nop
+#define EXT_CHUNK(shft) \
L(ext_size_ ## shft):;\
ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\
- subs count, count, 32;\
- b.ge 2f;\
-1:;\
- stp A_q, B_q, [dst], #32;\
ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
- ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- stp H_q, I_q, [dst], #16;\
- add dst, dst, tmp1;\
- str G_q, [dst], #16;\
- b L(copy_long_check32);\
-2:;\
+1:;\
stp A_q, B_q, [dst], #32;\
prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
- ldp D_q, J_q, [src], #32;\
- ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+ ldp C_q, D_q, [src], #32;\
ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- mov C_v.16b, G_v.16b;\
stp H_q, I_q, [dst], #32;\
+ ext A_v.16b, G_v.16b, C_v.16b, 16-shft;\
+ ext B_v.16b, C_v.16b, D_v.16b, 16-shft;\
ldp F_q, G_q, [src], #32;\
- ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
- ext B_v.16b, D_v.16b, J_v.16b, 16-shft;\
- mov E_v.16b, J_v.16b;\
+ ext H_v.16b, D_v.16b, F_v.16b, 16-shft;\
subs count, count, 64;\
- b.ge 2b;\
- b 1b;\
+ b.ge 1b;\
+2:;\
+ ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+ b L(ext_tail);
EXT_CHUNK(1)
EXT_CHUNK(2)
@@ -598,6 +591,14 @@ EXT_CHUNK(13)
EXT_CHUNK(14)
EXT_CHUNK(15)
+L(ext_tail):
+ stp A_q, B_q, [dst], #32
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+
+
END (MEMCPY)
.section .rodata
.p2align 4
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 5 +++
sysdeps/aarch64/multiarch/memcpy_thunderx2.S | 43 +++++++++++++------------
2 files changed, 27 insertions(+), 21 deletions(-)
hooks/post-receive
--
GNU C Library master sources