This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch, master, updated. glibc-2.14-247-g5025581
- From: drepper at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 8 Sep 2011 01:55:03 -0000
- Subject: GNU C Library master sources branch, master, updated. glibc-2.14-247-g5025581
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 5025581e1c66a184a587ab1bd99cd168e8fb7770 (commit)
from a450513e1d51cb8fe46ba5ebd92399247060b980 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5025581e1c66a184a587ab1bd99cd168e8fb7770
commit 5025581e1c66a184a587ab1bd99cd168e8fb7770
Author: Will Schmidt <will_schmidt@vnet.ibm.com>
Date: Wed Sep 7 21:54:41 2011 -0400
power7 memcpy VSX optimizations
diff --git a/ChangeLog b/ChangeLog
index c90f2c7..429767d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2011-07-28 Will Schmidt <will_schmidt@vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc32/power7/memcpy.S: Optimize the
+ aligned copy for power7 with vector-scalar instructions.
+ * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
+
2011-07-24 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Simplify
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
index f0c332f..ec70557 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -1,5 +1,5 @@
/* Optimized memcpy implementation for PowerPC32/POWER7.
- Copyright (C) 2010 Free Software Foundation, Inc.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -116,24 +116,82 @@ L(copy_GE_32_aligned_cont):
stfd 6,0(3)
addi 10,3,8
+L(aligned_copy):
+ /* Main aligned copy loop. Copies up to 128-bytes at a time. */
.align 4
-4: /* Main aligned copy loop. Copies 32-bytes at a time. */
- lfd 6,0(11)
- lfd 7,8(11)
- lfd 8,16(11)
- lfd 0,24(11)
- addi 11,11,32
+4:
+ /* Check for any 32-byte or 64-byte lumps left over beyond a
+ multiple of 128 bytes. R8 contains the number of 32-byte
+ lumps, so move its low four bits into CR7 and use the SO/EQ bits
+ to handle the 32- or 64-byte lumps. Then handle the rest with an
+ unrolled 128-bytes-at-a-time copy loop. */
+ mtocrf 1,8
+ li 6,16 # 16() index
+ li 7,32 # 32() index
+ li 8,48 # 48() index
+
+L(aligned_32byte):
+ /* if the SO bit (indicating a 32-byte lump) is not set, move along. */
+ bns cr7,L(aligned_64byte)
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ addi 11,11,32
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ addi 10,10,32
+
+L(aligned_64byte):
+ /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
+ bne cr7,L(aligned_128setup)
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ addi 11,11,64
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ addi 10,10,64
+
+L(aligned_128setup):
+ /* Set up for the 128-byte at a time copy loop. */
+ srwi 8,31,7
+ cmpwi 8,0 # Any 4x lumps left?
+ beq 3f # if not, move along.
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ mtctr 8 # otherwise, load the ctr and begin.
+ li 8,48 # 48() index
+ b L(aligned_128loop)
+
+L(aligned_128head):
+ /* Reload the first two vectors for the 2nd and later iterations; the first pass preloads them in the setup code. */
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+L(aligned_128loop):
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ stxvd2x 6,0,10
+ addi 11,11,64
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ addi 10,10,64
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ addi 11,11,64
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ addi 10,10,64
+ bdnz L(aligned_128head)
- stfd 6,0(10)
- stfd 7,8(10)
- stfd 8,16(10)
- stfd 0,24(10)
- addi 10,10,32
- bdnz 4b
3:
-
/* Check for tail bytes. */
-
clrrwi 0,31,3
mtcrf 0x01,31
beq cr6,0f
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index 2e5beed..8aaef97 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -1,5 +1,5 @@
/* Optimized memcpy implementation for PowerPC64/POWER7.
- Copyright (C) 2010 Free Software Foundation, Inc.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -115,23 +115,81 @@ L(copy_GE_32_aligned_cont):
std 6,0(3)
addi 10,3,8
- /* Main aligned copy loop. Copies 32-bytes at a time. */
+L(aligned_copy):
+ /* Main aligned copy loop. Copies up to 128-bytes at a time. */
.align 4
4:
- ld 6,0(11)
- ld 7,8(11)
- ld 8,16(11)
- ld 0,24(11)
- addi 11,11,32
+ /* Check for any 32-byte or 64-byte lumps left over beyond a
+ multiple of 128 bytes. R8 contains the number of 32-byte
+ lumps, so move its low four bits into CR7 and use the SO/EQ bits
+ to handle the 32- or 64-byte lumps. Then handle the rest with an
+ unrolled 128-bytes-at-a-time copy loop. */
+ mtocrf 1,8
+ li 6,16 # 16() index
+ li 7,32 # 32() index
+ li 8,48 # 48() index
+
+L(aligned_32byte):
+ /* if the SO bit (indicating a 32-byte lump) is not set, move along. */
+ bns cr7,L(aligned_64byte)
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ addi 11,11,32
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ addi 10,10,32
+
+L(aligned_64byte):
+ /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
+ bne cr7,L(aligned_128setup)
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ addi 11,11,64
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ addi 10,10,64
+
+L(aligned_128setup):
+ /* Set up for the 128-byte at a time copy loop. */
+ srdi 8,31,7
+ cmpdi 8,0 # Any 4x lumps left?
+ beq 3f # if not, move along.
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ mtctr 8 # otherwise, load the ctr and begin.
+ li 8,48 # 48() index
+ b L(aligned_128loop)
+
+L(aligned_128head):
+ /* Reload the first two vectors for the 2nd and later iterations; the first pass preloads them in the setup code. */
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+L(aligned_128loop):
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ stxvd2x 6,0,10
+ addi 11,11,64
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ addi 10,10,64
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ addi 11,11,64
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ addi 10,10,64
+ bdnz L(aligned_128head)
- std 6,0(10)
- std 7,8(10)
- std 8,16(10)
- std 0,24(10)
- addi 10,10,32
- bdnz 4b
3:
-
/* Check for tail bytes. */
rldicr 0,31,0,60
mtcrf 0x01,31
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 6 ++
sysdeps/powerpc/powerpc32/power7/memcpy.S | 90 +++++++++++++++++++++++-----
sysdeps/powerpc/powerpc64/power7/memcpy.S | 88 +++++++++++++++++++++++-----
3 files changed, 153 insertions(+), 31 deletions(-)
hooks/post-receive
--
GNU C Library master sources