This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] PowerPC: stpcpy optimization for PPC64/POWER7


On 17-09-2013 03:15, Alan Modra wrote:
> On Mon, Sep 16, 2013 at 11:30:06AM -0300, Adhemerval Zanella wrote:
>> 	* sysdeps/powerpc/powerpc64/power7/stpcpy.S: New file.
> Please don't add new files that lack little-endian support.
>
Fair enough, I have added LE support to both the default implementation and
the POWER7 one.

---

2013-09-18  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>

        * sysdeps/powerpc/powerpc64/stpcpy.S (__stpcpy): Add doubleword read
	and write to provide a boost for large inputs. Also fix little endian
	issues.
	* sysdeps/powerpc/powerpc64/power7/stpcpy.S: New file.

--

diff --git a/sysdeps/powerpc/powerpc64/power7/stpcpy.S b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
new file mode 100644
index 0000000..65ff6a0
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
@@ -0,0 +1,196 @@
+/* Optimized stpcpy implementation for PowerPC64/POWER7.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Nulls are found with cmpb against a zero register (see also strlen.s).  */
+
+/* char * [r3] stpcpy (char *dest [r3], const char *src [r4])  */
+
+	.machine  power7
+EALIGN (__stpcpy, 4, 0)
+	CALL_MCOUNT 2
+
+#define rTMP	r0	/* scratch: alignment bits, cmpb result, extracted byte */
+#define rRTN	r3	/* pointer to previous word/doubleword in dest */
+#define rSRC	r4	/* pointer to previous word/doubleword in src */
+#define rMASK	r5	/* always 0: cmpb comparand used to flag null bytes */
+#define rWORD	r6	/* current word from src */
+#define rALT	r7	/* alternate word from src */
+
+	or	rTMP, rSRC, rRTN	/* Merge the low bits of both pointers.  */
+	clrldi.	rTMP, rTMP, 61		/* Low 3 bits: 0 iff both are 8-byte aligned.  */
+	bne	L(check_word_alignment)
+
+/* Both pointers are doubleword aligned: use doubleword loads and stores.  */
+	li	rMASK, 0
+	addi	rRTN, rRTN, -8		/* Pre-bias so the first stdu/stbu hits dest[0].  */
+	ld	rWORD, 0(rSRC)
+	b	L(g2)
+
+	.align 4
+L(g0):	ldu	rALT, 8(rSRC)		/* Loop is unrolled twice; rWORD/rALT alternate.  */
+	stdu	rWORD, 8(rRTN)		/* rWORD was already checked: it holds no null.  */
+	cmpb	rTMP, rALT, rMASK
+	cmpdi	rTMP, 0
+	bne	L(g1)
+	ldu	rWORD, 8(rSRC)
+	stdu	rALT, 8(rRTN)
+L(g2):	cmpb	rTMP, rWORD, rMASK
+	cmpdi	rTMP, 0		/* If rTMP is 0, no null bytes have been found.  */
+	beq	L(g0)
+
+	mr	rALT, rWORD		/* The tail code expects the last doubleword in rALT.  */
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+	extrdi.	rTMP, rALT, 8, 56	/* LE: lowest-order byte is first in memory.  */
+	stbu	rALT, 8(rRTN)		/* stbu writes the low byte of rALT.  */
+	beqlr-				/* Null just stored: rRTN points at it.  */
+	extrdi.	rTMP, rALT, 8, 48
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 40
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 32
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 24
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 16
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 8
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi	rTMP, rALT, 8, 0	/* Seven non-null bytes: the eighth is the null.  */
+	stbu	rTMP, 1(rRTN)
+#else
+	extrdi.	rTMP, rALT, 8, 0	/* BE: highest-order byte is first in memory.  */
+	stbu	rTMP, 8(rRTN)
+	beqlr				/* Null just stored: rRTN points at it.  */
+	extrdi.	rTMP, rALT, 8, 8
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 16
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 24
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 32
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 40
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 48
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	stbu	rALT, 1(rRTN)		/* Seven non-null bytes: the low byte is the null.  */
+#endif
+	blr
+
+L(check_word_alignment):
+	clrldi. rTMP, rTMP, 62		/* Low 2 bits: 0 iff both are 4-byte aligned.  */
+	bne     L(unaligned)
+
+/* Both pointers are word aligned: use word loads and
+   stores.  */
+	li	rMASK, 0
+	addi	rRTN, rRTN, -4		/* Pre-bias so the first stwu/stbu hits dest[0].  */
+	lwz	rWORD, 0(rSRC)
+	b	L(g5)
+
+	.align	4
+L(g3):	lwzu	rALT, 4(rSRC)		/* Loop is unrolled twice; rWORD/rALT alternate.  */
+	stwu	rWORD, 4(rRTN)		/* rWORD was already checked: it holds no null.  */
+	cmpb	rTMP, rALT, rMASK
+	cmpwi	rTMP, 0			/* Only the low 32 bits hold string bytes.  */
+	bne	L(g4)
+	lwzu	rWORD, 4(rSRC)
+	stwu	rALT, 4(rRTN)
+L(g5):	cmpb	rTMP, rWORD, rMASK
+	cmpwi	rTMP, 0		/* If low word of rTMP is 0, no null in word.  */
+	beq	L(g3)
+
+        mr      rALT, rWORD	/* The tail code expects the last word in rALT.  */
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(g4):
+#ifdef __LITTLE_ENDIAN__
+	rlwinm.	rTMP, rALT, 0, 24, 31	/* LE: lowest-order byte is first in memory.  */
+	stbu	rALT, 4(rRTN)		/* stbu writes the low byte of rALT.  */
+	beqlr-				/* Null just stored: rRTN points at it.  */
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm	rTMP, rALT, 8, 24, 31	/* Three non-null bytes: the fourth is the null.  */
+	stbu	rTMP, 1(rRTN)
+#else
+	rlwinm. rTMP, rALT, 8, 24, 31	/* BE: highest-order byte is first in memory.  */
+        stbu    rTMP, 4(rRTN)
+        beqlr			/* Null just stored: rRTN points at it.  */
+        rlwinm. rTMP, rALT, 16, 24, 31
+        stbu    rTMP, 1(rRTN)
+        beqlr
+        rlwinm. rTMP, rALT, 24, 24, 31
+        stbu    rTMP, 1(rRTN)
+        beqlr
+        stbu    rALT, 1(rRTN)	/* Three non-null bytes: the low byte is the null.  */
+#endif
+        blr
+
+/* At least one pointer is not word aligned: do a byte-by-byte copy.  */
+	.align	4
+L(unaligned):
+	lbz	rWORD, 0(rSRC)
+	addi	rRTN, rRTN, -1		/* Pre-bias so the first stbu hits dest[0].  */
+	cmpdi	rWORD, 0
+	beq	L(u2)			/* Empty string: only the null is stored.  */
+
+	.align 	5
+L(u0):	lbzu	rALT, 1(rSRC)		/* Loop is unrolled 4x; rWORD/rALT alternate.  */
+	stbu	rWORD, 1(rRTN)		/* Store the previous, non-null byte.  */
+	cmpdi	rALT, 0
+	beq	L(u1)
+	lbzu	rWORD, 1(rSRC)
+	stbu	rALT, 1(rRTN)
+	cmpdi	rWORD, 0
+	beq	L(u2)
+	lbzu	rALT, 1(rSRC)
+	stbu	rWORD, 1(rRTN)
+	cmpdi	rALT, 0
+	beq	L(u1)
+	lbzu	rWORD, 1(rSRC)
+	stbu	rALT, 1(rRTN)
+	cmpdi	rWORD, 0
+	bne	L(u0)
+L(u2):	stbu	rWORD, 1(rRTN)		/* Terminating null is in rWORD; rRTN ends on it.  */
+	blr
+L(u1):	stbu	rALT, 1(rRTN)		/* Terminating null is in rALT; rRTN ends on it.  */
+	blr
+END (__stpcpy)
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/powerpc/powerpc64/stpcpy.S b/sysdeps/powerpc/powerpc64/stpcpy.S
index 070cd46..4ea7d5a 100644
--- a/sysdeps/powerpc/powerpc64/stpcpy.S
+++ b/sysdeps/powerpc/powerpc64/stpcpy.S
@@ -26,35 +26,39 @@ EALIGN (__stpcpy, 4, 0)
 	CALL_MCOUNT 2
 
 #define rTMP	r0
-#define rRTN	r3
-#define rDEST	r3		/* pointer to previous word in dest */
-#define rSRC	r4		/* pointer to previous word in src */
-#define rWORD	r6		/* current word from src */
-#define rFEFE	r7		/* 0xfefefeff */
-#define r7F7F	r8		/* 0x7f7f7f7f */
-#define rNEG	r9		/* ~(word in src | 0x7f7f7f7f) */
-#define rALT	r10		/* alternate word from src */
-
-	or	rTMP, rSRC, rDEST
-	clrldi.	rTMP, rTMP, 62
-	addi	rDEST, rDEST, -4
-	bne	L(unaligned)
+#define rRTN	r3	/* pointer to previous word/doubleword in dest */
+#define rSRC	r4	/* pointer to previous word/doubleword in src */
+#define rWORD	r6	/* current word from src */
+#define rFEFE	r7	/* constant 0xfefefeff | 0xfefefefefefefeff */
+#define r7F7F	r8	/* constant 0x7f7f7f7f | 0x7f7f7f7f7f7f7f7f */
+#define rNEG	r9	/* ~(word in s1 | r7F7F) */
+#define rALT	r10	/* alternate word from src */
+
+	or	rTMP, rSRC, rRTN
+	clrldi.	rTMP, rTMP, 61
+	bne	L(check_word_alignment)
+
+/* For doubleword aligned memory, operate using doubleword load and stores.  */
+	addi	rRTN, rRTN, -8
 
 	lis	rFEFE, -0x101
 	lis	r7F7F, 0x7f7f
-	lwz	rWORD, 0(rSRC)
+	ld	rWORD, 0(rSRC)
 	addi	rFEFE, rFEFE, -0x101
 	addi	r7F7F, r7F7F, 0x7f7f
+	sldi	rTMP, rFEFE, 32
+	insrdi	r7F7F, r7F7F, 32, 0
+	add	rFEFE, rFEFE, rTMP
 	b	L(g2)
 
-L(g0):	lwzu	rALT, 4(rSRC)
-	stwu	rWORD, 4(rDEST)
+L(g0):	ldu	rALT, 8(rSRC)
+	stdu	rWORD, 8(rRTN)
 	add	rTMP, rFEFE, rALT
 	nor	rNEG, r7F7F, rALT
 	and.	rTMP, rTMP, rNEG
 	bne-	L(g1)
-	lwzu	rWORD, 4(rSRC)
-	stwu	rALT, 4(rDEST)
+	ldu	rWORD, 8(rSRC)
+	stdu	rALT, 8(rRTN)
 L(g2):	add	rTMP, rFEFE, rWORD
 	nor	rNEG, r7F7F, rWORD
 	and.	rTMP, rTMP, rNEG
@@ -62,16 +66,111 @@ L(g2):	add	rTMP, rFEFE, rWORD
 
 	mr	rALT, rWORD
 /* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
-	stbu	rTMP, 4(rDEST)
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+	extrdi.	rTMP, rALT, 8, 56
+	stbu	rALT, 8(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 48
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 40
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 32
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 24
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 16
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 8
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi	rTMP, rALT, 8, 0
+	stbu	rTMP, 1(rRTN)
+#else
+	extrdi.	rTMP, rALT, 8, 0
+	stbu	rTMP, 8(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 8
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 16
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 24
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 32
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 40
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 48
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	stbu	rALT, 1(rRTN)
+#endif
+	blr
+
+L(check_word_alignment):
+	clrldi. rTMP, rTMP, 62
+	bne     L(unaligned)
+
+/* For word aligned memory, operate using word load and stores.  */
+	addi	rRTN, rRTN, -4
+
+	lis	rFEFE, -0x101
+	lis	r7F7F, 0x7f7f
+	lwz	rWORD, 0(rSRC)
+	addi	rFEFE, rFEFE, -0x101
+	addi	r7F7F, r7F7F, 0x7f7f
+	b	L(g5)
+
+L(g3):	lwzu	rALT, 4(rSRC)
+	stwu	rWORD, 4(rRTN)
+	add	rTMP, rFEFE, rALT
+	nor	rNEG, r7F7F, rALT
+	and.	rTMP, rTMP, rNEG
+	bne-	L(g4)
+	lwzu	rWORD, 4(rSRC)
+	stwu	rALT, 4(rRTN)
+L(g5):	add	rTMP, rFEFE, rWORD
+	nor	rNEG, r7F7F, rWORD
+	and.	rTMP, rTMP, rNEG
+	beq+	L(g3)
+
+	mr	rALT, rWORD
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(g4):
+#ifdef __LITTLE_ENDIAN__
+	rlwinm.	rTMP, rALT, 0, 24, 31
+	stbu	rALT, 4(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm	rTMP, rALT, 8, 24, 31
+	stbu	rTMP, 1(rRTN)
+#else
+	rlwinm.	rTMP, rALT, 8, 24, 31
+	stbu	rTMP, 4(rRTN)
 	beqlr-
 	rlwinm.	rTMP, rALT, 16, 24, 31
-	stbu	rTMP, 1(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	rlwinm.	rTMP, rALT, 24, 24, 31
-	stbu	rTMP, 1(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
-	stbu	rALT, 1(rDEST)
+	stbu	rALT, 1(rRTN)
+#endif
 	blr
 
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
@@ -79,22 +178,22 @@ L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
 	nop
 L(unaligned):
 	lbz	rWORD, 0(rSRC)
-	addi	rDEST, rDEST, 3
+	addi	rRTN, rRTN, -1
 	cmpwi	rWORD, 0
 	beq-	L(u2)
 
 L(u0):	lbzu	rALT, 1(rSRC)
-	stbu	rWORD, 1(rDEST)
+	stbu	rWORD, 1(rRTN)
 	cmpwi	rALT, 0
 	beq-	L(u1)
 	nop		/* Let 601 load start of loop.  */
 	lbzu	rWORD, 1(rSRC)
-	stbu	rALT, 1(rDEST)
+	stbu	rALT, 1(rRTN)
 	cmpwi	rWORD, 0
 	bne+	L(u0)
-L(u2):	stbu	rWORD, 1(rDEST)
+L(u2):	stbu	rWORD, 1(rRTN)
 	blr
-L(u1):	stbu	rALT, 1(rDEST)
+L(u1):	stbu	rALT, 1(rRTN)
 	blr
 END (__stpcpy)
  


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]