This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH v3] PowerPC: stpcpy optimization for PPC64/POWER7


On Fri, 2013-10-04 at 12:31 -0300, Adhemerval Zanella wrote:
> This is a new version of this patch intended to unify both strcpy and stpcpy
> implementations for PPC64 and PPC64/POWER7. It is coded to adjust now that Alan
> has pushed his patches for LE. As before the idea for default implementation
> (sysdeps/powerpc/powerpc64/strcpy.S) is to provide both doubleword and word
> aligned memory access.
> 
> For PPC64/POWER7 the idea is to also provide doubleword and word memory access,
> remove the branch hints, use the cmpb instruction for compare doubleword/words,
> and add an optimization for inputs of same alignment.
> 
> The performance results based on benchtests are provided in attachments. Tested
> on PPC64 and PPC64/POWER7.


A few comments scattered about below on cosmetic issues.   Nothing of
real concern, this looks good to me.  

Thanks, 
-Will


> 
> ---
> 
> 2013-10-04  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
> 
> 	* sysdeps/powerpc/powerpc64/strcpy.S (strcpy): Add word load/store to
> 	provide a boost for large inputs with word alignment. Also fix little
> 	endian issues.
> 	* sysdeps/powerpc/powerpc64/stpcpy.S (__stpcpy): Rewrite
> 	implementation based on optimized PPC64 strcpy.
> 	* sysdeps/powerpc/powerpc64/power7/strcpy.S: New file: optimized
> 	strcpy for PPC64/POWER7 based on both doubleword and word load/store.
> 	* sysdeps/powerpc/powerpc64/power7/stpcpy.S: New file: optimized
> 	stpcpy for PPC64/POWER7 based on PPC64/POWER7 strcpy.

> 
> --
> 
> diff --git a/sysdeps/powerpc/powerpc64/power7/stpcpy.S b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
> new file mode 100644
> index 0000000..727dd06
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpcpy implementation for PowerPC64/POWER7.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define USE_AS_STPCPY
> +#include <sysdeps/powerpc/powerpc64/power7/strcpy.S>
> +
> +weak_alias (__stpcpy, stpcpy)
> +libc_hidden_def (__stpcpy)
> +libc_hidden_builtin_def (stpcpy)


Looks OK.


> diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S
> new file mode 100644
> index 0000000..5c341a1
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S
> @@ -0,0 +1,274 @@
> +/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER7.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Implements the function
> +
> +   char * [r3] strcpy (char *dest [r3], const char *src [r4])
> +
> +   or
> +
> +   char * [r3] stpcpy (char *dest [r3], const char *src [r4])
> +
> +   if USE_AS_STPCPY is defined. It tries to use aligned memory accesses
> +   when possible using the following algorithm:
> +
> +   if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0))
> +     goto aligned_doubleword_copy;
> +   if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0))
> +     goto aligned_word_copy;
> +   if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL))
> +     goto same_alignment;
> +   goto unaligned;
> +
> +   The aligned comparisons are made using cmpb instructions.  */
> +
> +#ifdef USE_AS_STPCPY
> +# define FUNC_NAME __stpcpy
> +#else
> +# define FUNC_NAME strcpy
> +#endif
> +
> +	.machine  power7
> +EALIGN (FUNC_NAME, 4, 0)
> +	CALL_MCOUNT 2
> +
> +#define rTMP	r0
> +#ifdef USE_AS_STPCPY
> +#define rRTN	r3	/* pointer to previous word/doubleword in dest */
> +#else
> +#define rRTN	r12	/* pointer to previous word/doubleword in dest */
> +#endif
> +#define rSRC	r4	/* pointer to previous word/doubleword in src */
> +#define rMASK	r5	/* mask 0xffffffff | 0xffffffffffffffff */
> +#define rWORD	r6	/* current word from src */
> +#define rALT	r7	/* alternate word from src */
> +#define rRTNAL	r8	/* alignment of return pointer */
> +#define rSRCAL	r9	/* alignment of source pointer */
> +#define rALCNT	r10	/* bytes to read to reach 8 bytes alignment */
> +#define rSUBAL	r11	/* doubleword minus unaligned displacement */
> +
> +#ifndef USE_AS_STPCPY
> +/* Save the dst pointer to use as return value.  */
> +	mr	rRTN, r3
> +#endif

Just to get rid of a #ifndef, I'd be tempted to change the above to just
	mr	rRTN, r3
and live with the "mr r3, r3" for the USE_AS_STPCPY case.

I don't feel strongly about it, but wanted to have more feedback than
'looks good' :-)

> +	or	rTMP, rSRC, rRTN
> +	clrldi.	rTMP, rTMP, 61
> +	bne	L(check_word_alignment)
> +	b	L(aligned_doubleword_copy)
> +
> +L(same_alignment):
> +/* Src and dst with same alignment: align both to doubleword.  */
> +	mr	rALCNT, rRTN
> +	lbz	rWORD, 0(rSRC)
> +	subfic	rSUBAL, rRTNAL, 8
> +	addi	rRTN, rRTN, 1
> +	addi	rSRC, rSRC, 1
> +	cmpdi	cr7, rWORD, 0
> +	stb	rWORD, 0(rALCNT)
> +	beq	cr7, L(s2)
> +
> +	add	rALCNT, rALCNT, rSUBAL
> +	subf	rALCNT, rRTN, rALCNT
> +	addi	rALCNT, rALCNT, 1
> +	mtctr	rALCNT
> +	b	L(s1)
> +
> +	.align 4
> +L(s0):
> +	addi	rSRC, rSRC, 1
> +	lbz	rWORD, -1(rSRC)
> +	cmpdi	cr7, rWORD, 0
> +	stb	rWORD, -1(rALCNT)
> +	beqlr	cr7
> +	mr	rRTN, rALCNT
> +L(s1):
> +	addi	rALCNT, rRTN,1
> +	bdnz	L(s0)
> +	b L(aligned_doubleword_copy)
> +	.align 4
> +L(s2):
> +	mr	rRTN, rALCNT
> +	blr
> +
> +/* For doubleword aligned memory, operate using doubleword load and stores.  */
> +	.align 4
> +L(aligned_doubleword_copy):
> +	li	rMASK, 0
> +	addi	rRTN, rRTN, -8
> +	ld	rWORD, 0(rSRC)
> +	b	L(g2)
> +
> +	.align 4
> +L(g0):	ldu	rALT, 8(rSRC)
> +	stdu	rWORD, 8(rRTN)
> +	cmpb	rTMP, rALT, rMASK
> +	cmpdi	rTMP, 0
> +	bne	L(g1)
> +	ldu	rWORD, 8(rSRC)
> +	stdu	rALT, 8(rRTN)
> +L(g2):	cmpb	rTMP, rWORD, rMASK
> +	cmpdi	rTMP, 0		/* If rTMP is 0, no null's have been found.  */
> +	beq	L(g0)
> +
> +	mr	rALT, rWORD
> +/* We've hit the end of the string.  Do the rest byte-by-byte.  */
> +L(g1):
> +#ifdef __LITTLE_ENDIAN__
> +	extrdi.	rTMP, rALT, 8, 56
> +	stbu	rALT, 8(rRTN)
> +	beqlr-
> +	extrdi.	rTMP, rALT, 8, 48
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	extrdi.	rTMP, rALT, 8, 40
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	extrdi.	rTMP, rALT, 8, 32
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	extrdi.	rTMP, rALT, 8, 24
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	extrdi.	rTMP, rALT, 8, 16
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	extrdi.	rTMP, rALT, 8, 8
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	extrdi	rTMP, rALT, 8, 0
> +	stbu	rTMP, 1(rRTN)
> +#else
> +	extrdi.	rTMP, rALT, 8, 0
> +	stbu	rTMP, 8(rRTN)
> +	beqlr
> +	extrdi.	rTMP, rALT, 8, 8
> +	stbu	rTMP, 1(rRTN)
> +	beqlr
> +	extrdi.	rTMP, rALT, 8, 16
> +	stbu	rTMP, 1(rRTN)
> +	beqlr
> +	extrdi.	rTMP, rALT, 8, 24
> +	stbu	rTMP, 1(rRTN)
> +	beqlr
> +	extrdi.	rTMP, rALT, 8, 32
> +	stbu	rTMP, 1(rRTN)
> +	beqlr
> +	extrdi.	rTMP, rALT, 8, 40
> +	stbu	rTMP, 1(rRTN)
> +	beqlr
> +	extrdi.	rTMP, rALT, 8, 48
> +	stbu	rTMP, 1(rRTN)
> +	beqlr
> +	stbu	rALT, 1(rRTN)
> +#endif
> +	blr
> +
> +L(check_word_alignment):
> +	clrldi. rTMP, rTMP, 62
> +	beq	L(aligned_word_copy)
> +	rldicl	rRTNAL, rRTN, 0, 61
> +	rldicl	rSRCAL, rSRC, 0, 61
> +	cmpld	cr7, rSRCAL, rRTNAL
> +	beq	cr7, L(same_alignment)
> +	b	L(unaligned)
> +
> +/* For word aligned memory, operate using word load and stores.  */
> +	.align	4
> +L(aligned_word_copy):
> +	li	rMASK, 0
> +	addi	rRTN, rRTN, -4
> +	lwz	rWORD, 0(rSRC)
> +	b	L(g5)
> +
> +	.align	4
> +L(g3):	lwzu	rALT, 4(rSRC)
> +	stwu	rWORD, 4(rRTN)
> +	cmpb	rTMP, rALT, rMASK
> +	cmpwi	rTMP, 0
> +	bne	L(g4)
> +	lwzu	rWORD, 4(rSRC)
> +	stwu	rALT, 4(rRTN)
> +L(g5):	cmpb	rTMP, rWORD, rMASK
> +	cmpwi	rTMP, 0		/* If rTMP is 0, no null in word.  */
> +	beq	L(g3)
> +
> +	mr      rALT, rWORD
> +/* We've hit the end of the string.  Do the rest byte-by-byte.  */
> +L(g4):
> +#ifdef __LITTLE_ENDIAN__
> +	rlwinm.	rTMP, rALT, 0, 24, 31
> +	stbu	rALT, 4(rRTN)
> +	beqlr-
> +	rlwinm.	rTMP, rALT, 24, 24, 31
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	rlwinm.	rTMP, rALT, 16, 24, 31
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	rlwinm	rTMP, rALT, 8, 24, 31
> +	stbu	rTMP, 1(rRTN)
> +#else
> +	rlwinm. rTMP, rALT, 8, 24, 31
> +	stbu    rTMP, 4(rRTN)
> +	beqlr
> +	rlwinm. rTMP, rALT, 16, 24, 31
> +	stbu    rTMP, 1(rRTN)
> +	beqlr
> +	rlwinm. rTMP, rALT, 24, 24, 31
> +	stbu    rTMP, 1(rRTN)
> +	beqlr
> +	stbu    rALT, 1(rRTN)
> +#endif
> +	blr
> +
> +/* Oh well.  In this case, we just do a byte-by-byte copy.  */
> +	.align	4
> +L(unaligned):
> +	lbz	rWORD, 0(rSRC)
> +	addi	rRTN, rRTN, -1
> +	cmpdi	rWORD, 0
> +	beq	L(u2)
> +
> +	.align 	5
> +L(u0):	lbzu	rALT, 1(rSRC)
> +	stbu	rWORD, 1(rRTN)
> +	cmpdi	rALT, 0
> +	beq	L(u1)
> +	lbzu	rWORD, 1(rSRC)
> +	stbu	rALT, 1(rRTN)
> +	cmpdi	rWORD, 0
> +	beq	L(u2)
> +	lbzu	rALT, 1(rSRC)
> +	stbu	rWORD, 1(rRTN)
> +	cmpdi	rALT, 0
> +	beq	L(u1)
> +	lbzu	rWORD, 1(rSRC)
> +	stbu	rALT, 1(rRTN)
> +	cmpdi	rWORD, 0
> +	bne	L(u0)
> +L(u2):	stbu	rWORD, 1(rRTN)
> +	blr
> +L(u1):	stbu	rALT, 1(rRTN)
> +	blr
> +END (FUNC_NAME)
> +
> +#ifndef USE_AS_STPCPY
> +libc_hidden_builtin_def (strcpy)
> +#endif

I'm not sure if there is style precedent; can or should the #ifndef
above be dropped, so that it reads 
	libc_hidden_builtin_def (FUNC_NAME)
And then remove the statement from the stpcpy.S that includes this file?

I suppose that would be a problem if we needed to mix
libc_hidden_builtin_def and libc_builtin_def incantations...  


Looks Ok. 

> diff --git a/sysdeps/powerpc/powerpc64/stpcpy.S b/sysdeps/powerpc/powerpc64/stpcpy.S
> index c0b3972..09aa3be 100644
> --- a/sysdeps/powerpc/powerpc64/stpcpy.S
> +++ b/sysdeps/powerpc/powerpc64/stpcpy.S
> @@ -16,103 +16,8 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
> 
> -#include <sysdep.h>
> -
> -/* See strlen.s for comments on how the end-of-string testing works.  */
> -
> -/* char * [r3] stpcpy (char *dest [r3], const char *src [r4])  */
> -
> -EALIGN (__stpcpy, 4, 0)
> -	CALL_MCOUNT 2
> -
> -#define rTMP	r0
> -#define rRTN	r3
> -#define rDEST	r3		/* pointer to previous word in dest */
> -#define rSRC	r4		/* pointer to previous word in src */
> -#define rWORD	r6		/* current word from src */
> -#define rFEFE	r7		/* 0xfefefeff */
> -#define r7F7F	r8		/* 0x7f7f7f7f */
> -#define rNEG	r9		/* ~(word in src | 0x7f7f7f7f) */
> -#define rALT	r10		/* alternate word from src */
> -
> -	or	rTMP, rSRC, rDEST
> -	clrldi.	rTMP, rTMP, 62
> -	addi	rDEST, rDEST, -4
> -	bne	L(unaligned)
> -
> -	lis	rFEFE, -0x101
> -	lis	r7F7F, 0x7f7f
> -	lwz	rWORD, 0(rSRC)
> -	addi	rFEFE, rFEFE, -0x101
> -	addi	r7F7F, r7F7F, 0x7f7f
> -	b	L(g2)
> -
> -L(g0):	lwzu	rALT, 4(rSRC)
> -	stwu	rWORD, 4(rDEST)
> -	add	rTMP, rFEFE, rALT
> -	nor	rNEG, r7F7F, rALT
> -	and.	rTMP, rTMP, rNEG
> -	bne-	L(g1)
> -	lwzu	rWORD, 4(rSRC)
> -	stwu	rALT, 4(rDEST)
> -L(g2):	add	rTMP, rFEFE, rWORD
> -	nor	rNEG, r7F7F, rWORD
> -	and.	rTMP, rTMP, rNEG
> -	beq+	L(g0)
> -
> -	mr	rALT, rWORD
> -/* We've hit the end of the string.  Do the rest byte-by-byte.  */
> -L(g1):
> -#ifdef __LITTLE_ENDIAN__
> -	rlwinm.	rTMP, rALT, 0, 24, 31
> -	stbu	rALT, 4(rDEST)
> -	beqlr-
> -	rlwinm.	rTMP, rALT, 24, 24, 31
> -	stbu	rTMP, 1(rDEST)
> -	beqlr-
> -	rlwinm.	rTMP, rALT, 16, 24, 31
> -	stbu	rTMP, 1(rDEST)
> -	beqlr-
> -	rlwinm	rTMP, rALT, 8, 24, 31
> -	stbu	rTMP, 1(rDEST)
> -	blr
> -#else
> -	rlwinm.	rTMP, rALT, 8, 24, 31
> -	stbu	rTMP, 4(rDEST)
> -	beqlr-
> -	rlwinm.	rTMP, rALT, 16, 24, 31
> -	stbu	rTMP, 1(rDEST)
> -	beqlr-
> -	rlwinm.	rTMP, rALT, 24, 24, 31
> -	stbu	rTMP, 1(rDEST)
> -	beqlr-
> -	stbu	rALT, 1(rDEST)
> -	blr
> -#endif
> -
> -/* Oh well.  In this case, we just do a byte-by-byte copy.  */
> -	.align 4
> -	nop
> -L(unaligned):
> -	lbz	rWORD, 0(rSRC)
> -	addi	rDEST, rDEST, 3
> -	cmpwi	rWORD, 0
> -	beq-	L(u2)
> -
> -L(u0):	lbzu	rALT, 1(rSRC)
> -	stbu	rWORD, 1(rDEST)
> -	cmpwi	rALT, 0
> -	beq-	L(u1)
> -	nop		/* Let 601 load start of loop.  */
> -	lbzu	rWORD, 1(rSRC)
> -	stbu	rALT, 1(rDEST)
> -	cmpwi	rWORD, 0
> -	bne+	L(u0)
> -L(u2):	stbu	rWORD, 1(rDEST)
> -	blr
> -L(u1):	stbu	rALT, 1(rDEST)
> -	blr
> -END (__stpcpy)
> +#define USE_AS_STPCPY
> +#include <sysdeps/powerpc/powerpc64/strcpy.S>

> 
>  weak_alias (__stpcpy, stpcpy)
>  libc_hidden_def (__stpcpy)

Ok.

> diff --git a/sysdeps/powerpc/powerpc64/strcpy.S b/sysdeps/powerpc/powerpc64/strcpy.S
> index a7fd85b..793325d 100644
> --- a/sysdeps/powerpc/powerpc64/strcpy.S
> +++ b/sysdeps/powerpc/powerpc64/strcpy.S
> @@ -22,25 +22,38 @@
> 
>  /* char * [r3] strcpy (char *dest [r3], const char *src [r4])  */
> 
> -EALIGN (strcpy, 4, 0)
> +#ifdef USE_AS_STPCPY
> +# define FUNC_NAME __stpcpy
> +#else
> +# define FUNC_NAME strcpy
> +#endif

> +
> +EALIGN (FUNC_NAME, 4, 0)
>  	CALL_MCOUNT 2
> 
>  #define rTMP	r0
> -#define rRTN	r3	/* incoming DEST arg preserved as result */
> -#define rSRC	r4	/* pointer to previous word in src */
> -#define rDEST	r5	/* pointer to previous word in dest */
> +#ifdef USE_AS_STPCPY
> +#define rRTN    r3      /* pointer to previous word/doubleword in dest */
> +#else
> +#define rRTN    r12     /* pointer to previous word/doubleword in dest */
> +#endif
> +#define rSRC	r4	/* pointer to previous word/doubleword in src */
>  #define rWORD	r6	/* current word from src */
> -#define rFEFE	r7	/* constant 0xfefefefefefefeff (-0x0101010101010101) */
> -#define r7F7F	r8	/* constant 0x7f7f7f7f7f7f7f7f */
> -#define rNEG	r9	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
> +#define rFEFE	r7	/* constant 0xfefefeff | 0xfefefefefefefeff */
> +#define r7F7F	r8	/* constant 0x7f7f7f7f | 0x7f7f7f7f7f7f7f7f */
> +#define rNEG	r9	/* ~(word in s1 | r7F7F) */
>  #define rALT	r10	/* alternate word from src */
> 
> -	dcbt	0,rSRC
> +#ifndef USE_AS_STPCPY
> +/* Save the dst pointer to use as return value.  */
> +	mr      rRTN, r3
> +#endif
>  	or	rTMP, rSRC, rRTN
>  	clrldi.	rTMP, rTMP, 61
> -	addi	rDEST, rRTN, -8
> -	dcbtst	0,rRTN
> -	bne	L(unaligned)
> +	bne	L(check_word_alignment)
> +
> +/* For doubleword aligned memory, operate using doubleword load and stores.  */
> +	addi	rRTN, rRTN, -8
> 
>  	lis	rFEFE, -0x101
>  	lis	r7F7F, 0x7f7f
> @@ -53,13 +66,13 @@ EALIGN (strcpy, 4, 0)
>  	b	L(g2)
> 
>  L(g0):	ldu	rALT, 8(rSRC)
> -	stdu	rWORD, 8(rDEST)
> +	stdu	rWORD, 8(rRTN)
>  	add	rTMP, rFEFE, rALT
>  	nor	rNEG, r7F7F, rALT
>  	and.	rTMP, rTMP, rNEG
>  	bne-	L(g1)
>  	ldu	rWORD, 8(rSRC)
> -	stdu	rALT, 8(rDEST)
> +	stdu	rALT, 8(rRTN)
>  L(g2):	add	rTMP, rFEFE, rWORD
>  	nor	rNEG, r7F7F, rWORD
>  	and.	rTMP, rTMP, rNEG
> @@ -70,77 +83,134 @@ L(g2):	add	rTMP, rFEFE, rWORD
>  L(g1):
>  #ifdef __LITTLE_ENDIAN__
>  	extrdi.	rTMP, rALT, 8, 56
> -	stb	rALT, 8(rDEST)
> +	stbu	rALT, 8(rRTN)
>  	beqlr-
>  	extrdi.	rTMP, rALT, 8, 48
> -	stb	rTMP, 9(rDEST)
> +	stbu	rTMP, 1(rRTN)
>  	beqlr-
>  	extrdi.	rTMP, rALT, 8, 40
> -	stb	rTMP, 10(rDEST)
> +	stbu	rTMP, 1(rRTN)
>  	beqlr-
>  	extrdi.	rTMP, rALT, 8, 32
> -	stb	rTMP, 11(rDEST)
> +	stbu	rTMP, 1(rRTN)
>  	beqlr-
>  	extrdi.	rTMP, rALT, 8, 24
> -	stb	rTMP, 12(rDEST)
> +	stbu	rTMP, 1(rRTN)
>  	beqlr-
>  	extrdi.	rTMP, rALT, 8, 16
> -	stb	rTMP, 13(rDEST)
> +	stbu	rTMP, 1(rRTN)
>  	beqlr-
>  	extrdi.	rTMP, rALT, 8, 8
> -	stb	rTMP, 14(rDEST)
> +	stbu	rTMP, 1(rRTN)
>  	beqlr-
>  	extrdi	rTMP, rALT, 8, 0
> -	stb	rTMP, 15(rDEST)
> -	blr
> +	stbu	rTMP, 1(rRTN)
>  #else
>  	extrdi.	rTMP, rALT, 8, 0
> -	stb	rTMP, 8(rDEST)
> +	stbu	rTMP, 8(rRTN)
>  	beqlr-
>  	extrdi.	rTMP, rALT, 8, 8
> -	stb	rTMP, 9(rDEST)
> +	stbu	rTMP, 1(rRTN)
>  	beqlr-
>  	extrdi.	rTMP, rALT, 8, 16
> -	stb	rTMP, 10(rDEST)
> +	stbu	rTMP, 1(rRTN)
>  	beqlr-
>  	extrdi.	rTMP, rALT, 8, 24
> -	stb	rTMP, 11(rDEST)
> +	stbu	rTMP, 1(rRTN)
>  	beqlr-
>  	extrdi.	rTMP, rALT, 8, 32
> -	stb	rTMP, 12(rDEST)
> -	beqlr-
> +	stbu	rTMP, 1(rRTN)
> +	beqlr
>  	extrdi.	rTMP, rALT, 8, 40
> -	stb	rTMP, 13(rDEST)
> +	stbu	rTMP, 1(rRTN)
>  	beqlr-
>  	extrdi.	rTMP, rALT, 8, 48
> -	stb	rTMP, 14(rDEST)
> +	stbu	rTMP, 1(rRTN)
>  	beqlr-
> -	stb	rALT, 15(rDEST)
> +	stbu	rALT, 1(rRTN)
> +#endif
>  	blr
> +
> +L(check_word_alignment):
> +	clrldi. rTMP, rTMP, 62
> +	bne     L(unaligned)
> +
> +/* For word aligned memory, operate using word load and stores.  */
> +	addi	rRTN, rRTN, -4
> +
> +	lis	rFEFE, -0x101
> +	lis	r7F7F, 0x7f7f
> +	lwz	rWORD, 0(rSRC)
> +	addi	rFEFE, rFEFE, -0x101
> +	addi	r7F7F, r7F7F, 0x7f7f
> +	b	L(g5)
> +
> +L(g3):	lwzu	rALT, 4(rSRC)
> +	stwu	rWORD, 4(rRTN)
> +	add	rTMP, rFEFE, rALT
> +	nor	rNEG, r7F7F, rALT
> +	and.	rTMP, rTMP, rNEG
> +	bne-	L(g4)
> +	lwzu	rWORD, 4(rSRC)
> +	stwu	rALT, 4(rRTN)
> +L(g5):	add	rTMP, rFEFE, rWORD
> +	nor	rNEG, r7F7F, rWORD
> +	and.	rTMP, rTMP, rNEG
> +	beq+	L(g3)
> +
> +	mr	rALT, rWORD
> +/* We've hit the end of the string.  Do the rest byte-by-byte.  */
> +L(g4):
> +#ifdef __LITTLE_ENDIAN__
> +	rlwinm.	rTMP, rALT, 0, 24, 31
> +	stbu	rALT, 4(rRTN)
> +	beqlr-
> +	rlwinm.	rTMP, rALT, 24, 24, 31
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	rlwinm.	rTMP, rALT, 16, 24, 31
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	rlwinm	rTMP, rALT, 8, 24, 31
> +	stbu	rTMP, 1(rRTN)
> +#else
> +	rlwinm.	rTMP, rALT, 8, 24, 31
> +	stbu	rTMP, 4(rRTN)
> +	beqlr-
> +	rlwinm.	rTMP, rALT, 16, 24, 31
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	rlwinm.	rTMP, rALT, 24, 24, 31
> +	stbu	rTMP, 1(rRTN)
> +	beqlr-
> +	stbu	rALT, 1(rRTN)
>  #endif
> +	blr
> 
>  /* Oh well.  In this case, we just do a byte-by-byte copy.  */
>  	.align 4
>  	nop
>  L(unaligned):
>  	lbz	rWORD, 0(rSRC)
> -	addi	rDEST, rRTN, -1
> +	addi	rRTN, rRTN, -1
>  	cmpwi	rWORD, 0
>  	beq-	L(u2)
> 
>  L(u0):	lbzu	rALT, 1(rSRC)
> -	stbu	rWORD, 1(rDEST)
> +	stbu	rWORD, 1(rRTN)
>  	cmpwi	rALT, 0
>  	beq-	L(u1)
>  	nop		/* Let 601 load start of loop.  */
>  	lbzu	rWORD, 1(rSRC)
> -	stbu	rALT, 1(rDEST)
> +	stbu	rALT, 1(rRTN)
>  	cmpwi	rWORD, 0
>  	bne+	L(u0)
> -L(u2):	stb	rWORD, 1(rDEST)
> +L(u2):	stbu	rWORD, 1(rRTN)
>  	blr
> -L(u1):	stb	rALT, 1(rDEST)
> +L(u1):	stbu	rALT, 1(rRTN)
>  	blr
> +END (FUNC_NAME)
> 
> -END (strcpy)
> +#ifndef USE_AS_STPCPY
>  libc_hidden_builtin_def (strcpy)
> +#endif


Similar comment as earlier: should the strcpy reference here become
FUNC_NAME, and possibly also remove the #ifndef wrapper?


Either way, looks OK. 
Thanks, 
-Will



Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]