This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH v3] PowerPC: stpcpy optimization for PPC64/POWER7
- From: Will Schmidt <will_schmidt at vnet dot ibm dot com>
- To: Adhemerval Zanella <azanella at linux dot vnet dot ibm dot com>
- Cc: "GNU C. Library" <libc-alpha at sourceware dot org>
- Date: Thu, 24 Oct 2013 10:46:00 -0500
- Subject: Re: [PATCH v3] PowerPC: stpcpy optimization for PPC64/POWER7
- Authentication-results: sourceware.org; auth=none
- References: <523715EE dot 9070408 at linux dot vnet dot ibm dot com> <524EDF56 dot 2060706 at linux dot vnet dot ibm dot com>
- Reply-to: will_schmidt at vnet dot ibm dot com
On Fri, 2013-10-04 at 12:31 -0300, Adhemerval Zanella wrote:
> This is a new version of this patch intended to unify both strcpy and stpcpy
> implementations for PPC64 and PPC64/POWER7. It is adjusted now that Alan has
> pushed his patches for LE. As before, the idea for the default implementation
> (sysdeps/powerpc/powerpc64/strcpy.S) is to provide both doubleword and word
> aligned memory access.
>
> For PPC64/POWER7 the idea is to also provide doubleword and word memory access,
> remove the branch hints, use the cmpb instruction for compare doubleword/words,
> and add an optimization for inputs of same alignment.
>
> The performance results based on benchtests are provided in attachments. Tested
> on PPC64 and PPC64/POWER7.
A few comments scattered about below on cosmetic issues. Nothing of
real concern, this looks good to me.
Thanks,
-Will
>
> ---
>
> 2013-10-04 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
>
> * sysdeps/powerpc/powerpc64/strcpy.S (strcpy): Add word load/store to
> provide a boost for large inputs with word alignment. Also fix little
> endian issues.
> * sysdeps/powerpc/powerpc64/stpcpy.S (__stpcpy): Rewrite
> implementation based on optimized PPC64 strcpy.
> * sysdeps/powerpc/powerpc64/power7/strcpy.S: New file: optimized
> strcpy for PPC64/POWER7 based on both doubleword and word load/store.
> * sysdeps/powerpc/powerpc64/power7/stpcpy.S: New file: optimized
> stpcpy for PPC64/POWER7 based on PPC64/POWER7 strcpy.
>
> --
>
> diff --git a/sysdeps/powerpc/powerpc64/power7/stpcpy.S b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
> new file mode 100644
> index 0000000..727dd06
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpcpy implementation for PowerPC64/POWER7.
> + Copyright (C) 2013 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#define USE_AS_STPCPY
> +#include <sysdeps/powerpc/powerpc64/power7/strcpy.S>
> +
> +weak_alias (__stpcpy, stpcpy)
> +libc_hidden_def (__stpcpy)
> +libc_hidden_builtin_def (stpcpy)
Looks OK.
> diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S
> new file mode 100644
> index 0000000..5c341a1
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S
> @@ -0,0 +1,274 @@
> +/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER7.
> + Copyright (C) 2013 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +/* Implements the function
> +
> + char * [r3] strcpy (char *dest [r3], const char *src [r4])
> +
> + or
> +
> + char * [r3] stpcpy (char *dest [r3], const char *src [r4])
> +
> + if USE_AS_STPCPY is defined. It tries to use aligned memory accesses
> + when possible using the following algorithm:
> +
> + if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0))
> + goto aligned_doubleword_copy;
> + if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0))
> + goto aligned_word_copy;
> + if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL))
> + goto same_alignment;
> + goto unaligned;
> +
> + The aligned comparisons are made using cmpb instructions. */
> +
> +#ifdef USE_AS_STPCPY
> +# define FUNC_NAME __stpcpy
> +#else
> +# define FUNC_NAME strcpy
> +#endif
> +
> + .machine power7
> +EALIGN (FUNC_NAME, 4, 0)
> + CALL_MCOUNT 2
> +
> +#define rTMP r0
> +#ifdef USE_AS_STPCPY
> +#define rRTN r3 /* pointer to previous word/doubleword in dest */
> +#else
> +#define rRTN r12 /* pointer to previous word/doubleword in dest */
> +#endif
> +#define rSRC r4 /* pointer to previous word/doubleword in src */
> +#define rMASK r5 /* mask 0xffffffff | 0xffffffffffffffff */
> +#define rWORD r6 /* current word from src */
> +#define rALT r7 /* alternate word from src */
> +#define rRTNAL r8 /* alignment of return pointer */
> +#define rSRCAL r9 /* alignment of source pointer */
> +#define rALCNT r10 /* bytes to read to reach 8 bytes alignment */
> +#define rSUBAL r11 /* doubleword minus unaligned displacement */
> +
> +#ifndef USE_AS_STPCPY
> +/* Save the dst pointer to use as return value. */
> + mr rRTN, r3
> +#endif
Just to get rid of a #ifndef, I'd be tempted to change the above to just
mr rRTN, r3
and live with the "mr r3, r3" for the USE_AS_STPCPY case.
I don't feel strongly about it, but wanted to have more feedback than
'looks good' :-)
> + or rTMP, rSRC, rRTN
> + clrldi. rTMP, rTMP, 61
> + bne L(check_word_alignment)
> + b L(aligned_doubleword_copy)
> +
> +L(same_alignment):
> +/* Src and dst with same alignment: align both to doubleword. */
> + mr rALCNT, rRTN
> + lbz rWORD, 0(rSRC)
> + subfic rSUBAL, rRTNAL, 8
> + addi rRTN, rRTN, 1
> + addi rSRC, rSRC, 1
> + cmpdi cr7, rWORD, 0
> + stb rWORD, 0(rALCNT)
> + beq cr7, L(s2)
> +
> + add rALCNT, rALCNT, rSUBAL
> + subf rALCNT, rRTN, rALCNT
> + addi rALCNT, rALCNT, 1
> + mtctr rALCNT
> + b L(s1)
> +
> + .align 4
> +L(s0):
> + addi rSRC, rSRC, 1
> + lbz rWORD, -1(rSRC)
> + cmpdi cr7, rWORD, 0
> + stb rWORD, -1(rALCNT)
> + beqlr cr7
> + mr rRTN, rALCNT
> +L(s1):
> + addi rALCNT, rRTN,1
> + bdnz L(s0)
> + b L(aligned_doubleword_copy)
> + .align 4
> +L(s2):
> + mr rRTN, rALCNT
> + blr
> +
> +/* For doubleword aligned memory, operate using doubleword load and stores. */
> + .align 4
> +L(aligned_doubleword_copy):
> + li rMASK, 0
> + addi rRTN, rRTN, -8
> + ld rWORD, 0(rSRC)
> + b L(g2)
> +
> + .align 4
> +L(g0): ldu rALT, 8(rSRC)
> + stdu rWORD, 8(rRTN)
> + cmpb rTMP, rALT, rMASK
> + cmpdi rTMP, 0
> + bne L(g1)
> + ldu rWORD, 8(rSRC)
> + stdu rALT, 8(rRTN)
> +L(g2): cmpb rTMP, rWORD, rMASK
> + cmpdi rTMP, 0 /* If rTMP is 0, no null's have been found. */
> + beq L(g0)
> +
> + mr rALT, rWORD
> +/* We've hit the end of the string. Do the rest byte-by-byte. */
> +L(g1):
> +#ifdef __LITTLE_ENDIAN__
> + extrdi. rTMP, rALT, 8, 56
> + stbu rALT, 8(rRTN)
> + beqlr-
> + extrdi. rTMP, rALT, 8, 48
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + extrdi. rTMP, rALT, 8, 40
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + extrdi. rTMP, rALT, 8, 32
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + extrdi. rTMP, rALT, 8, 24
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + extrdi. rTMP, rALT, 8, 16
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + extrdi. rTMP, rALT, 8, 8
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + extrdi rTMP, rALT, 8, 0
> + stbu rTMP, 1(rRTN)
> +#else
> + extrdi. rTMP, rALT, 8, 0
> + stbu rTMP, 8(rRTN)
> + beqlr
> + extrdi. rTMP, rALT, 8, 8
> + stbu rTMP, 1(rRTN)
> + beqlr
> + extrdi. rTMP, rALT, 8, 16
> + stbu rTMP, 1(rRTN)
> + beqlr
> + extrdi. rTMP, rALT, 8, 24
> + stbu rTMP, 1(rRTN)
> + beqlr
> + extrdi. rTMP, rALT, 8, 32
> + stbu rTMP, 1(rRTN)
> + beqlr
> + extrdi. rTMP, rALT, 8, 40
> + stbu rTMP, 1(rRTN)
> + beqlr
> + extrdi. rTMP, rALT, 8, 48
> + stbu rTMP, 1(rRTN)
> + beqlr
> + stbu rALT, 1(rRTN)
> +#endif
> + blr
> +
> +L(check_word_alignment):
> + clrldi. rTMP, rTMP, 62
> + beq L(aligned_word_copy)
> + rldicl rRTNAL, rRTN, 0, 61
> + rldicl rSRCAL, rSRC, 0, 61
> + cmpld cr7, rSRCAL, rRTNAL
> + beq cr7, L(same_alignment)
> + b L(unaligned)
> +
> +/* For word aligned memory, operate using word load and stores. */
> + .align 4
> +L(aligned_word_copy):
> + li rMASK, 0
> + addi rRTN, rRTN, -4
> + lwz rWORD, 0(rSRC)
> + b L(g5)
> +
> + .align 4
> +L(g3): lwzu rALT, 4(rSRC)
> + stwu rWORD, 4(rRTN)
> + cmpb rTMP, rALT, rMASK
> + cmpwi rTMP, 0
> + bne L(g4)
> + lwzu rWORD, 4(rSRC)
> + stwu rALT, 4(rRTN)
> +L(g5): cmpb rTMP, rWORD, rMASK
> + cmpwi rTMP, 0 /* If rTMP is 0, no null in word. */
> + beq L(g3)
> +
> + mr rALT, rWORD
> +/* We've hit the end of the string. Do the rest byte-by-byte. */
> +L(g4):
> +#ifdef __LITTLE_ENDIAN__
> + rlwinm. rTMP, rALT, 0, 24, 31
> + stbu rALT, 4(rRTN)
> + beqlr-
> + rlwinm. rTMP, rALT, 24, 24, 31
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + rlwinm. rTMP, rALT, 16, 24, 31
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + rlwinm rTMP, rALT, 8, 24, 31
> + stbu rTMP, 1(rRTN)
> +#else
> + rlwinm. rTMP, rALT, 8, 24, 31
> + stbu rTMP, 4(rRTN)
> + beqlr
> + rlwinm. rTMP, rALT, 16, 24, 31
> + stbu rTMP, 1(rRTN)
> + beqlr
> + rlwinm. rTMP, rALT, 24, 24, 31
> + stbu rTMP, 1(rRTN)
> + beqlr
> + stbu rALT, 1(rRTN)
> +#endif
> + blr
> +
> +/* Oh well. In this case, we just do a byte-by-byte copy. */
> + .align 4
> +L(unaligned):
> + lbz rWORD, 0(rSRC)
> + addi rRTN, rRTN, -1
> + cmpdi rWORD, 0
> + beq L(u2)
> +
> + .align 5
> +L(u0): lbzu rALT, 1(rSRC)
> + stbu rWORD, 1(rRTN)
> + cmpdi rALT, 0
> + beq L(u1)
> + lbzu rWORD, 1(rSRC)
> + stbu rALT, 1(rRTN)
> + cmpdi rWORD, 0
> + beq L(u2)
> + lbzu rALT, 1(rSRC)
> + stbu rWORD, 1(rRTN)
> + cmpdi rALT, 0
> + beq L(u1)
> + lbzu rWORD, 1(rSRC)
> + stbu rALT, 1(rRTN)
> + cmpdi rWORD, 0
> + bne L(u0)
> +L(u2): stbu rWORD, 1(rRTN)
> + blr
> +L(u1): stbu rALT, 1(rRTN)
> + blr
> +END (FUNC_NAME)
> +
> +#ifndef USE_AS_STPCPY
> +libc_hidden_builtin_def (strcpy)
> +#endif
I'm not sure if there is style precedence, can or should the #ifndef
above be dropped, so that reads
libc_hidden_builtin_def (FUNC_NAME)
And then remove the statement from the stpcpy.S that includes this file?
I suppose that would be a problem if we needed to mix
libc_hidden_builtin_def and libc_builtin_def incantations...
Looks Ok.
> diff --git a/sysdeps/powerpc/powerpc64/stpcpy.S b/sysdeps/powerpc/powerpc64/stpcpy.S
> index c0b3972..09aa3be 100644
> --- a/sysdeps/powerpc/powerpc64/stpcpy.S
> +++ b/sysdeps/powerpc/powerpc64/stpcpy.S
> @@ -16,103 +16,8 @@
> License along with the GNU C Library; if not, see
> <http://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -
> -/* See strlen.s for comments on how the end-of-string testing works. */
> -
> -/* char * [r3] stpcpy (char *dest [r3], const char *src [r4]) */
> -
> -EALIGN (__stpcpy, 4, 0)
> - CALL_MCOUNT 2
> -
> -#define rTMP r0
> -#define rRTN r3
> -#define rDEST r3 /* pointer to previous word in dest */
> -#define rSRC r4 /* pointer to previous word in src */
> -#define rWORD r6 /* current word from src */
> -#define rFEFE r7 /* 0xfefefeff */
> -#define r7F7F r8 /* 0x7f7f7f7f */
> -#define rNEG r9 /* ~(word in src | 0x7f7f7f7f) */
> -#define rALT r10 /* alternate word from src */
> -
> - or rTMP, rSRC, rDEST
> - clrldi. rTMP, rTMP, 62
> - addi rDEST, rDEST, -4
> - bne L(unaligned)
> -
> - lis rFEFE, -0x101
> - lis r7F7F, 0x7f7f
> - lwz rWORD, 0(rSRC)
> - addi rFEFE, rFEFE, -0x101
> - addi r7F7F, r7F7F, 0x7f7f
> - b L(g2)
> -
> -L(g0): lwzu rALT, 4(rSRC)
> - stwu rWORD, 4(rDEST)
> - add rTMP, rFEFE, rALT
> - nor rNEG, r7F7F, rALT
> - and. rTMP, rTMP, rNEG
> - bne- L(g1)
> - lwzu rWORD, 4(rSRC)
> - stwu rALT, 4(rDEST)
> -L(g2): add rTMP, rFEFE, rWORD
> - nor rNEG, r7F7F, rWORD
> - and. rTMP, rTMP, rNEG
> - beq+ L(g0)
> -
> - mr rALT, rWORD
> -/* We've hit the end of the string. Do the rest byte-by-byte. */
> -L(g1):
> -#ifdef __LITTLE_ENDIAN__
> - rlwinm. rTMP, rALT, 0, 24, 31
> - stbu rALT, 4(rDEST)
> - beqlr-
> - rlwinm. rTMP, rALT, 24, 24, 31
> - stbu rTMP, 1(rDEST)
> - beqlr-
> - rlwinm. rTMP, rALT, 16, 24, 31
> - stbu rTMP, 1(rDEST)
> - beqlr-
> - rlwinm rTMP, rALT, 8, 24, 31
> - stbu rTMP, 1(rDEST)
> - blr
> -#else
> - rlwinm. rTMP, rALT, 8, 24, 31
> - stbu rTMP, 4(rDEST)
> - beqlr-
> - rlwinm. rTMP, rALT, 16, 24, 31
> - stbu rTMP, 1(rDEST)
> - beqlr-
> - rlwinm. rTMP, rALT, 24, 24, 31
> - stbu rTMP, 1(rDEST)
> - beqlr-
> - stbu rALT, 1(rDEST)
> - blr
> -#endif
> -
> -/* Oh well. In this case, we just do a byte-by-byte copy. */
> - .align 4
> - nop
> -L(unaligned):
> - lbz rWORD, 0(rSRC)
> - addi rDEST, rDEST, 3
> - cmpwi rWORD, 0
> - beq- L(u2)
> -
> -L(u0): lbzu rALT, 1(rSRC)
> - stbu rWORD, 1(rDEST)
> - cmpwi rALT, 0
> - beq- L(u1)
> - nop /* Let 601 load start of loop. */
> - lbzu rWORD, 1(rSRC)
> - stbu rALT, 1(rDEST)
> - cmpwi rWORD, 0
> - bne+ L(u0)
> -L(u2): stbu rWORD, 1(rDEST)
> - blr
> -L(u1): stbu rALT, 1(rDEST)
> - blr
> -END (__stpcpy)
> +#define USE_AS_STPCPY
> +#include <sysdeps/powerpc/powerpc64/strcpy.S>
>
> weak_alias (__stpcpy, stpcpy)
> libc_hidden_def (__stpcpy)
Ok.
> diff --git a/sysdeps/powerpc/powerpc64/strcpy.S b/sysdeps/powerpc/powerpc64/strcpy.S
> index a7fd85b..793325d 100644
> --- a/sysdeps/powerpc/powerpc64/strcpy.S
> +++ b/sysdeps/powerpc/powerpc64/strcpy.S
> @@ -22,25 +22,38 @@
>
> /* char * [r3] strcpy (char *dest [r3], const char *src [r4]) */
>
> -EALIGN (strcpy, 4, 0)
> +#ifdef USE_AS_STPCPY
> +# define FUNC_NAME __stpcpy
> +#else
> +# define FUNC_NAME strcpy
> +#endif
> +
> +EALIGN (FUNC_NAME, 4, 0)
> CALL_MCOUNT 2
>
> #define rTMP r0
> -#define rRTN r3 /* incoming DEST arg preserved as result */
> -#define rSRC r4 /* pointer to previous word in src */
> -#define rDEST r5 /* pointer to previous word in dest */
> +#ifdef USE_AS_STPCPY
> +#define rRTN r3 /* pointer to previous word/doubleword in dest */
> +#else
> +#define rRTN r12 /* pointer to previous word/doubleword in dest */
> +#endif
> +#define rSRC r4 /* pointer to previous word/doubleword in src */
> #define rWORD r6 /* current word from src */
> -#define rFEFE r7 /* constant 0xfefefefefefefeff (-0x0101010101010101) */
> -#define r7F7F r8 /* constant 0x7f7f7f7f7f7f7f7f */
> -#define rNEG r9 /* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
> +#define rFEFE r7 /* constant 0xfefefeff | 0xfefefefefefefeff */
> +#define r7F7F r8 /* constant 0x7f7f7f7f | 0x7f7f7f7f7f7f7f7f */
> +#define rNEG r9 /* ~(word in s1 | r7F7F) */
> #define rALT r10 /* alternate word from src */
>
> - dcbt 0,rSRC
> +#ifndef USE_AS_STPCPY
> +/* Save the dst pointer to use as return value. */
> + mr rRTN, r3
> +#endif
> or rTMP, rSRC, rRTN
> clrldi. rTMP, rTMP, 61
> - addi rDEST, rRTN, -8
> - dcbtst 0,rRTN
> - bne L(unaligned)
> + bne L(check_word_alignment)
> +
> +/* For doubleword aligned memory, operate using doubleword load and stores. */
> + addi rRTN, rRTN, -8
>
> lis rFEFE, -0x101
> lis r7F7F, 0x7f7f
> @@ -53,13 +66,13 @@ EALIGN (strcpy, 4, 0)
> b L(g2)
>
> L(g0): ldu rALT, 8(rSRC)
> - stdu rWORD, 8(rDEST)
> + stdu rWORD, 8(rRTN)
> add rTMP, rFEFE, rALT
> nor rNEG, r7F7F, rALT
> and. rTMP, rTMP, rNEG
> bne- L(g1)
> ldu rWORD, 8(rSRC)
> - stdu rALT, 8(rDEST)
> + stdu rALT, 8(rRTN)
> L(g2): add rTMP, rFEFE, rWORD
> nor rNEG, r7F7F, rWORD
> and. rTMP, rTMP, rNEG
> @@ -70,77 +83,134 @@ L(g2): add rTMP, rFEFE, rWORD
> L(g1):
> #ifdef __LITTLE_ENDIAN__
> extrdi. rTMP, rALT, 8, 56
> - stb rALT, 8(rDEST)
> + stbu rALT, 8(rRTN)
> beqlr-
> extrdi. rTMP, rALT, 8, 48
> - stb rTMP, 9(rDEST)
> + stbu rTMP, 1(rRTN)
> beqlr-
> extrdi. rTMP, rALT, 8, 40
> - stb rTMP, 10(rDEST)
> + stbu rTMP, 1(rRTN)
> beqlr-
> extrdi. rTMP, rALT, 8, 32
> - stb rTMP, 11(rDEST)
> + stbu rTMP, 1(rRTN)
> beqlr-
> extrdi. rTMP, rALT, 8, 24
> - stb rTMP, 12(rDEST)
> + stbu rTMP, 1(rRTN)
> beqlr-
> extrdi. rTMP, rALT, 8, 16
> - stb rTMP, 13(rDEST)
> + stbu rTMP, 1(rRTN)
> beqlr-
> extrdi. rTMP, rALT, 8, 8
> - stb rTMP, 14(rDEST)
> + stbu rTMP, 1(rRTN)
> beqlr-
> extrdi rTMP, rALT, 8, 0
> - stb rTMP, 15(rDEST)
> - blr
> + stbu rTMP, 1(rRTN)
> #else
> extrdi. rTMP, rALT, 8, 0
> - stb rTMP, 8(rDEST)
> + stbu rTMP, 8(rRTN)
> beqlr-
> extrdi. rTMP, rALT, 8, 8
> - stb rTMP, 9(rDEST)
> + stbu rTMP, 1(rRTN)
> beqlr-
> extrdi. rTMP, rALT, 8, 16
> - stb rTMP, 10(rDEST)
> + stbu rTMP, 1(rRTN)
> beqlr-
> extrdi. rTMP, rALT, 8, 24
> - stb rTMP, 11(rDEST)
> + stbu rTMP, 1(rRTN)
> beqlr-
> extrdi. rTMP, rALT, 8, 32
> - stb rTMP, 12(rDEST)
> - beqlr-
> + stbu rTMP, 1(rRTN)
> + beqlr
> extrdi. rTMP, rALT, 8, 40
> - stb rTMP, 13(rDEST)
> + stbu rTMP, 1(rRTN)
> beqlr-
> extrdi. rTMP, rALT, 8, 48
> - stb rTMP, 14(rDEST)
> + stbu rTMP, 1(rRTN)
> beqlr-
> - stb rALT, 15(rDEST)
> + stbu rALT, 1(rRTN)
> +#endif
> blr
> +
> +L(check_word_alignment):
> + clrldi. rTMP, rTMP, 62
> + bne L(unaligned)
> +
> +/* For word aligned memory, operate using word load and stores. */
> + addi rRTN, rRTN, -4
> +
> + lis rFEFE, -0x101
> + lis r7F7F, 0x7f7f
> + lwz rWORD, 0(rSRC)
> + addi rFEFE, rFEFE, -0x101
> + addi r7F7F, r7F7F, 0x7f7f
> + b L(g5)
> +
> +L(g3): lwzu rALT, 4(rSRC)
> + stwu rWORD, 4(rRTN)
> + add rTMP, rFEFE, rALT
> + nor rNEG, r7F7F, rALT
> + and. rTMP, rTMP, rNEG
> + bne- L(g4)
> + lwzu rWORD, 4(rSRC)
> + stwu rALT, 4(rRTN)
> +L(g5): add rTMP, rFEFE, rWORD
> + nor rNEG, r7F7F, rWORD
> + and. rTMP, rTMP, rNEG
> + beq+ L(g3)
> +
> + mr rALT, rWORD
> +/* We've hit the end of the string. Do the rest byte-by-byte. */
> +L(g4):
> +#ifdef __LITTLE_ENDIAN__
> + rlwinm. rTMP, rALT, 0, 24, 31
> + stbu rALT, 4(rRTN)
> + beqlr-
> + rlwinm. rTMP, rALT, 24, 24, 31
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + rlwinm. rTMP, rALT, 16, 24, 31
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + rlwinm rTMP, rALT, 8, 24, 31
> + stbu rTMP, 1(rRTN)
> +#else
> + rlwinm. rTMP, rALT, 8, 24, 31
> + stbu rTMP, 4(rRTN)
> + beqlr-
> + rlwinm. rTMP, rALT, 16, 24, 31
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + rlwinm. rTMP, rALT, 24, 24, 31
> + stbu rTMP, 1(rRTN)
> + beqlr-
> + stbu rALT, 1(rRTN)
> #endif
> + blr
>
> /* Oh well. In this case, we just do a byte-by-byte copy. */
> .align 4
> nop
> L(unaligned):
> lbz rWORD, 0(rSRC)
> - addi rDEST, rRTN, -1
> + addi rRTN, rRTN, -1
> cmpwi rWORD, 0
> beq- L(u2)
>
> L(u0): lbzu rALT, 1(rSRC)
> - stbu rWORD, 1(rDEST)
> + stbu rWORD, 1(rRTN)
> cmpwi rALT, 0
> beq- L(u1)
> nop /* Let 601 load start of loop. */
> lbzu rWORD, 1(rSRC)
> - stbu rALT, 1(rDEST)
> + stbu rALT, 1(rRTN)
> cmpwi rWORD, 0
> bne+ L(u0)
> -L(u2): stb rWORD, 1(rDEST)
> +L(u2): stbu rWORD, 1(rRTN)
> blr
> -L(u1): stb rALT, 1(rDEST)
> +L(u1): stbu rALT, 1(rRTN)
> blr
> +END (FUNC_NAME)
>
> -END (strcpy)
> +#ifndef USE_AS_STPCPY
> libc_hidden_builtin_def (strcpy)
> +#endif
Similar comment as earlier, should the strcpy reference here become
FUNC_NAME, and possibly also remove the #ifndef wrapper.
Either way, looks OK.
Thanks,
-Will