This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] Power7 optimization for strncpy and stpncpy.
- From: vidya at linux dot vnet dot ibm dot com
- To: libc-alpha at sourceware dot org
- Cc: Vidya Ranganathan <vidya at linux dot vnet dot ibm dot com>
- Date: Fri, 28 Mar 2014 02:43:56 -0400
- Subject: [PATCH] Power7 optimization for strncpy and stpncpy.
- Authentication-results: sourceware.org; auth=none
From: Vidya Ranganathan <vidya@linux.vnet.ibm.com>
The optimization is achieved by the following techniques:
> data alignment [gain from aligned memory access on read/write]
> data prefetch [fewer cache misses by anticipating loads]
> loop unrolling/unwinding, which improves performance on POWER7
[gain by reduction of branch penalty].
ChangeLog:
2014-03-27 Vidya Ranganathan <vidya@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/power7/strncpy.S: New file: Optimization.
* sysdeps/powerpc/powerpc64/multiarch/strncpy.c: New file:
multiarch strncpy for PPC64.
* sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c: New file
* sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S: New file
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c:
(__libc_ifunc_impl_list): Likewise.
* sysdeps/powerpc/powerpc64/multiarch/Makefile: Add strncpy, stpncpy
multiarch optimizations.
* sysdeps/powerpc/powerpc64/power7/stpncpy.S: New file: Optimization.
* sysdeps/powerpc/powerpc64/multiarch/stpncpy.c: New file:
multiarch stpncpy for PPC64.
* sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c: New file
* sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S: New file
Signed-off-by: Vidya Ranganathan <vidya@linux.vnet.ibm.com>
---
sysdeps/powerpc/powerpc64/multiarch/Makefile | 3 +-
.../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 16 +
.../powerpc/powerpc64/multiarch/stpncpy-power7.S | 42 ++
.../powerpc/powerpc64/multiarch/stpncpy-ppc64.c | 26 ++
sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 33 ++
.../powerpc/powerpc64/multiarch/strncpy-power7.S | 40 ++
.../powerpc/powerpc64/multiarch/strncpy-ppc64.c | 33 ++
sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 35 ++
sysdeps/powerpc/powerpc64/power7/stpncpy.S | 24 +
sysdeps/powerpc/powerpc64/power7/strncpy.S | 483 +++++++++++++++++++++
10 files changed, 734 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy.c
create mode 100644 sysdeps/powerpc/powerpc64/power7/stpncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/power7/strncpy.S
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 8d367aa..35020a7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -16,7 +16,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \
- strpbrk-power7 strpbrk-ppc64
+ strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
+ stpncpy-power7 stpncpy-ppc64
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 91fabb0..d8578fb 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -278,5 +278,21 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strpbrk, 1,
__strpbrk_ppc))
+ /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
+ IFUNC_IMPL (i, name, strncpy,
+ IFUNC_IMPL_ADD (array, i, strncpy,
+ hwcap & PPC_FEATURE_HAS_VSX,
+ __strncpy_power7)
+ IFUNC_IMPL_ADD (array, i, strncpy, 1,
+ __strncpy_ppc))
+
+ /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
+ IFUNC_IMPL (i, name, stpncpy,
+ IFUNC_IMPL_ADD (array, i, stpncpy,
+ hwcap & PPC_FEATURE_HAS_VSX,
+ __stpncpy_power7)
+ IFUNC_IMPL_ADD (array, i, stpncpy, 1,
+ __stpncpy_ppc))
+
return i;
}
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S
new file mode 100644
index 0000000..92c4236
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S
@@ -0,0 +1,42 @@
+/* Optimized stpncpy implementation for POWER7.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define USE_AS_STPNCPY
+
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__stpncpy_power7) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__stpncpy_power7): \
+ cfi_startproc; \
+ LOCALENTRY(__stpncpy_power7)
+
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__stpncpy_power7) \
+ END_2(__stpncpy_power7)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power7/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c
new file mode 100644
index 0000000..74f47a7
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c
@@ -0,0 +1,26 @@
+/* Default stpncpy implementation for PowerPC64.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define STPNCPY __stpncpy_ppc
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+ __hidden_ver1 (__stpncpy_ppc, __GI___stpncpy, __stpncpy_ppc);
+#endif
+
+#include <string/stpncpy.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
new file mode 100644
index 0000000..dbf8521
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -0,0 +1,33 @@
+/* Multiple versions of stpncpy. PowerPC64 version.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef NOT_IN_libc
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
+extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+
+libc_ifunc (__stpncpy,
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __stpncpy_power7
+ : __stpncpy_ppc);
+
+weak_alias (__stpncpy, stpncpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S
new file mode 100644
index 0000000..052998c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S
@@ -0,0 +1,40 @@
+/* Optimized strncpy implementation for POWER7.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__strncpy_power7) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__strncpy_power7): \
+ cfi_startproc; \
+ LOCALENTRY(__strncpy_power7)
+
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strncpy_power7) \
+ END_2(__strncpy_power7)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c
new file mode 100644
index 0000000..e3111d2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c
@@ -0,0 +1,33 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+#define STRNCPY __strncpy_ppc
+#undef weak_alias
+#define weak_alias(name, aliasname) \
+ extern __typeof (__strncpy_ppc) aliasname \
+ __attribute__ ((weak, alias ("__strncpy_ppc")));
+#if !defined(NOT_IN_libc) && defined(SHARED)
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+ __hidden_ver1(__strncpy_ppc, __GI_strncpy, __strncpy_ppc);
+#endif
+
+extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
+
+#include <string/strncpy.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
new file mode 100644
index 0000000..0766fa8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -0,0 +1,35 @@
+/* Multiple versions of strncpy.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for definition in libc. */
+#ifndef NOT_IN_libc
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
+extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+ ifunc symbol properly. */
+libc_ifunc (strncpy,
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strncpy_power7
+ : __strncpy_ppc);
+
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/stpncpy.S b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
new file mode 100644
index 0000000..a539093
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER7.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S
new file mode 100644
index 0000000..729401a
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strncpy.S
@@ -0,0 +1,483 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the functions
+
+ char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+ AND
+
+ char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+ The algorithm is as follows:
+ > if src and dest are 8 byte aligned, perform double word copy
+ else
+ > if src and dest are 4 byte aligned, perform word copy
+ else
+ > copy byte by byte on unaligned addresses.
+
+ The aligned comparisons are made using the cmpb instruction. */
+
+/* The optimization focuses on the following for performance improvement:
+ 1. data alignment [gain from aligned memory access on read/write]
+ 2. prefetch data [gain from cache misses by anticipating load]
+ 3. POWER7 gains performance with loop unrolling/unwinding
+ [gain by reduction of branch penalty]. */
+
+#ifdef USE_AS_STPNCPY
+ #ifndef STPNCPY
+ # ifdef weak_alias
+ # define STPNCPY __stpncpy
+ weak_alias (__stpncpy, stpncpy)
+ # else
+ # define STPNCPY stpncpy
+ # endif
+ #endif
+ # define FUNC_NAME __stpncpy
+#else
+ #undef strncpy
+
+ #ifndef STRNCPY
+ #define STRNCPY strncpy
+ #endif
+ # define FUNC_NAME strncpy
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+32)
+
+ .machine power7
+EALIGN(FUNC_NAME, 4, 0)
+ CALL_MCOUNT 3
+
+ dcbt 0, r3 /* prefetch hint for the cache block at dst */
+ dcbt 0, r4 /* prefetch hint for the cache block at src */
+
+ mflr r0 /* copy the link register into r0 */
+ or r9, r3, r4 /* r9 = dst | src, so one test covers both pointers */
+ rldicl. r10, r9, 0, 61 /* r10 = low 3 bits of r9; zero if both are 8-byte aligned */
+
+ std r31, -8(r1) /* save caller's r31 */
+ std r30, -16(r1) /* save caller's r30 */
+ std r15, -24(r1) /* save caller's r15 */
+ std r0, 16(r1) /* store the link register */
+ stdu r1, -FRAMESIZE(r1) /* create the stack frame */
+
+ mr r15, r3 /* keep the original dst in r15; it is the strncpy return value */
+ mr r31, r3 /* r31 = running dst pointer */
+ beq cr0,L(dwordAligned) /* cr0 still holds the 8-byte alignment test */
+ rldicl. r10, r9, 0, 62 /* r10 = low 2 bits of r9; zero if both are 4-byte aligned */
+ bne cr0,L(byte_by_byte) /* not even word aligned: copy byte by byte */
+
+
+ srdi r3, r5, 2 /* r3 = n / 4, the number of whole words */
+ cmpldi cr7, r3, 3 /* 3 or fewer words: skip the unrolled loop */
+ ble cr7,L(update2)
+
+ lwz r9, 0(r4) /* load word from src */
+ cmpb r10, r9, r10 /* r10 is 0 here, so this flags zero bytes of r9. NOTE(review): lwz clears the upper 32 bits of r9, which cmpb would also flag -- confirm */
+ cmpdi cr7, r10, 0 /* nonzero means a zero byte was flagged */
+ bne cr7,L(update5)
+ stw r9, 0(r31) /* store word into dst */
+
+ lwz r9, 4(r4) /* load the next word from src (unrolled step) */
+ cmpb r10, r9, r10 /* r10 is 0 again here (previous cmpb result); flag zero bytes */
+ cmpdi cr7, r10, 0 /* nonzero means a zero byte was flagged */
+ bne cr7,L(HopBy4)
+ addi r10, r3, -4 /* r10 = count - 4 */
+ mr r8, r31 /* r8 = dst base used by the unrolled loop */
+ srdi r10, r10, 2 /* r10 = (count - 4) / 4 */
+ mr r7, r4 /* r7 = src base used by the unrolled loop */
+ addi r10, r10, 1 /* r10 = (count - 4) / 4 + 1 */
+ li r12, 0 /* r12 = 0, the cmpb comparand in L(wordCopy) */
+ mtctr r10 /* CTR = number of unrolled iterations */
+ b L(wordCopy)
+ .p2align 4
+L(wordUnroll):
+ stw r10, 8(r31) /* perform loop unrolling on word load/store */
+
+ lwz r10, 12(r4) /* load next to next word from src */
+ cmpb r9, r10, r9
+ cmpdi cr7, r9, 0
+ bne cr7,L(HopBy12)
+ stw r10, 12(r8)
+
+ addi r31, r31, 16
+ addi r4, r4, 16
+ bdz L(leftWords)
+
+ lwz r6, 16(r7) /* unroll for word copy */
+ cmpb r10, r6, r9
+ cmpdi cr7, r10, 0
+ bne cr7,L(update3)
+ stw r6, 16(r8)
+
+ lwz r9, 20(r7)
+ cmpb r10, r9, r10
+ cmpdi cr7, r10, 0
+ bne cr7,L(HopBy20)
+
+ mr r7, r4
+ mr r8, r31
+ mr r3, r11
+ mr r5, r0
+
+L(wordCopy):
+ stw r9, 4(r31)
+ addi r0, r5, -16
+ addi r11, r3, -4
+ lwz r10, 8(r4)
+ cmpb r9, r10, r12
+ cmpdi cr7, r9, 0
+ beq cr7,L(wordUnroll)
+ addi r31, r31, 8
+ addi r4, r4, 8
+ addi r5, r5, -8
+ addi r11, r3, -2
+
+L(wordUnrollOFF):
+ lwz r9, 0(r4)
+ li r10, 0
+ cmpb r10, r9, r10
+ cmpdi cr7, r10, 0
+ bne cr7,L(byte_by_byte)
+ mtctr r11
+ li r8, 0
+ b L(copyWord)
+
+ .p2align 4
+L(loadWordandCompare):
+ lwz r9, 0(r4)
+ cmpb r10, r9, r8
+ cmpdi cr7, r10, 0
+ bne cr7,L(byte_by_byte)
+
+L(copyWord):
+ addi r31, r31, 4
+ stw r9,-4(r31)
+ addi r4, r4, 4
+ addi r5, r5, -4
+ bdnz L(loadWordandCompare)
+ .p2align 4
+L(byte_by_byte):
+ cmpldi cr7, r5, 3
+ subf r30, r5, r31
+ ble cr7,L(verifyByte)
+ srdi r10, r5, 2
+ mr r9, r31
+ mtctr r10
+ b L(firstByteUnroll)
+
+ .p2align 4
+L(bytes_unroll):
+ lbz r10, 1(r4) /* load byte from src */
+ cmpdi cr7, r10, 0 /* compare for NULL */
+ stb r10, 1(r9) /* store byte to dst */
+ beq cr7,L(updtDestComputeN2ndByte)
+
+ addi r4, r4, 4
+
+ lbz r10, -2(r4) /* perform loop unrolling for byte r/w */
+ cmpdi cr7, r10, 0
+ stb r10, 2(r9)
+ beq cr7,L(updtDestComputeN3rdByte)
+
+ lbz r10, -1(r4) /* perform loop unrolling for byte r/w */
+ addi r9, r9, 4
+ cmpdi cr7, r10, 0
+ stb r10, -1(r9)
+ beq cr7, L(updtDestComputeNByte)
+
+ bdz L(updateToContinue)
+
+L(firstByteUnroll):
+ lbz r10, 0(r4) /* perform loop unrolling for byte r/w */
+ cmpdi cr7, r10, 0
+ stb r10, 0(r9)
+ bne cr7, L(bytes_unroll)
+ addi r9, r9, 1
+
+L(updtDestComputeNByte):
+ subf r10, r9, r31
+ mr r31, r9
+ add r10, r10, r5
+
+L(zeroFill):
+ cmpdi cr7, r10, 0 /* r10 = bytes left to fill; nothing to do if zero */
+ beq cr7,L(hop2Return)
+ mr r3, r31 /* memset arg 1: start of the region to fill (current dst) */
+ li r4, 0 /* memset arg 2: fill value, zero */
+ mr r5, r10 /* memset arg 3: number of bytes to fill */
+ bl memset /* fill with zeroes */
+ nop /* NOTE(review): presumably the ABI slot after bl for the linker's TOC restore -- confirm */
+
+L(hop2Return):
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
+
+/* the return value differs based on the call to strncpy or stpncpy,
+ so based on the USE_AS_STPNCPY macro defined the return value
+ is copied to r3. */
+
+#ifdef USE_AS_STPNCPY
+ addi r3, r31, -1 /* set the return value */
+#else
+ mr r3, r15 /* set the return value */
+#endif
+ ld r0, 16(r1) /* read the saved link register */
+ ld r15, -24(r1) /* restore callers save register, r15 */
+ ld r30, -16(r1) /* restore callers save register, r30 */
+ ld r31, -8(r1) /* restore callers save register, r31 */
+ mtlr r0 /* restore link register */
+ blr /* branch to link register */
+
+ .p2align 4
+L(updateToContinue):
+ mr r31, r9
+
+ .p2align 4
+L(verifyByte):
+ rldicl. r10, r5, 0, 62
+ addi r4, r4, -1
+ beq cr0,L(done)
+ mtctr r10
+ b L(oneBYone)
+
+ .p2align 4
+L(proceed):
+ bdz L(done)
+L(oneBYone):
+ lbzu r9, 1(r4) /* copy byte */
+ addi r31, r31, 1
+ addi r10, r10, -1
+ cmpdi cr7, r9, 0
+ stb r9,-1(r31)
+ bne cr7,L(proceed)
+ b L(zeroFill)
+
+ .p2align 4
+L(dwordAligned):
+ srdi r3, r5, 3 /* r3 = n / 8, the number of whole doublewords */
+ cmpldi cr7, r3, 3 /* 3 or fewer doublewords: skip the unrolled loop */
+ ble cr7,L(update0)
+
+ ld r9, 0(r4) /* load doubleword from src */
+ cmpb r10, r9, r10 /* r10 is 0 here (alignment test result); flag zero bytes of r9 */
+ cmpdi cr7, r10, 0 /* nonzero means a zero byte was flagged */
+ bne cr7,L(update4)
+
+ std r9, 0(r31) /* copy doubleword at offset=0 */
+ ld r9, 8(r4) /* load next doubleword from offset=8 */
+ cmpb r10, r9, r10 /* r10 is 0 again (previous cmpb result); flag zero bytes */
+ cmpdi cr7, r10, 0 /* nonzero means a zero byte was flagged */
+ bne cr7, L(HopBy8)
+
+ addi r10, r3, -4
+ mr r8, r31
+ srdi r10, r10, 2
+ mr r7, r4
+ addi r10, r10, 1
+ li r12, 0
+ mtctr r10
+ b L(dwordCopy)
+ .p2align 4
+
+L(dWordUnroll):
+ std r10, 16(r31) /* copy the dword loaded in L(dwordCopy) at offset=16 */
+ ld r10, 24(r4) /* load dword at offset=24 (unrolled step) */
+ cmpb r9, r10, r9 /* r9 is 0 on entry; flag zero bytes of r10 */
+ cmpdi cr7, r9, 0
+ bne cr7,L(HopBy24)
+
+ std r10, 24(r8) /* copy dword at offset=24 */
+ addi r31, r31, 32
+ addi r4, r4, 32
+ bdz L(leftDwords) /* decrement CTR; leave the unrolled loop when it reaches 0 */
+
+ ld r6, 32(r7) /* r7/r8 still hold the pre-increment src/dst bases */
+ cmpb r10, r6, r9
+ cmpdi cr7, r10, 0
+ bne cr7,L(update1)
+
+ std r6, 32(r8)
+ ld r9, 40(r7)
+ cmpb r10, r9, r10
+ cmpdi cr7, r10, 0
+ bne cr7,L(HopBy40)
+
+ mr r7, r4 /* advance the src base for the next iteration */
+ mr r8, r31 /* advance the dst base for the next iteration */
+ mr r3, r11 /* r3 = count - 4, computed in L(dwordCopy) */
+ mr r5, r0 /* r5 = n - 32, computed in L(dwordCopy) */
+
+L(dwordCopy): /* perform loop unrolling ; copy dword */
+ std r9, 8(r31) /* copy dword at offset=8 */
+ addi r0, r5, -32
+ addi r11, r3, -4
+ ld r10, 16(r4)
+ cmpb r9, r10, r12
+ cmpdi cr7, r9, 0
+ beq cr7,L(dWordUnroll)
+ addi r31, r31, 16
+ addi r4, r4, 16
+ addi r5, r5, -16
+ addi r11, r3, -2
+
+L(dWordUnrollOFF):
+ ld r9, 0(r4)
+ li r10, 0 /* load mask = 0 */
+ cmpb r10, r9, r10
+ cmpdi cr7, r10, 0
+ bne cr7,L(byte_by_byte)
+ mtctr r11
+ li r8, 0
+ b L(CopyDword)
+
+ .p2align 4
+L(loadDWordandCompare):
+ ld r9, 0(r4)
+ cmpb r10, r9, r8
+ cmpdi cr7, r10, 0
+ bne cr7,L(byte_by_byte)
+
+L(CopyDword):
+ addi r31, r31, 8
+ std r9, -8(r31)
+ addi r4, r4, 8
+ addi r5, r5, -8
+ bdnz L(loadDWordandCompare)
+ b L(byte_by_byte)
+
+ .p2align 4
+L(done):
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
+#ifdef USE_AS_STPNCPY
+ mr r3, r31 /* stpncpy: return the running dst pointer */
+#else
+ mr r3, r15 /* strncpy: return the original dst */
+#endif
+ ld r0, 16(r1) /* read the saved link register */
+ ld r15, -24(r1) /* restore caller's r15 */
+ ld r30,-16(r1) /* restore caller's r30 */
+ ld r31,-8(r1) /* restore caller's r31 */
+ mtlr r0 /* restore link register */
+ blr /* return to caller */
+
+L(update0):
+ mr r11, r3
+ mr r0, r5
+
+ .p2align 4
+L(leftDwords):
+ cmpdi cr7, r11, 0
+ mr r5, r0
+ bne cr7,L(dWordUnrollOFF)
+ b L(byte_by_byte)
+
+ .p2align 4
+L(updtDestComputeN2ndByte):
+ addi r9, r9, 2 /* update dst by 2 */
+ subf r10, r9, r31 /* compute distance covered */
+ mr r31, r9
+ add r10, r10, r5
+ b L(zeroFill)
+
+ .p2align 4
+L(updtDestComputeN3rdByte):
+ addi r9, r9, 3
+ subf r10, r9, r31
+ mr r31, r9
+ add r10, r10, r5
+ b L(zeroFill)
+
+ .p2align 4
+L(HopBy24):
+ addi r31, r31, 24 /* increment dst by 24 */
+ addi r4, r4, 24 /* increment src by 24 */
+ addi r5, r5, -24 /* decrement length 'n' by 24 */
+ addi r11, r3, -3 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+ .p2align 4
+L(update1):
+ mr r5, r0
+ b L(dWordUnrollOFF)
+
+ .p2align 4
+L(HopBy40):
+ addi r31, r8, 40 /* increment dst by 40 */
+ addi r4, r7, 40 /* increment src by 40 */
+ addi r5, r5, -40 /* decrement length 'n' by 40 */
+ addi r11, r3, -5 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+L(update2):
+ mr r11, r3
+ mr r0, r5
+
+L(leftWords):
+ cmpdi cr7, r11, 0 /* if words are left,process with unrollOFF */
+ mr r5, r0
+ bne cr7,L(wordUnrollOFF)
+ b L(byte_by_byte)
+
+L(HopBy12):
+ addi r31, r31, 12 /* increment dst by 12 */
+ addi r4, r4, 12 /* increment src by 12 */
+ addi r5, r5, -12 /* decrement length 'n' by 12 */
+ addi r11, r3, -3 /* three words already copied: remaining word count */
+ b L(wordUnrollOFF)
+
+L(update3):
+ mr r5, r0
+ b L(wordUnrollOFF)
+
+L(HopBy20):
+ addi r31, r8, 20 /* increment dst by 20 */
+ addi r4, r7, 20 /* increment src by 20 */
+ addi r5, r5, -20 /* decrement length 'n' by 20 */
+ addi r11, r3, -5 /* decrement loop counter */
+ b L(wordUnrollOFF)
+
+L(update4):
+ mr r11, r3
+ b L(dWordUnrollOFF)
+
+L(HopBy8):
+ addi r31, r31, 8 /* increment dst by 8 */
+ addi r4, r4, 8 /* increment src by 8 */
+ addi r5, r5, -8 /* decrement length 'n' by 8 */
+ addi r11, r3, -1 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+L(update5):
+ mr r11, r3
+ b L(wordUnrollOFF)
+
+L(HopBy4):
+ addi r31, r31, 4 /* increment dst by 4 */
+ addi r4, r4, 4 /* increment src by 4 */
+ addi r5, r5, -4 /* decrement length 'n' by 4 */
+ addi r11, r3, -1 /* one word already copied: remaining word count */
+ b L(wordUnrollOFF)
+
+END(FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#else
+libc_hidden_def (__stpncpy)
+#endif
--
1.8.3.1