powerpc: Optimized st{r,p}ncpy for POWER8/PPC64
author	Adhemerval Zanella <azanella@linux.vnet.ibm.com>
	Wed, 31 Dec 2014 16:47:41 +0000 (11:47 -0500)
committer	Adhemerval Zanella <azanella@linux.vnet.ibm.com>
	Tue, 13 Jan 2015 16:28:44 +0000 (11:28 -0500)
This patch adds an optimized POWER8 st{r,p}ncpy using unaligned accesses.
It shows a 10%-80% improvement over the optimized POWER7 version, which
uses only aligned accesses, especially on unaligned inputs.

The algorithm first reads and checks 16 bytes (if the input does not cross
a 4K page boundary).  It then realigns the source to 16 bytes and issues a
16-byte read-and-compare loop to speed up null byte checks for large
strings.  Also, unlike the POWER7 optimization, the null padding is done
inline in the implementation using possibly unaligned accesses, instead of
relying on a memset call.  A special case is added for reads that cross a
page boundary.
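
In C terms, the strategy amounts to roughly the following sketch (illustrative
only, not the committed code: has_nul_byte() is a hypothetical stand-in for the
cmpb instruction, and the real implementation guards its wide loads with the
page-cross checks so it never reads across an unreadable page boundary):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-in for the cmpb-based null-byte test.  */
    static int
    has_nul_byte (uint64_t dw)
    {
      for (int i = 0; i < 8; i++)
        if (((dw >> (i * 8)) & 0xff) == 0)
          return 1;
      return 0;
    }

    char *
    strncpy_sketch (char *dest, const char *src, size_t n)
    {
      size_t i = 0;

      /* Bulk phase: copy a doubleword at a time while no null byte is
         seen.  The assembly works on 16 bytes per iteration.  */
      while (n - i >= 8)
        {
          uint64_t dw;
          memcpy (&dw, src + i, 8);     /* unaligned load */
          if (has_nul_byte (dw))
            break;
          memcpy (dest + i, &dw, 8);    /* unaligned store */
          i += 8;
        }

      /* Tail: byte-by-byte until the null byte or n.  */
      for (; i < n && src[i] != '\0'; i++)
        dest[i] = src[i];

      /* Null padding done inline instead of calling memset.  */
      for (size_t j = i; j < n; j++)
        dest[j] = '\0';

      return dest;      /* stpncpy would return dest + i instead.  */
    }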

ChangeLog
NEWS
sysdeps/powerpc/powerpc64/multiarch/Makefile
sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/strncpy.c
sysdeps/powerpc/powerpc64/power8/stpncpy.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/power8/strncpy.S [new file with mode: 0644]

index 16199e31308cefc0f81c3030c2fbc1efb06099fe..20aded41ca1872eed295aef0733d1e98e5464e92 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
+       * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+       Add strncpy-power8 and stpncpy-power8 objects.
+       * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+       (__libc_ifunc_impl_list): Add __strncpy_power8 and __stpncpy_power8
+       implementations.
+       * sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S: New file.
+       * sysdeps/powerpc/powerpc64/multiarch/stpncpy.c (__stpncpy): Add
+       __stpncpy_power8 implementation.
+       * sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S: New file.
+       * sysdeps/powerpc/powerpc64/multiarch/strncpy.c (strncpy): Add
+       __strncpy_power8 implementation.
+       * sysdeps/powerpc/powerpc64/power8/stpncpy.S: New file.
+       * sysdeps/powerpc/powerpc64/power8/strncpy.S: New file.
+       * NEWS: Update.
+
        * sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
        * sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
        * sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
diff --git a/NEWS b/NEWS
index e02091802e9b5adf0d8a439fa3fb5abf5f6f63c1..08b3daacd50954fa4d6eecfc3dda9a9205623a49 100644
--- a/NEWS
+++ b/NEWS
@@ -19,7 +19,8 @@ Version 2.21
   17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782,
   17791, 17793, 17796, 17797, 17803, 17806, 17834
 
-* Optimized strcpy and stpcpy implementations for powerpc64/powerpc64le.
+* Optimized strcpy, stpcpy, strncpy, and stpncpy implementations for
+  powerpc64/powerpc64le.
 
 * Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64
   and powerpc64le.  This may improve lock scaling of existing programs on
index 74b2daac1dd65ae92ff79ebac49928b2b8b1c9ba..18d337843cd7eb53973e41723301dfd3c23b3511 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -17,9 +17,10 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
                   stpcpy-power7 stpcpy-ppc64 \
                   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
                   strncpy-power7 strncpy-ppc64 \
-                  stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
+                  stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
+                  strcmp-power7 strcmp-ppc64 \
                   strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
-                  memmove-ppc64 bcopy-ppc64
+                  memmove-ppc64 bcopy-ppc64 strncpy-power8
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
index dbb21fddb9f236ec16cf21385c98f19c6f5af415..132cb13eac6efd5e0c3a28085098f8de4667ad4e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -278,6 +278,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+             IFUNC_IMPL_ADD (array, i, strncpy,
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __strncpy_power8)
              IFUNC_IMPL_ADD (array, i, strncpy,
                              hwcap & PPC_FEATURE_HAS_VSX,
                              __strncpy_power7)
@@ -286,6 +289,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
+             IFUNC_IMPL_ADD (array, i, stpncpy,
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __stpncpy_power8)
              IFUNC_IMPL_ADD (array, i, stpncpy,
                              hwcap & PPC_FEATURE_HAS_VSX,
                              __stpncpy_power7)
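
The new entries are listed (and, in the ifunc resolvers below, tested) before
the POWER7 ones because a POWER8 machine also sets PPC_FEATURE_HAS_VSX; the
most specific test has to come first.  The same AT_HWCAP/AT_HWCAP2 bits can be
inspected from user code.  A minimal sketch, assuming a powerpc glibc that
exposes these macros through <sys/auxv.h>:

    #include <stdio.h>
    #include <sys/auxv.h>       /* getauxval; pulls in <bits/hwcap.h> */

    int
    main (void)
    {
    #if defined PPC_FEATURE2_ARCH_2_07 && defined PPC_FEATURE_HAS_VSX
      unsigned long hwcap = getauxval (AT_HWCAP);
      unsigned long hwcap2 = getauxval (AT_HWCAP2);

      if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
        puts ("ISA 2.07 (POWER8): the power8 variants are eligible");
      else if (hwcap & PPC_FEATURE_HAS_VSX)
        puts ("VSX (POWER7): the power7 variants are eligible");
      else
        puts ("baseline ppc64 variants");
    #else
      puts ("not a powerpc target");
    #endif
      return 0;
    }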
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
new file mode 100644
index 0000000..d5d835d
--- /dev/null
@@ -0,0 +1,39 @@
+/* Optimized stpncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define USE_AS_STPNCPY
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)                            \
+  .section ".text";                                            \
+  ENTRY_2(__stpncpy_power8)                                    \
+  .align ALIGNARG(alignt);                                     \
+  EALIGN_W_##words;                                            \
+  BODY_LABEL(__stpncpy_power8):                                        \
+  cfi_startproc;                                               \
+  LOCALENTRY(__stpncpy_power8)
+
+#undef END
+#define END(name)                                              \
+  cfi_endproc;                                                 \
+  TRACEBACK(__stpncpy_power8)                                  \
+  END_2(__stpncpy_power8)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpncpy.S>
index 9e5a270303263c75558d9f020fbd1f8e5a5d9e5b..0f4072f9824589cf0ae91f47abe2aa792b44ea97 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
 
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
 
 libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __stpncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __stpncpy_power7
             : __stpncpy_ppc);
 
 weak_alias (__stpncpy, stpncpy)
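
The libc_ifunc macro expands this cascaded conditional into an ifunc resolver
that runs once, when the symbol is bound.  Schematically, the selection reduces
to the following self-contained model (the function names are stand-ins; the
real resolver receives hwcap/hwcap2 from the dynamic loader rather than taking
parameters, and the hwcap values mirror glibc's <bits/hwcap.h>):

    #include <stddef.h>

    #define PPC_FEATURE_HAS_VSX     0x00000080
    #define PPC_FEATURE2_ARCH_2_07  0x80000000

    typedef char *(*stpncpy_fn) (char *, const char *, size_t);

    /* Hypothetical stand-ins for the three implementations.  */
    static char *my_stpncpy_ppc (char *d, const char *s, size_t n)
    { (void) s; (void) n; return d; }
    static char *my_stpncpy_power7 (char *d, const char *s, size_t n)
    { (void) s; (void) n; return d; }
    static char *my_stpncpy_power8 (char *d, const char *s, size_t n)
    { (void) s; (void) n; return d; }

    static stpncpy_fn
    resolve_stpncpy (unsigned long hwcap, unsigned long hwcap2)
    {
      if (hwcap2 & PPC_FEATURE2_ARCH_2_07)      /* POWER8 or newer.  */
        return my_stpncpy_power8;
      if (hwcap & PPC_FEATURE_HAS_VSX)          /* POWER7.  */
        return my_stpncpy_power7;
      return my_stpncpy_ppc;                    /* Generic fallback.  */
    }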
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
new file mode 100644
index 0000000..ed906a4
--- /dev/null
@@ -0,0 +1,40 @@
+/* Optimized strncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)                            \
+  .section ".text";                                            \
+  ENTRY_2(__strncpy_power8)                                    \
+  .align ALIGNARG(alignt);                                     \
+  EALIGN_W_##words;                                            \
+  BODY_LABEL(__strncpy_power8):                                        \
+  cfi_startproc;                                               \
+  LOCALENTRY(__strncpy_power8)
+
+#undef END
+#define END(name)                                              \
+  cfi_endproc;                                                 \
+  TRACEBACK(__strncpy_power8)                                  \
+  END_2(__strncpy_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
index ae4e97a265615ccd461ed6c344c4d53953baa0eb..ffb0f23643ff2d8a85c96beb3769599738e09d6b 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
 
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc (strncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strncpy_power7
             : __strncpy_ppc);
 
 #endif
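
strncpy and stpncpy share one body below (see the USE_AS_STPNCPY blocks in
strncpy.S); the only difference is the return value, which the assembly
produces by conditionally adjusting r3.  For reference, the standard semantics
the two entry points must satisfy:

    #define _GNU_SOURCE
    #include <assert.h>
    #include <string.h>

    int
    main (void)
    {
      char buf[8];

      /* strncpy always returns dest.  */
      assert (strncpy (buf, "hi", sizeof buf) == buf);

      /* stpncpy returns a pointer to the first null byte it wrote...  */
      assert (stpncpy (buf, "hi", sizeof buf) == buf + 2);

      /* ...or dest + n if the source does not terminate within n bytes.  */
      assert (stpncpy (buf, "longer-string", sizeof buf) == buf + 8);

      return 0;
    }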
diff --git a/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
new file mode 100644
index 0000000..76a1466
--- /dev/null
@@ -0,0 +1,20 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
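
One detail worth noting before the strncpy.S body below: when the first read
would cross a 4K page, the code aligns the load down to a doubleword and uses
orc to force the bytes before the string to 0xff, so the null-byte check
cannot trigger on data that precedes src.  A sketch of that masking in C
(little-endian variant shown; the __LITTLE_ENDIAN__ conditional below picks
the shift direction):

    #include <stdint.h>

    static uint64_t
    masked_first_dword (const char *src)
    {
      uintptr_t addr = (uintptr_t) src;
      /* Aligned load: stays within src's page by construction.  */
      const uint64_t *aligned = (const uint64_t *) (addr & ~(uintptr_t) 7);
      uint64_t dw = *aligned;
      unsigned pad = (addr & 7) * 8;    /* bits before the string */
      uint64_t mask = ~0ULL << pad;     /* MASK = MASK << padding (LE) */
      return dw | ~mask;                /* orc: ignored bytes become 0xff */
    }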
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644
index 0000000..5fda953
--- /dev/null
@@ -0,0 +1,424 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPNCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending on data alignment.  Although recent powerpc64 kernels
+   use 64K pages by default, the page-cross handling assumes a minimum page
+   size of 4K.  */
+
+       .machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+
+        /* Check if [src]+15 will cross a 4K page by checking whether the
+           bit selecting the 4K page changes.  Basically:
+
+           uint64_t srcin = (uint64_t)src;
+           uint64_t ob = srcin & 4096UL;
+           uint64_t nb = (srcin+15UL) & 4096UL;
+           if (ob ^ nb)
+             goto pagecross;  */
+
+       addi    r10,r4,16
+       rlwinm  r9,r4,0,19,19
+
+       /* Since this is a leaf function, save some non-volatile registers in
+          the protected/red zone.  */
+       std     r26,-48(r1)
+       std     r27,-40(r1)
+
+       rlwinm  r8,r10,0,19,19
+
+       std     r28,-32(r1)
+       std     r29,-24(r1)
+
+       cmpld   cr7,r9,r8
+
+       std     r30,-16(r1)
+       std     r31,-8(r1)
+
+       beq     cr7,L(unaligned_lt_16)
+       rldicl  r9,r4,0,61
+       subfic  r8,r9,8
+       cmpld   cr7,r5,r8
+       bgt     cr7,L(pagecross)
+
+       /* At this point there are 1 to 15 bytes left to check and write.  Since
+          they could come either from the first unaligned 16-byte access or from
+          the bulk copy, the code uses an unrolled byte read/write instead of
+          trying to analyze the cmpb results.  */
+L(short_path):
+       mr      r9,r3
+L(short_path_1):
+       cmpdi   cr7,r5,0
+       beq     cr7,L(short_path_loop_end_1)
+L(short_path_2):
+       lbz     r10,0(r4)
+       cmpdi   cr7,r10,0
+       stb     r10,0(r9)
+       beq     cr7,L(zero_pad_start_1)
+       cmpdi   cr0,r5,1
+       addi    r8,r9,1
+       addi    r6,r5,-1
+       beq     cr0,L(short_path_loop_end_0)
+       lbz     r10,1(r4)
+       cmpdi   cr7,r10,0
+       stb     r10,1(r9)
+       beq     cr7,L(zero_pad_start_prepare_1)
+       addi    r10,r5,-3
+       b       L(short_path_loop_1)
+
+       .align  4
+L(short_path_loop):
+       lbz     r8,0(r4)
+       addi    r7,r10,-2
+       cmpdi   cr5,r8,0
+       stb     r8,0(r9)
+       beq     cr5,L(zero_pad_start_1)
+       beq     cr7,L(short_path_loop_end_0)
+       lbz     r8,1(r4)
+       cmpdi   cr7,r8,0
+       stb     r8,1(r9)
+       beq     cr7,L(zero_pad_start)
+       mr      r10,r7
+L(short_path_loop_1):
+       addic.  r5,r5,-2
+       addi    r9,r9,2
+       cmpdi   cr7,r10,0
+       addi    r4,r4,2
+       addi    r6,r9,1
+       bne     cr0,L(short_path_loop)
+#ifdef USE_AS_STPNCPY
+       mr      r3,r9
+       b       L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+       addi    r3,r9,1
+       b       L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+       mr      r3,r9
+#endif
+L(short_path_loop_end):
+       /* Restore non-volatile registers.  */
+       ld      r26,-48(r1)
+       ld      r27,-40(r1)
+       ld      r28,-32(r1)
+       ld      r29,-24(r1)
+       ld      r30,-16(r1)
+       ld      r31,-8(r1)
+       blr
+
+       /* This code pads the remainder of dest with null bytes.  The algorithm
+          calculates the remaining size and issues an unrolled doubleword loop
+          followed by a byte-by-byte set.  */
+       .align  4
+L(zero_pad_start):
+       mr      r5,r10
+       mr      r9,r6
+L(zero_pad_start_1):
+       srdi.   r8,r5,3
+       mr      r10,r9
+#ifdef USE_AS_STPNCPY
+       mr      r3,r9
+#endif
+       beq-    cr0,L(zero_pad_loop_b_start)
+       cmpldi  cr7,r8,1
+       li      r7,0
+       std     r7,0(r9)
+       beq     cr7,L(zero_pad_loop_b_prepare)
+       addic.  r8,r8,-2
+       addi    r10,r9,16
+       std     r7,8(r9)
+       beq     cr0,L(zero_pad_loop_dw_2)
+       std     r7,16(r9)
+       li      r9,0
+       b       L(zero_pad_loop_dw_1)
+
+       .align  4
+L(zero_pad_loop_dw):
+       addi    r10,r10,16
+       std     r9,-8(r10)
+       beq     cr0,L(zero_pad_loop_dw_2)
+       std     r9,0(r10)
+L(zero_pad_loop_dw_1):
+       cmpldi  cr7,r8,1
+       std     r9,0(r10)
+       addic.  r8,r8,-2
+       bne     cr7,L(zero_pad_loop_dw)
+       addi    r10,r10,8
+L(zero_pad_loop_dw_2):
+       rldicl  r5,r5,0,61
+L(zero_pad_loop_b_start):
+       cmpdi   cr7,r5,0
+       addi    r5,r5,-1
+       addi    r9,r10,-1
+       add     r10,r10,r5
+       subf    r10,r9,r10
+       li      r8,0
+       beq-    cr7,L(short_path_loop_end)
+
+       /* Write remaining 1-8 bytes.  */
+        .align  4
+       addi    r9,r9,1
+       mtocrf  0x1,r10
+       bf      29,4f
+        stw     r8,0(r9)
+        addi   r9,r9,4
+
+        .align  4
+4:      bf      30,2f
+        sth     r8,0(r9)
+        addi   r9,r9,2
+
+        .align  4
+2:      bf     31,1f
+        stb    r8,0(r9)
+
+       /* Restore non-volatile registers.  */
+1:     ld      r26,-48(r1)
+       ld      r27,-40(r1)
+       ld      r28,-32(r1)
+       ld      r29,-24(r1)
+       ld      r30,-16(r1)
+       ld      r31,-8(r1)
+       blr
+
+       /* The common case where [src]+16 will not cross a 4K page boundary.
+          In this case the code fast-checks the first 16 bytes using doubleword
+          reads and compares, and updates the destination if neither the total
+          size is reached nor a null byte is found in the source.  */
+       .align  4
+L(unaligned_lt_16):
+       cmpldi  cr7,r5,7
+       ble     cr7,L(short_path)
+       ld      r7,0(r4)
+       li      r8,0
+       cmpb    r8,r7,r8
+       cmpdi   cr7,r8,0
+       bne     cr7,L(short_path_prepare_2)
+       addi    r6,r5,-8
+       std     r7,0(r3)
+       addi    r9,r3,8
+       cmpldi  cr7,r6,7
+       addi    r7,r4,8
+       ble     cr7,L(short_path_prepare_1_1)
+       ld      r4,8(r4)
+       cmpb    r8,r4,r8
+       cmpdi   cr7,r8,0
+       bne     cr7,L(short_path_prepare_2_1)
+       std     r4,8(r3)
+       addi    r29,r3,16
+       addi    r5,r5,-16
+       /* Neither a null byte was found nor the total length was reached;
+          align to 16 bytes and issue a bulk copy/compare.  */
+       b       L(align_to_16b)
+
+       /* In the case of a 4K page boundary cross, the algorithm first aligns
+          the address down to a doubleword, calculates a mask based on the
+          alignment to ignore the leading bytes, and continues using doubleword
+          loads.  */
+       .align  4
+L(pagecross):
+       rldicr  r11,r4,0,59     /* Align the address to 8 bytes boundary.  */
+       li      r6,-1           /* MASK = 0xffffffffffffffffUL.  */
+       sldi    r9,r9,3         /* Calculate padding.  */
+       ld      r7,0(r11)       /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+       sld     r9,r6,r9        /* MASK = MASK << padding.  */
+#else
+       srd     r9,r6,r9        /* MASK = MASK >> padding.  */
+#endif
+       orc     r9,r7,r9        /* Mask bits that are not part of the
+                                  string.  */
+       li      r7,0
+       cmpb    r9,r9,r7        /* Check for null bytes in DWORD1.  */
+       cmpdi   cr7,r9,0
+       bne     cr7,L(short_path_prepare_2)
+       subf    r8,r8,r5        /* Adjust total length.  */
+       cmpldi  cr7,r8,8        /* Check if length was reached.  */
+       ble     cr7,L(short_path_prepare_2)
+
+       /* For the next checks the address is aligned, so check three more
+          doublewords to make sure 16 unaligned bytes can be read before
+          starting the bulk copy on 16-byte aligned addresses.  */
+       ld      r7,8(r11)
+       cmpb    r9,r7,r9
+       cmpdi   cr7,r9,0
+       bne     cr7,L(short_path_prepare_2)
+       addi    r7,r8,-8
+       cmpldi  cr7,r7,8
+       ble     cr7,L(short_path_prepare_2)
+       ld      r7,16(r11)
+       cmpb    r9,r7,r9
+       cmpdi   cr7,r9,0
+       bne     cr7,L(short_path_prepare_2)
+       addi    r8,r8,-16
+       cmpldi  cr7,r8,8
+       ble     cr7,L(short_path_prepare_2)
+       ld      r8,24(r11)
+       cmpb    r9,r8,r9
+       cmpdi   cr7,r9,0
+       bne     cr7,L(short_path_prepare_2)
+
+       /* No null byte was found in the 32 bytes read and the length was not
+          reached; read the source again using unaligned loads and store the
+          data.  */
+       ld      r9,0(r4)
+       addi    r29,r3,16
+       addi    r5,r5,-16
+       std     r9,0(r3)
+       ld      r9,8(r4)
+       std     r9,8(r3)
+
+       /* Align the source to 16 bytes and adjust the destination and size.  */
+L(align_to_16b):
+       rldicl  r9,r10,0,60
+       rldicr  r28,r10,0,59
+       add     r12,r5,r9
+       subf    r29,r9,r29
+
+       /* The bulk read/compare/copy loop loads two doublewords, then compares
+          and merges them in a single register for speed.  This is an attempt
+          to speed up the null-checking process for bigger strings.  */
+
+       cmpldi  cr7,r12,15
+       ble     cr7,L(short_path_prepare_1_2)
+
+       /* Main loop for large sizes, unrolled twice to make better use of the
+          pipeline.  */
+       ld      r8,0(r28)
+       ld      r10,8(r28)
+       li      r9,0
+       cmpb    r7,r8,r9
+       cmpb    r9,r10,r9
+       or.     r6,r9,r7
+       bne     cr0,L(short_path_prepare_2_3)
+       addi    r5,r12,-16
+       addi    r4,r28,16
+       std     r8,0(r29)
+       std     r10,8(r29)
+       cmpldi  cr7,r5,15
+       addi    r9,r29,16
+       ble     cr7,L(short_path_1)
+       mr      r11,r28
+       mr      r6,r29
+       li      r30,0
+       subfic  r26,r4,48
+       subfic  r27,r9,48
+
+       b       L(loop_16b)
+
+       .align  4
+L(loop_start):
+       ld      r31,0(r11)
+       ld      r10,8(r11)
+       cmpb    r0,r31,r7
+       cmpb    r8,r10,r7
+       or.     r7,r0,r8
+       addi    r5,r5,-32
+       cmpldi  cr7,r5,15
+       add     r4,r4,r26
+       add     r9,r9,r27
+       bne     cr0,L(short_path_prepare_2_2)
+       add     r4,r28,r4
+       std     r31,0(r6)
+       add     r9,r29,r9
+       std     r10,8(r6)
+       ble     cr7,L(short_path_1)
+
+L(loop_16b):
+       ld      r10,16(r11)
+       ld      r0,24(r11)
+       cmpb    r8,r10,r30
+       cmpb    r7,r0,r30
+       or.     r7,r8,r7
+       addi    r12,r12,-32
+       cmpldi  cr7,r12,15
+       addi    r11,r11,32
+       bne     cr0,L(short_path_2)
+       std     r10,16(r6)
+       addi    r6,r6,32
+       std     r0,-8(r6)
+       bgt     cr7,L(loop_start)
+
+       mr      r5,r12
+       mr      r4,r11
+       mr      r9,r6
+       b       L(short_path_1)
+
+       .align  4
+L(short_path_prepare_1_1):
+       mr      r5,r6
+       mr      r4,r7
+       b       L(short_path_1)
+L(short_path_prepare_1_2):
+       mr      r5,r12
+       mr      r4,r28
+       mr      r9,r29
+       b       L(short_path_1)
+L(short_path_prepare_2):
+       mr      r9,r3
+       b       L(short_path_2)
+L(short_path_prepare_2_1):
+       mr      r5,r6
+       mr      r4,r7
+       b       L(short_path_2)
+L(short_path_prepare_2_2):
+       mr      r5,r12
+       mr      r4,r11
+       mr      r9,r6
+       b       L(short_path_2)
+L(short_path_prepare_2_3):
+       mr      r5,r12
+       mr      r4,r28
+       mr      r9,r29
+       b       L(short_path_2)
+L(zero_pad_loop_b_prepare):
+       addi    r10,r9,8
+       rldicl  r5,r5,0,61
+       b       L(zero_pad_loop_b_start)
+L(zero_pad_start_prepare_1):
+       mr      r5,r6
+       mr      r9,r8
+       b       L(zero_pad_start_1)
+END (FUNC_NAME)
+
+#ifdef USE_AS_STPNCPY
+libc_hidden_def (__stpncpy)
+#else
+libc_hidden_builtin_def (strncpy)
+#endif
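
Finally, a C model of the inline padding performed by the zero_pad_* paths
above: bulk doubleword stores followed by a 4/2/1-byte tail selected by the
low bits of the remaining count (the assembly tests those bits with mtocrf/bf
instead of comparing and branching).  A simplified sketch; the real doubleword
loop is unrolled twice:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void
    zero_pad_sketch (char *p, size_t n)
    {
      const uint64_t zero = 0;

      /* Possibly unaligned doubleword stores (std in the assembly).  */
      while (n >= 8)
        {
          memcpy (p, &zero, 8);
          p += 8;
          n -= 8;
        }

      /* Tail of 0-7 bytes: each low bit of n selects one store, like
         the bf 29/30/31 tests after mtocrf.  */
      if (n & 4) { memcpy (p, &zero, 4); p += 4; }
      if (n & 2) { memcpy (p, &zero, 2); p += 2; }
      if (n & 1) *p = '\0';
    }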