Hi Vidya,
Some comments:
On 10-04-2014 10:40, vidya@linux.vnet.ibm.com wrote:
From: Vidya Ranganathan <vidya@linux.vnet.ibm.com>
The optimization is achieved by following techniques:
> data alignment [gain from aligned memory access on read/write]
> prefetch data [gain from cache misses by anticipating load]
> POWER7 gains performance with loop unrolling/unwinding
[gain by reduction of branch penalty].
ChangeLog:
2014-03-27 Vidya Ranganathan <vidya@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/power7/strncpy.S: New file: Optimization.
* sysdeps/powerpc/powerpc64/multiarch/strncpy.c: New file:
multiarch strncpy for PPC64.
* sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c: New file
* sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S: New file
* sysdeps/powerpc/powerpc64/multiarch/Makefile: Add strpcpy, stpncpy
multiarch optimizations
Missing period.
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
new file mode 100644
index 0000000..0766fa8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -0,0 +1,35 @@
+/* Multiple versions of strncpy.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/ >. */
+
+/* Define multiple versions only for definition in libc. */
+#ifndef NOT_IN_libc
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
+extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+ ifunc symbol properly. */
+libc_ifunc (strncpy,
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strncpy_power7
+ : __strncpy_ppc);
+
+#endif
The indentation is strange, please follow the other files.
diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S
new file mode 100644
index 0000000..31fecc6
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strncpy.S
@@ -0,0 +1,468 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the functions
+
+ char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+ AND
+
+ char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+ The algorithm is as follows:
+ > if src and dest are 8 byte aligned, perform double word copy
+ else
+ > if src and dest are 4 byte aligned, perform word copy
+ else
+ > copy byte by byte on unaligned addresses.
+
+ The aligned comparison are made using cmpb instructions. */
+
+/* The focus on optimization for performance improvements are as follows:
+ 1. data alignment [gain from aligned memory access on read/write]
+ 2. prefetch data [gain from cache misses by anticipating load]
+ 3. POWER7 gains performance with loop unrolling/unwinding
+ [gain by reduction of branch penalty]. */
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+32)
+
+ .machine power7
+EALIGN(FUNC_NAME, 4, 0)
+ CALL_MCOUNT 3
+
+ dcbt 0, r3 /* CPU pre-fetch dst to avoid cache miss */
+ dcbt 0, r4 /* CPU pre-fetch src to avoid cache miss */
+
+ mflr r0 /* load link register LR to r0 */
+ or r9, r3, r4 /* to verify source and destination */
+ rldicl. r10, r9, 0, 61 /* is doubleWord aligned ..? */
+
+ std r31, -8(r1) /* save callers register , r31 */
+ std r30, -16(r1) /* save callers register , r30 */
+ std r15, -24(r1) /* save callers register , r15 */
+ std r0, 16(r1) /* store the link register */
+ stdu r1, -FRAMESIZE(r1) /* create the stack frame */
+
+ mr r15, r3 /* save r3 into r15 as retcode for strncpy */
+ mr r31, r3 /* save r3 into r31 for use */
+ beq cr0,L(dwordAligned)
+ rldicl. r10, r9, 0, 62 /* is word aligned .. ? */
+ bne cr0,L(byte_by_byte)
Checking the patch performance I noted the aligned load/store using words is not
showing much difference in word aligned source/destiny. I tested removing the
code and the shorted code path helped latency a little for shorter strings. I
think it is better to use doubleword logic.
+
+L(zeroFill):
+ cmpdi cr7, r10, 0 /* compare if length is zero */
+ beq cr7,L(hop2Return)
+ mr r3, r31 /* fill buffer with zero */
+ li r4, 0 /* buffer size to fill zero with */
+ mr r5, r10 /* fill buffer target */
+ bl memset /* fill with zeroes */
+ nop /* trigger CPU activity */
This will generate a PLT relocation, you need to call __memset_power7 for the IFUNC optimization
(st[r/p]ncpy-power7) and __GI_memset/memset (if not SHARED) for non multiarch builds. I fixed
a similar issue for strncat optimization recently, check de21c33c068c8e39afb5711613a7c083c11ce6a1.