This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [PATCH] Power7 optimizations for strncpy and stpncpy.

From: R Vidya <vidya at linux dot vnet dot ibm dot com>
To: libc-alpha at sourceware dot org
Date: Fri, 02 May 2014 23:44:53 +0530
Subject: Re: [PATCH] Power7 optimizations for strncpy and stpncpy.
Authentication-results: sourceware.org; auth=none
References: <1397137214-63282-1-git-send-email-vidya at linux dot vnet dot ibm dot com> <5347F61C dot 4040006 at linux dot vnet dot ibm dot com>

Hi , Comments incorporated and patch sent.

On Friday 11 April 2014 07:33 PM, Adhemerval Zanella wrote:

Hi Vidya,

Some comments:


On 10-04-2014 10:40, vidya@linux.vnet.ibm.com wrote:

From: Vidya Ranganathan <vidya@linux.vnet.ibm.com>

The optimization is achieved by following techniques:
    > data alignment [gain from aligned memory access on read/write]
    > prefetch data [gain from cache misses by anticipating load]
    > POWER7 gains performance with loop unrolling/unwinding
       [gain by reduction of branch penalty].

ChangeLog:
2014-03-27  Vidya Ranganathan  <vidya@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/power7/strncpy.S: New file: Optimization.
	* sysdeps/powerpc/powerpc64/multiarch/strncpy.c: New file:
	multiarch strncpy for PPC64.
	* sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c: New file
	* sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S: New file
	* sysdeps/powerpc/powerpc64/multiarch/Makefile: Add strpcpy, stpncpy
	multiarch optimizations

Missing period.

diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
new file mode 100644
index 0000000..0766fa8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -0,0 +1,35 @@
+/* Multiple versions of strncpy.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/ >. */
+
+/* Define multiple versions only for definition in libc. */
+#ifndef NOT_IN_libc
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
+extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+ ifunc symbol properly. */
+libc_ifunc (strncpy,
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strncpy_power7
+ : __strncpy_ppc);
+
+#endif

The indentation is strange, please follow the other files.

diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S
new file mode 100644
index 0000000..31fecc6
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strncpy.S
@@ -0,0 +1,468 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Implements the functions
+
+   char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+   AND
+
+   char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+   The algorithm is as follows:
+   > if src and dest are 8 byte aligned, perform double word copy
+     else
+   > if src and dest are 4 byte aligned, perform word copy
+     else
+   > copy byte by byte on unaligned addresses.
+
+   The aligned comparison are made using cmpb instructions.  */
+
+/* The focus on optimization for performance improvements are as follows:
+   1. data alignment [gain from aligned memory access on read/write]
+   2. prefetch data [gain from cache misses by anticipating load]
+   3. POWER7 gains performance with loop unrolling/unwinding
+      [gain by reduction of branch penalty].  */
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+#define		FRAMESIZE	(FRAME_MIN_SIZE+32)
+
+	.machine  power7
+EALIGN(FUNC_NAME, 4, 0)
+	CALL_MCOUNT 3
+
+	dcbt 0, r3		/* CPU pre-fetch dst to avoid cache miss  */
+	dcbt 0, r4		/* CPU pre-fetch src to avoid cache miss  */
+
+	mflr r0			/* load link register LR to r0  */
+	or r9, r3, r4		/* to verify source and destination  */
+	rldicl. r10, r9, 0, 61	/* is doubleWord aligned ..?  */
+
+	std r31, -8(r1)		/* save callers register , r31  */
+	std r30, -16(r1)	/* save callers register , r30  */
+	std r15, -24(r1)	/* save callers register , r15  */
+	std r0, 16(r1)		/* store the link register  */
+	stdu r1, -FRAMESIZE(r1)	/* create the stack frame  */
+
+	mr r15, r3		/* save r3 into r15 as retcode for strncpy  */
+	mr r31, r3		/* save r3 into r31 for use  */
+	beq cr0,L(dwordAligned)
+	rldicl. r10, r9, 0, 62	/* is word aligned .. ?  */
+	bne cr0,L(byte_by_byte)

Checking the patch performance I noted the aligned load/store using words is not
showing much difference in word aligned source/destiny.  I tested removing the
code and the shorted code path helped latency a little for shorter strings.  I
think it is better to use doubleword logic.

+
+L(zeroFill):
+	cmpdi cr7, r10, 0	/* compare if length is zero  */
+	beq cr7,L(hop2Return)
+	mr r3, r31		/* fill buffer with zero  */
+	li r4, 0		/* buffer size to fill zero with  */
+	mr r5, r10		/* fill buffer target  */
+	bl memset		/* fill with zeroes  */
+	nop			/* trigger CPU activity  */

This will generate a PLT relocation, you need to call __memset_power7 for the IFUNC optimization
(st[r/p]ncpy-power7) and __GI_memset/memset (if not SHARED) for non multiarch builds.  I fixed
a similar issue for strncat optimization recently, check de21c33c068c8e39afb5711613a7c083c11ce6a1.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]