This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
From 464f5b0d19f0bd8873151627cd1449288f80e9fc Mon Sep 17 00:00:00 2001 From: Vidya Ranganathan <vidya@linux.vnet.ibm.com> Date: Fri, 28 Feb 2014 11:32:09 -0500 Subject: [PATCH] Multiarch optimization for strncat() on ppc64 and ppc64le. I have attached the benchtest output to show the performance improvement. The optimization is achieved by the following techniques: 1. Word- and doubleword-aligned memory access using the POWER7 cmpb instruction. 2. Loop unrolling, which gives a gain on POWER7. 3. CPU pre-fetch to avoid cache misses. ChangeLog: 2014-02-28 Vidya Ranganathan <vidya@linux.vnet.ibm.com> * sysdeps/powerpc/powerpc64/power7/strncat.S: New file: Optimization. * sysdeps/powerpc/powerpc64/multiarch/strncat.c: New file: multiarch strncat for PPC64. * sysdeps/powerpc/powerpc64/multiarch/strncat-ppc64.c: New file. * sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: New file. * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add strncat multiarch optimizations. * sysdeps/powerpc/powerpc64/multiarch/Makefile: Likewise. Signed-off-by: Vidya Ranganathan <vidya@linux.vnet.ibm.com> --- sysdeps/powerpc/powerpc64/multiarch/Makefile | 3 +- .../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 8 + .../powerpc/powerpc64/multiarch/strncat-power7.S | 40 ++++ .../powerpc/powerpc64/multiarch/strncat-ppc64.c | 30 +++ sysdeps/powerpc/powerpc64/multiarch/strncat.c | 31 +++ sysdeps/powerpc/powerpc64/power7/strncat.S | 237 +++++++++++++++++++++ 6 files changed, 348 insertions(+), 1 deletion(-) create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncat-ppc64.c create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncat.c create mode 100644 sysdeps/powerpc/powerpc64/power7/strncat.S diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 3c47316..795bb50 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ 
b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -13,7 +13,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \ wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \ wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \ - strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 + strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \ + strncat-power7 strncat-ppc64 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 6bbdd4e..3ebc066 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -238,5 +238,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ppc)) + /* Support sysdeps/powerpc/powerpc64/multiarch/strncat.c */ + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, + hwcap & PPC_FEATURE_HAS_VSX, + __strncat_power7) + IFUNC_IMPL_ADD (array, i, strncat, 1, + __strncat_ppc)) + return i; } diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S new file mode 100644 index 0000000..0e8933c --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S @@ -0,0 +1,40 @@ +/* Optimized strncat implementation for POWER7. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#undef EALIGN /* Re-defined so the included code is opened under the fixed symbol __strncat_power7; the NAME argument is ignored. */ +#define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__strncat_power7) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__strncat_power7): \ + cfi_startproc; \ + LOCALENTRY(__strncat_power7) + +#undef END /* Likewise closes the function as __strncat_power7, ignoring NAME. */ +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__strncat_power7) \ + END_2(__strncat_power7) + +#undef libc_hidden_builtin_def /* Re-defined below to expand to nothing. */ +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/power7/strncat.S> /* The implementation, assembled with the macro overrides above in effect. */ diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strncat-ppc64.c new file mode 100644 index 0000000..14b56f5 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strncat-ppc64.c @@ -0,0 +1,30 @@ +/* Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <string.h> + +#define STRNCAT __strncat_ppc /* NOTE(review): assumes <string/strncat.c> names its function through STRNCAT - confirm. */ +#ifdef SHARED + +# undef libc_hidden_builtin_def /* For SHARED builds, re-defined below to expand to __hidden_ver1 (__strncat_ppc, __GI_strncat, __strncat_ppc). */ +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncat_ppc, __GI_strncat, __strncat_ppc); +#endif + +extern __typeof (strncat) __strncat_ppc attribute_hidden; + +#include <string/strncat.c> /* Compile the code from string/strncat.c with the definitions above in effect. */ diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat.c b/sysdeps/powerpc/powerpc64/multiarch/strncat.c new file mode 100644 index 0000000..db98ec1 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strncat.c @@ -0,0 +1,31 @@ +/* Multiple versions of strncat. PowerPC64 version. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef NOT_IN_libc /* Everything up to the matching #endif is skipped when NOT_IN_libc is defined. */ +# include <string.h> +# include <shlib-compat.h> +# include "init-arch.h" + +extern __typeof (strncat) __strncat_ppc attribute_hidden; +extern __typeof (strncat) __strncat_power7 attribute_hidden; + +libc_ifunc (strncat, /* Selects __strncat_power7 when hwcap has PPC_FEATURE_HAS_VSX set, otherwise __strncat_ppc. */ + (hwcap & PPC_FEATURE_HAS_VSX) + ? __strncat_power7 + : __strncat_ppc); +#endif diff --git a/sysdeps/powerpc/powerpc64/power7/strncat.S b/sysdeps/powerpc/powerpc64/power7/strncat.S new file mode 100644 index 0000000..9c7491e --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/strncat.S @@ -0,0 +1,237 @@ +/* Optimized strncat implementation for PowerPC64/POWER7. 
+ + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* The algorithm is as follows for aligned memory access (used only when n > 7) : + + if (address of s2 & 0x7UL) is zero, + perform aligned doubleword catenation + else + if (address of s2 & 0x3UL) is zero, + perform aligned word catenation + else + perform unaligned (byte by byte) catenation + + The aligned comparisons are made using cmpb instructions. */ + +/* char* [r3] strncat (char *s1 [r3], + const char *s2 [r4], + size_t size [r5]) */ + +#include <sysdep.h> + +#ifndef STRNCAT +# undef strncat +# define STRNCAT strncat +#endif + + .machine power7 +EALIGN(STRNCAT, 4, 0) + CALL_MCOUNT 3 + + mflr r0 /* load link register LR to r0 */ + + /* we shall use r29, r30 and r31 non volatile register for retention. + save these registers below the stack pointer before the frame is created. + */ + std r29, -24(r1) /* save caller's r29 */ + std r30, -16(r1) /* save caller's r30 */ + std r31, -8(r1) /* save caller's r31 */ + + std r0, 16(r1) /* store the link register */ + stdu r1,-144(r1) /* create the stack frame */ + + /* improve performance with CPU pre-fetch */ + dcbt 0, r3 /* pre-fetch s1 to avoid cache miss */ + dcbt 0, r4 /* pre-fetch s2 to avoid cache miss */ + + mr. 
r29, r5 /* save "n" in r29 ; sets cr0 for the n == 0 test below */ + mr r30, r3 /* save "s1" in r30 from r3 */ + beq cr0,L(done) /* n == 0 ; nothing to append, return s1 */ + + mr r31, r4 /* save "s2" in r31 from r4 */ + bl strlen /* r3 = strlen (s1). NOTE(review): confirm a direct call to strlen is acceptable from inside libc */ + nop /* NOTE(review): presumably the slot the linker may patch to restore the TOC pointer after the call - confirm */ + cmpldi cr7, r29, 7 /* if n is <= 7 ; process byte-by-byte */ + add r3, r30, r3 /* r3 = s1 + strlen (s1), the address of the terminating NUL of s1 */ + bgt cr7,L(alignment) /* n > 7 ; choose a path by the alignment of s2 */ + + cmpldi cr7, r29, 3 /* if n is >= 4, we can byte-unroll */ + addi r9, r3, -1 /* r9 = one byte before the write position ; stores below use 1(r9) */ + bgt cr7, L(bytes_unroll) /* n is 4..7 ; use the unrolled byte loop */ + +L(byte_by_byte): + lbz r10, 0(r31) /* first byte of s2 */ + addi r8, r9, 1 /* r8 = address this byte is stored to */ + cmpdi cr7, r10, 0 /* check for NUL in s2 */ + stb r10, 1(r9) /* store it ; a NUL byte also terminates s1 */ + beq cr7, L(done) + add r9, r9, r29 /* these three instructions leave r9 = r29 (bytes left to copy) */ + subf r9, r8, r9 + addi r9, r9, 1 + mtctr r9 /* CTR = r29 */ + b L(loop2) + .p2align 4 +L(loop1): + lbzu r10, 1(r31) /* advance s2 and load the next byte */ + cmpdi cr7, r10, 0 + stbu r10, 1(r8) /* advance the write pointer and store the byte */ + beq cr7,L(done) /* the byte was NUL ; s1 is already terminated */ +L(loop2): + mr r9, r8 /* r9 = address of the last byte written */ + bdnz L(loop1) /* decrement CTR ; loop while it is non zero */ + beq cr7,L(done) +L(nullTerminate): + li r10, 0 /* load NUL for termination */ + stb r10, 1(r9) /* terminate s1 after the last byte written */ + .p2align 4 /* a small section here */ +L(done): /* common exit : pop the frame, restore registers, return s1 */ + addi r1, r1, 144 /* restore stack pointer */ + mr r3, r30 /* set the return value to s1 */ + ld r0, 16(r1) /* read the saved link register */ + ld r29, -24(r1) /* restore caller's r29 */ + ld r30, -16(r1) /* restore caller's r30 */ + ld r31, -8(r1) /* restore caller's r31 */ + mtlr r0 /* restore link register */ + blr /* branch to link register */ + + .p2align 4 +L(alignment): + rldicl. r9, r31, 0, 61 /* check if s2 is 8byte aligned */ + beq cr0,L(dwordAligned) + rldicl. 
r10, r31, 0, 62 /* check if s2 is 4byte aligned */ + bne cr0,L(bytes_unroll) + + /* if s2 is word aligned ; we load and store word */ + srdi r8, r29, 2 /* compute count for CTR to loop ; count = n/4 */ + li r7, 0 /* load r7 with NUL */ + li r9, 0 /* load r9 with MASK '0' */ + + /* read, write 4 bytes at a time */ + mtctr r8 /* move count to CTR */ +L(loop4): + lwz r10, 0(r31) /* read word from s2 */ + cmpb r6, r10, r9 /* compare each byte just read against zero */ + cmpwi r6, 0 /* low word of r6 is zero when the word holds no NUL byte */ + bne+ L(a4) /* a NUL is present ; finish byte by byte */ + stw r10, 0(r3) /* append word from s2 to s1 */ + addi r3, r3, 4 /* increment s1 */ + addi r31, r31, 4 /* increment s2 */ + subi r29, r29, 4 /* decrement "n" by 4 */ + bdnz L(loop4) /* continue while "count" is non zero */ + +L(a4): + cmpdi r29, 0 /* if "n" is already zero ; we skip */ + beq+ L(align4align) + + mtctr r29 /* process left over bytes in "n" */ + unaligned1: lbz r10, 0(r31) /* read a byte from s2 */ + cmpw r10, r7 /* if byte is NUL, we stop here */ + beq+ L(align4align) /* skip processing further if NUL */ + stb r10, 0(r3) /* if not NUL ; store byte into s1 */ + addi r3, r3, 1 /* increment s1 by 1 */ + addi r31, r31, 1 /* increment s2 by 1 */ + bdnz unaligned1 /* decrement CTR and loop while it is non zero */ +L(align4align): + stb r7, 0(3) /* terminate s1 with NUL. NOTE(review): base is written as bare 3 here but r3 in L(align8align) - presumably the same register */ + + b L(done) + .p2align 4 + /* s2 is not word aligned, or n is 4..7 ; so process byte by byte. + The loop is unrolled to four bytes per iteration for POWER7. 
*/ +L(bytes_unroll): + addi r9, r3, -1 /* r9 = one byte before the write position */ + srdi r10, r29, 2 /* count = n/4 */ + mtctr r10 + b L(L10) + .p2align 4 +L(L44): + lbz r10, 1(r31) /* load byte */ + cmpdi cr7, r10, 0 /* compare ; if byte not zero ; continue */ + stb r10, 2(r9) /* store byte */ + beq cr7, L(done) + addi r31, r31, 4 /* advance s2 by the four bytes of this iteration */ + + lbz r10, -2(r31) /* third byte of the group */ + cmpdi cr7, r10, 0 + stb r10, 3(r9) + beq cr7, L(done) + + lbz r10, -1(r31) /* fourth byte of the group */ + cmpdi cr7, r10, 0 + stbu r10, 4(r9) /* store it and advance r9 by 4 */ + beq cr7, L(done) + + bdz L(leftNbytes) /* decrement CTR ; leave the loop when it reaches zero */ + +L(L10): + lbz r10, 0(r31) /* first byte of the group */ + cmpdi cr7, r10, 0 + stb r10, 1(r9) + bne cr7,L(L44) + b L(done) + .p2align 4 + /* if s2 is double word aligned ; we load and store double word. */ +L(dwordAligned): + /* read, write 8 bytes at a time */ + srdi r8, r29, 3 /* compute count for CTR to loop; count = n/8 */ + li r7, 0 /* load r7 with NUL */ + li r10, 0 /* load r10 with MASK '0' */ + + mtctr r8 /* move count to CTR */ +L(loop8): + ld r9, 0(r31) /* read double word from s2 */ + cmpb r6, r9, r10 /* compare each byte just read against zero */ + cmpdi r6, 0 /* r6 is zero when the double word holds no NUL byte */ + bne+ L(a8) /* a NUL is present ; finish byte by byte */ + std r9, 0(r3) /* append double word from s2 to s1 */ + addi r3, r3, 8 /* increment s1 */ + addi r31, r31, 8 /* increment s2 */ + subi r29, r29, 8 /* decrement "n" by 8 */ + bdnz L(loop8) /* continue while "count" is non zero */ + +L(a8): + cmpdi r29, 0 /* if "n" is already zero ; we skip */ + beq+ L(align8align) + + mtctr r29 /* process left over bytes in "n" */ + unaligned0: lbz r9, 0(r31) /* read a byte from s2 */ + cmpw r9, r7 /* if byte is NUL, we stop here */ + beq+ L(align8align) /* skip processing further if NUL */ + stb r9, 0(r3) /* if not NUL ; store byte into s1 */ + addi r3, r3, 1 /* increment s1 by 1 */ + addi r31, r31, 1 /* increment s2 by 1 */ + bdnz unaligned0 /* decrement CTR and loop while it is non zero */ +L(align8align): + stb r7, 0(r3) /* terminate s1 with NUL */ + + addi r1, r1, 144 /* restore stack 
pointer ; same exit sequence as L(done) */ + mr r3, r30 /* set the return value to s1 */ + ld r0, 16(r1) /* read the saved link register */ + ld r29, -24(r1) /* restore caller's r29 */ + ld r30, -16(r1) /* restore caller's r30 */ + ld r31, -8(r1) /* restore caller's r31 */ + mtlr r0 /* restore link register */ + blr /* branch to link register */ + + .p2align 4 +L(leftNbytes): + rldicl. r29, r29, 0, 62 /* r29 = n & 3 ; bytes left after the unrolled loop */ + bne cr0,L(byte_by_byte) /* process bytes one by one */ + b L(nullTerminate) /* now, finish catenation with NUL termination */ +END(STRNCAT) -- 1.8.3.1
Attachment:
bench-strncat.ppc64.out
Description: Text document
Attachment:
bench-strncat.ppc64le.out
Description: Text document
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |