From 9314d3545e6641063b490918e2e8716556ba20db Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 27 Dec 2016 17:48:37 -0200 Subject: [PATCH] powerpc64: strchr/strchrnul optimization for power8 The P7 code is used for <=32B strings and for > 32B vectorized loops are used. This shows as an average 25% improvement depending on the position of search character. The performance is same for shorter strings. Tested on ppc64 and ppc64le. --- ChangeLog | 16 + sysdeps/powerpc/powerpc64/multiarch/Makefile | 4 +- .../powerpc64/multiarch/ifunc-impl-list.c | 6 + .../powerpc64/multiarch/strchr-power8.S | 39 ++ sysdeps/powerpc/powerpc64/multiarch/strchr.c | 3 + .../powerpc64/multiarch/strchrnul-power8.S | 39 ++ .../powerpc/powerpc64/multiarch/strchrnul.c | 3 + sysdeps/powerpc/powerpc64/power8/strchr.S | 368 ++++++++++++++++++ sysdeps/powerpc/powerpc64/power8/strchrnul.S | 23 ++ 9 files changed, 499 insertions(+), 2 deletions(-) create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strchr-power8.S create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strchrnul-power8.S create mode 100644 sysdeps/powerpc/powerpc64/power8/strchr.S create mode 100644 sysdeps/powerpc/powerpc64/power8/strchrnul.S diff --git a/ChangeLog b/ChangeLog index b794caccc0..d9b2c98156 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +2016-12-28 Rajalakshmi Srinivasaraghavan + + * sysdeps/powerpc/powerpc64/multiarch/Makefile + (sysdep_routines): Add strchr-power8 and strchrnul_power8. + * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c + (strchr): Add __strchr_power8 to list of strchr functions. + (strchrnul): Add __strchrnul_power8 to list of strchr functions. + * sysdeps/powerpc/powerpc64/multiarch/strchr-power8.S: New file. + * sysdeps/powerpc/powerpc64/multiarch/strchrnul-power8.S: New file. + * sysdeps/powerpc/powerpc64/multiarch/strchr.c + (strchr): Add __strchr_power8 to ifunc list. + * sysdeps/powerpc/powerpc64/multiarch/strchrnul.c + (__strchrnul): Add __strchrnul_power8 to ifunc list. + * sysdeps/powerpc/powerpc64/power8/strchr.S: New file. + * sysdeps/powerpc/powerpc64/power8/strchrnul.S: New file. + 2016-12-28 Florian Weimer * support/Makefile (libsupport-routines): Add diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 2997b9d6fb..f5889a322b 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -10,8 +10,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ strncase-power7 strncase_l-power7 \ strncmp-power9 strncmp-power8 strncmp-power7 \ strncmp-power4 strncmp-ppc64 \ - strchr-power7 strchr-ppc64 \ - strchrnul-power7 strchrnul-ppc64 \ + strchr-power8 strchr-power7 strchr-ppc64 \ + strchrnul-power8 strchrnul-power7 strchrnul-ppc64 \ strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \ stpcpy-power7 stpcpy-ppc64 \ strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \ diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 2d085a206b..703a49b421 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -123,6 +123,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strchr.c. */ IFUNC_IMPL (i, name, strchr, + IFUNC_IMPL_ADD (array, i, strchr, + hwcap2 & PPC_FEATURE2_ARCH_2_07, + __strchr_power8) IFUNC_IMPL_ADD (array, i, strchr, hwcap & PPC_FEATURE_HAS_VSX, __strchr_power7) @@ -131,6 +134,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strchrnul.c. */ IFUNC_IMPL (i, name, strchrnul, + IFUNC_IMPL_ADD (array, i, strchrnul, + hwcap2 & PPC_FEATURE2_ARCH_2_07, + __strchrnul_power8) IFUNC_IMPL_ADD (array, i, strchrnul, hwcap & PPC_FEATURE_HAS_VSX, __strchrnul_power7) diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchr-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strchr-power8.S new file mode 100644 index 0000000000..dd0b7f5b28 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strchr-power8.S @@ -0,0 +1,39 @@ +/* Optimized strchr implementation for POWER8. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#undef ENTRY +#define ENTRY(name) \ + .section ".text"; \ + ENTRY_2(__strchr_power8) \ + .align ALIGNARG(2); \ + BODY_LABEL(__strchr_power8): \ + cfi_startproc; \ + LOCALENTRY(__strchr_power8) + +#undef END +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__strchr_power8) \ + END_2(__strchr_power8) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchr.c b/sysdeps/powerpc/powerpc64/multiarch/strchr.c index e24d6b319e..2ffb1f6c5d 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strchr.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strchr.c @@ -27,11 +27,14 @@ extern __typeof (strchr) __strchr_ppc attribute_hidden; extern __typeof (strchr) __strchr_power7 attribute_hidden; +extern __typeof (strchr) __strchr_power8 attribute_hidden; # undef strchr /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ libc_ifunc_redirected (__redirect_strchr, strchr, + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __strchr_power8 : (hwcap & PPC_FEATURE_HAS_VSX) ? __strchr_power7 : __strchr_ppc); diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchrnul-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strchrnul-power8.S new file mode 100644 index 0000000000..d0bfedac22 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strchrnul-power8.S @@ -0,0 +1,39 @@ +/* Optimized strchrnul implementation for POWER8. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#undef ENTRY +#define ENTRY(name) \ + .section ".text"; \ + ENTRY_2(__strchrnul_power8) \ + .align ALIGNARG(2); \ + BODY_LABEL(__strchrnul_power8): \ + cfi_startproc; \ + LOCALENTRY(__strchrnul_power8) + +#undef END +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__strchrnul_power8) \ + END_2(__strchrnul_power8) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c index 682aa0fef7..63df401d51 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c @@ -23,10 +23,13 @@ extern __typeof (__strchrnul) __strchrnul_ppc attribute_hidden; extern __typeof (__strchrnul) __strchrnul_power7 attribute_hidden; +extern __typeof (__strchrnul) __strchrnul_power8 attribute_hidden; /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ libc_ifunc (__strchrnul, + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __strchrnul_power8 : (hwcap & PPC_FEATURE_HAS_VSX) ? __strchrnul_power7 : __strchrnul_ppc); diff --git a/sysdeps/powerpc/powerpc64/power8/strchr.S b/sysdeps/powerpc/powerpc64/power8/strchr.S new file mode 100644 index 0000000000..331d0a6c48 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/strchr.S @@ -0,0 +1,368 @@ +/* Optimized strchr implementation for PowerPC64/POWER8. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#ifdef USE_AS_STRCHRNUL +# define FUNC_NAME __strchrnul +#else +# define FUNC_NAME strchr +#endif +/* int [r3] strchr (char *s [r3], int c [r4]) */ +/* TODO: change these to the actual instructions when the minimum required + binutils allows it. */ +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) +#define VBPERMQ(t,a,b) .long (0x1000054c \ + | ((t)<<(32-11)) \ + | ((a)<<(32-16)) \ + | ((b)<<(32-21)) ) +/* TODO: change this to .machine power8 when the minimum required binutils + allows it. */ + .machine power7 +ENTRY (FUNC_NAME) + CALL_MCOUNT 2 + dcbt 0,r3 + clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ + cmpdi cr7,r4,0 + ld r12,0(r8) /* Load doubleword from memory. */ + li r0,0 /* Doubleword with null chars to use + with cmpb. */ + + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + + beq cr7,L(null_match) + + /* Replicate byte to doubleword. */ + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 + insrdi r4,r4,32,0 + + /* Now r4 has a doubleword of c bytes and r0 has + a doubleword of null bytes. */ + + cmpb r10,r12,r4 /* Compare each byte against c byte. */ + cmpb r11,r12,r0 /* Compare each byte against null byte. */ + + /* Move the doublewords left and right to discard the bits that are + not part of the string and bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + srd r11,r11,r6 + sld r10,r10,r6 + sld r11,r11,r6 +#else + sld r10,r10,r6 + sld r11,r11,r6 + srd r10,r10,r6 + srd r11,r11,r6 +#endif + or r5,r10,r11 /* OR the results to speed things up. */ + cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes + have been found. */ + bne cr7,L(done) + + mtcrf 0x01,r8 + + /* Are we now aligned to a doubleword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bt 28,L(loop) + + /* Handle WORD2 of pair. */ + ldu r12,8(r8) + cmpb r10,r12,r4 + cmpb r11,r12,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + bne cr7,L(done) + b L(loop) /* We branch here (rather than falling through) + to skip the nops due to heavy alignment + of the loop below. */ + + .p2align 5 +L(loop): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the null-checking process for bigger strings. */ + ld r12,8(r8) + ldu r9,16(r8) + cmpb r10,r12,r4 + cmpb r11,r12,r0 + cmpb r6,r9,r4 + cmpb r7,r9,r0 + or r5,r10,r11 + or r9,r6,r7 + or r12,r5,r9 + cmpdi cr7,r12,0 + beq cr7,L(vector) + /* OK, one (or both) of the doublewords contains a c/null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a c/null byte. */ + + cmpdi cr6,r5,0 + addi r8,r8,-8 + bne cr6,L(done) + + /* The c/null byte must be in the second doubleword. Adjust the + address again and move the result of cmpb to r10 so we can calculate + the pointer. */ + + mr r10,r6 + mr r11,r7 + addi r8,r8,8 +#ifdef USE_AS_STRCHRNUL + mr r5, r9 +#endif + /* r10/r11 have the output of the cmpb instructions, that is, + 0xff in the same position as the c/null byte in the original + doubleword from the string. Use that to calculate the pointer. */ +L(done): +#ifdef USE_AS_STRCHRNUL + mr r10, r5 +#endif +#ifdef __LITTLE_ENDIAN__ + addi r3,r10,-1 + andc r3,r3,r10 + popcntd r0,r3 +# ifndef USE_AS_STRCHRNUL + addi r4,r11,-1 + andc r4,r4,r11 + cmpld cr7,r3,r4 + bgt cr7,L(no_match) +# endif +#else + cntlzd r0,r10 /* Count leading zeros before c matches. */ +# ifndef USE_AS_STRCHRNUL + cmpld cr7,r11,r10 + bgt cr7,L(no_match) +# endif +#endif + srdi r0,r0,3 /* Convert leading zeros to bytes. */ + add r3,r8,r0 /* Return address of the matching c byte + or null in case c was not found. */ + blr + + /* Check the first 32B in GPR's and move to vectorized loop. */ + .p2align 5 +L(vector): + addi r3, r8, 8 + andi. r10, r3, 31 + bne cr0, L(loop) + vspltisb v0, 0 + /* Precompute vbpermq constant. */ + vspltisb v10, 3 + lvsl v11, r0, r0 + vslb v10, v11, v10 + MTVRD(v1,r4) + li r5, 16 + vspltb v1, v1, 7 + /* Compare 32 bytes in each loop. */ +L(continue): + lvx v4, 0, r3 + lvx v5, r3, r5 + vcmpequb v2, v0, v4 + vcmpequb v3, v0, v5 + vcmpequb v6, v1, v4 + vcmpequb v7, v1, v5 + vor v8, v2, v3 + vor v9, v6, v7 + vor v11, v8, v9 + vcmpequb. v11, v0, v11 + addi r3, r3, 32 + blt cr6, L(continue) + /* One (or both) of the quadwords contains a c/null byte. */ + addi r3, r3, -32 +#ifndef USE_AS_STRCHRNUL + vcmpequb. v11, v0, v9 + blt cr6, L(no_match) +#endif + /* Permute the first bit of each byte into bits 48-63. */ + VBPERMQ(v2, v2, v10) + VBPERMQ(v3, v3, v10) + VBPERMQ(v6, v6, v10) + VBPERMQ(v7, v7, v10) + /* Shift each component into its correct position for merging. */ +#ifdef __LITTLE_ENDIAN__ + vsldoi v3, v3, v3, 2 + vsldoi v7, v7, v7, 2 +#else + vsldoi v2, v2, v2, 6 + vsldoi v3, v3, v3, 4 + vsldoi v6, v6, v6, 6 + vsldoi v7, v7, v7, 4 +#endif + + /* Merge the results and move to a GPR. */ + vor v1, v3, v2 + vor v2, v6, v7 + vor v4, v1, v2 + MFVRD(r5, v4) +#ifdef __LITTLE_ENDIAN__ + addi r6, r5, -1 + andc r6, r6, r5 + popcntd r6, r6 +#else + cntlzd r6, r5 /* Count leading zeros before the match. */ +#endif + add r3, r3, r6 /* Compute final length. */ + /* Return NULL if null found before c. */ +#ifndef USE_AS_STRCHRNUL + lbz r4, 0(r3) + cmpdi cr7, r4, 0 + beq cr7, L(no_match) +#endif + blr + +#ifndef USE_AS_STRCHRNUL + .align 4 +L(no_match): + li r3,0 + blr +#endif + +/* We are here because strchr was called with a null byte. */ + .align 4 +L(null_match): + /* r0 has a doubleword of null bytes. */ + + cmpb r5,r12,r0 /* Compare each byte against null bytes. */ + + /* Move the doublewords left and right to discard the bits that are + not part of the string and bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r5,r5,r6 + sld r5,r5,r6 +#else + sld r5,r5,r6 + srd r5,r5,r6 +#endif + cmpdi cr7,r5,0 /* If r10 == 0, no c or null bytes + have been found. */ + bne cr7,L(done_null) + + mtcrf 0x01,r8 + + /* Are we now aligned to a quadword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bt 28,L(loop_null) + + /* Handle WORD2 of pair. */ + ldu r12,8(r8) + cmpb r5,r12,r0 + cmpdi cr7,r5,0 + bne cr7,L(done_null) + b L(loop_null) /* We branch here (rather than falling through) + to skip the nops due to heavy alignment + of the loop below. */ + + /* Main loop to look for the end of the string. Since it's a + small loop (< 8 instructions), align it to 32-bytes. */ + .p2align 5 +L(loop_null): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the null-checking process for bigger strings. */ + ld r12,8(r8) + ldu r11,16(r8) + cmpb r5,r12,r0 + cmpb r10,r11,r0 + or r6,r5,r10 + cmpdi cr7,r6,0 + beq cr7,L(vector1) + + /* OK, one (or both) of the doublewords contains a null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a null byte. */ + + cmpdi cr6,r5,0 + addi r8,r8,-8 + bne cr6,L(done_null) + + /* The null byte must be in the second doubleword. Adjust the address + again and move the result of cmpb to r10 so we can calculate the + pointer. */ + + mr r5,r10 + addi r8,r8,8 + + /* r5 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as the null byte in the original + doubleword from the string. Use that to calculate the pointer. */ +L(done_null): +#ifdef __LITTLE_ENDIAN__ + addi r0,r5,-1 + andc r0,r0,r5 + popcntd r0,r0 +#else + cntlzd r0,r5 /* Count leading zeros before the match. */ +#endif + srdi r0,r0,3 /* Convert leading zeros to bytes. */ + add r3,r8,r0 /* Return address of the matching null byte. */ + blr + .p2align 5 +L(vector1): + addi r3, r8, 8 + andi. r10, r3, 31 + bne cr0, L(loop_null) + vspltisb v8, -1 + vspltisb v0, 0 + vspltisb v10, 3 + lvsl v11, r0, r0 + vslb v10, v11, v10 + li r5, 16 +L(continue1): + lvx v4, 0, r3 + lvx v5, r3, r5 + vcmpequb v2, v0, v4 + vcmpequb v3, v0, v5 + vor v8, v2, v3 + vcmpequb. v11, v0, v8 + addi r3, r3, 32 + blt cr6, L(continue1) + addi r3, r3, -32 +L(end1): + VBPERMQ(v2, v2, v10) + VBPERMQ(v3, v3, v10) + /* Shift each component into its correct position for merging. */ +#ifdef __LITTLE_ENDIAN__ + vsldoi v3, v3, v3, 2 +#else + vsldoi v2, v2, v2, 6 + vsldoi v3, v3, v3, 4 +#endif + + /* Merge the results and move to a GPR. */ + vor v4, v3, v2 + MFVRD(r5, v4) +#ifdef __LITTLE_ENDIAN__ + addi r6, r5, -1 + andc r6, r6, r5 + popcntd r6, r6 +#else + cntlzd r6, r5 /* Count leading zeros before the match. */ +#endif + add r3, r3, r6 /* Compute final length. */ + blr +END (FUNC_NAME) + +#ifndef USE_AS_STRCHRNUL +weak_alias (strchr, index) +libc_hidden_builtin_def (strchr) +#endif diff --git a/sysdeps/powerpc/powerpc64/power8/strchrnul.S b/sysdeps/powerpc/powerpc64/power8/strchrnul.S new file mode 100644 index 0000000000..0229496872 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/strchrnul.S @@ -0,0 +1,23 @@ +/* Optimized strchrnul implementation for PowerPC64/POWER8. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define USE_AS_STRCHRNUL 1 +#include + +weak_alias (__strchrnul,strchrnul) +libc_hidden_builtin_def (__strchrnul) -- 2.43.5