powerpc: Optimized strcmp for power10
author    Amrita H S <amritahs@linux.vnet.ibm.com>
          Wed, 6 Dec 2023 16:43:11 +0000 (11:43 -0500)
committer Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
          Thu, 7 Dec 2023 17:10:40 +0000 (11:10 -0600)
This patch is based on __strcmp_power9 and __strlen_power10.

Improvements from __strcmp_power9:

    1. Uses new POWER10 instructions
       - This code uses lxvp to decrease contention on load
         by loading 32 bytes per instruction (see the C sketch
         after the file list below).

    2. Performance implication
       - This version has around 30% better performance on average.
       - A performance regression is seen for certain combinations
         of sizes and alignments.  Some of these regressions are
         observed even without this patch, while the rest may be
         induced by it.

Signed-off-by: Amrita H S <amritahs@linux.vnet.ibm.com>
Reviewed-by: Paul E. Murphy <murphyp@linux.ibm.com>
sysdeps/powerpc/powerpc64/le/power10/strcmp.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/Makefile
sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/strcmp.c
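
As rough orientation for the diff below, here is a hedged C sketch of
the core idea the assembly implements: scan both strings a block at a
time, stopping at the first position where the bytes differ or either
string terminates (this is what vcmpnezb tests per 16-byte vector).
The sketch is illustrative only; all names are invented, it over-reads
past the terminator, and it ignores the alignment and page-boundary
handling that the real code spends most of its lines on.

    #include <stddef.h>

    /* Scalar model of vcmpnezb over an n-byte block: report the first
       position where the bytes differ or either byte is NUL.  Returns
       the byte difference, or the out-of-band value 256 when the whole
       block matches with no terminator.  */
    static int
    block_cmp (const unsigned char *a, const unsigned char *b, size_t n)
    {
      for (size_t i = 0; i < n; i++)
        if (a[i] != b[i] || a[i] == '\0')
          return a[i] - b[i];   /* 0 only if both strings end here.  */
      return 256;
    }

    /* Blocked strcmp loop; the real code uses 16B/32B vector loads
       (lxv/lxvp) and never reads across an unchecked page boundary.  */
    int
    strcmp_sketch (const char *s1, const char *s2)
    {
      const unsigned char *a = (const unsigned char *) s1;
      const unsigned char *b = (const unsigned char *) s2;
      for (;; a += 32, b += 32)
        {
          int r = block_cmp (a, b, 32);   /* 32 bytes, as lxvp loads.  */
          if (r != 256)
            return r;
        }
    }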

diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
new file mode 100644 (file)
index 0000000..a3c1ada
--- /dev/null
@@ -0,0 +1,204 @@
+/* Optimized strcmp implementation for PowerPC64/POWER10.
+   Copyright (C) 2021-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+#include <sysdep.h>
+
+#ifndef STRCMP
+# define STRCMP strcmp
+#endif
+
+/* Implements the function
+   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]).  */
+
+/* TODO: Change this to the actual instruction when the minimum
+   binutils version supports the POWER10 instructions used here.
+   Until then, a macro is defined below for this newer instruction
+   in order to maintain compatibility.  */
+
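+/* Encode lxvp xtp,dq(ra) by hand: primary opcode 6 in the top six
+   bits, the register pair in the Tp/TX fields (VRs map to VSRs
+   32-63, hence the -32), RA, and the 16-byte-aligned displacement
+   dq.  */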
+#define LXVP(xtp,dq,ra)                     \
+       .long(((6)<<(32-6))          \
+       | ((((xtp)-32)>>1)<<(32-10)) \
+       | ((1)<<(32-11))             \
+       | ((ra)<<(32-16))            \
+       | dq)
+
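+/* Compare one 16B block of each string at OFFSET.  The +32 on the
+   lxv operands converts a VR number to its VSR alias.  vcmpnezb.
+   clears CR6.EQ when some byte pair differs or a NUL is seen, so
+   bne cr6 exits on the first mismatch or terminator.  */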
+#define COMPARE_16(vreg1,vreg2,offset)  \
+       lxv       vreg1+32,offset(r3);  \
+       lxv       vreg2+32,offset(r4);  \
+       vcmpnezb. v7,vreg1,vreg2;       \
+       bne       cr6,L(different);     \
+
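+/* Compare one 32B block of each string using paired loads.  The
+   half holding the lower-addressed 16 bytes is checked first so the
+   earliest mismatch or NUL decides the result.  */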
+#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \
+       LXVP(vreg1+32,offset,r3);                    \
+       LXVP(vreg2+32,offset,r4);                    \
+       vcmpnezb. v7,vreg1+1,vreg2+1;                \
+       bne       cr6,L(label1);                     \
+       vcmpnezb. v7,vreg1,vreg2;                    \
+       bne       cr6,L(label2);                     \
+
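+/* Compute the return value once vcmpnezb has flagged a byte:
+   vctzlsbb yields the index of the first flagged byte, and
+   vextubrx extracts that byte from each source vector.  */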
+#define TAIL(vreg1,vreg2)     \
+       vctzlsbb r6,v7;       \
+       vextubrx r5,r6,vreg1; \
+       vextubrx r4,r6,vreg2; \
+       subf     r3,r4,r5;    \
+       blr;                  \
+
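+/* Compare the next LEN_REG bytes and advance both pointers.  lxvl
+   takes the byte count in the top byte of the GPR, hence the sldi
+   by 56.  Branches to L(different) if a mismatch or NUL occurs
+   within LEN_REG bytes; falls through otherwise.  */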
+#define CHECK_N_BYTES(reg1,reg2,len_reg) \
+       sldi      r0,len_reg,56;         \
+       lxvl      32+v4,reg1,r0;         \
+       lxvl      32+v5,reg2,r0;         \
+       add       reg1,reg1,len_reg;     \
+       add       reg2,reg2,len_reg;     \
+       vcmpnezb. v7,v4,v5;              \
+       vctzlsbb  r6,v7;                 \
+       cmpld     cr7,r6,len_reg;        \
+       blt       cr7,L(different);      \
+
+       /* TODO: change this to .machine power10 when the minimum required
+       binutils allows it.  */
+
+       .machine  power9
+ENTRY_TOCLESS (STRCMP, 4)
+       li       r11,16
+       /* eq bit of cr1 used as swap status flag to indicate if
+       source pointers were swapped.  */
+       crclr    4*cr1+eq
+       vspltisb v19,-1
+       andi.    r7,r3,15
+       sub      r7,r11,r7      /* r7(nalign1) = 16 - (str1 & 15).  */
+       andi.    r9,r4,15
+       sub      r5,r11,r9      /* r5(nalign2) = 16 - (str2 & 15).  */
+       cmpld    cr7,r7,r5
+       beq      cr7,L(same_aligned)
+       blt      cr7,L(nalign1_min)
+       /* Swap r3 and r4, and r7 and r5, so that r3 holds the pointer
+       closer to its next 16B boundary and r7 the byte count up to
+       that boundary; then only one CHECK_N_BYTES is needed before
+       entering the loop below.  */
+       mr       r8,r4
+       mr       r4,r3
+       mr       r3,r8
+       mr       r12,r7
+       mr       r7,r5
+       mr       r5,r12
+       crset    4*cr1+eq       /* Set bit on swapping source pointers.  */
+
+       .p2align 5
+L(nalign1_min):
+       CHECK_N_BYTES(r3,r4,r7)
+
+       .p2align 5
+L(s1_aligned):
+       /* r9 and r5 hold the number of bytes to be read after and
+        before the page boundary, respectively.  */
+       sub     r5,r5,r7
+       subfic  r9,r5,16
+       /* Now let r7 hold the count of quadwords which can be
+       checked without crossing a page boundary.  The quadword
+       offset of str2 is (str2>>4)&0xFF.  */
+       rlwinm  r7,r4,28,0xFF
+       /* The check below is required only for the first iteration.
+       For the second iteration and beyond, the loop counter is
+       always 255.  */
+       cmpldi  r7,255
+       beq     L(L3)
+       /* Get the initial loop count by 255-((str2>>4)&0xFF).  */
+       subfic  r11,r7,255
+
+       .p2align 5
+L(L1):
+       mtctr   r11
+
+       .p2align 5
+L(L2):
+       COMPARE_16(v4,v5,0)     /* Load 16B blocks using lxv.  */
+       addi    r3,r3,16
+       addi    r4,r4,16
+       bdnz    L(L2)
+       /* Cross the page boundary of s2, carefully.  */
+
+       .p2align 5
+L(L3):
+       CHECK_N_BYTES(r3,r4,r5)
+       CHECK_N_BYTES(r3,r4,r9)
+       li      r11,255         /* Load the new loop counter.  */
+       b       L(L1)
+
+       .p2align 5
+L(same_aligned):
+       CHECK_N_BYTES(r3,r4,r7)
+        /* Align s1 to 32B and adjust s2 address.
+          Use lxvp only if both s1 and s2 are 32B aligned.  */
+       COMPARE_16(v4,v5,0)
+       COMPARE_16(v4,v5,16)
+       COMPARE_16(v4,v5,32)
+       COMPARE_16(v4,v5,48)
+       addi    r3,r3,64
+       addi    r4,r4,64
+       COMPARE_16(v4,v5,0)
+       COMPARE_16(v4,v5,16)
+
+       clrldi  r6,r3,59
+       subfic  r5,r6,32
+       add     r3,r3,r5
+       add     r4,r4,r5
+       andi.   r5,r4,0x1F
+       beq     cr0,L(32B_aligned_loop)
+
+       .p2align 5
+L(16B_aligned_loop):
+       COMPARE_16(v4,v5,0)
+       COMPARE_16(v4,v5,16)
+       COMPARE_16(v4,v5,32)
+       COMPARE_16(v4,v5,48)
+       addi    r3,r3,64
+       addi    r4,r4,64
+       b       L(16B_aligned_loop)
+
+       /* Calculate and return the difference.  */
+L(different):
+       vctzlsbb r6,v7
+       vextubrx r5,r6,v4
+       vextubrx r4,r6,v5
+       bt       4*cr1+eq,L(swapped)
+       subf     r3,r4,r5
+       blr
+
+       /* If the source pointers were swapped, swap the operands of
+       the subtraction so the sign of the result is correct.  */
+L(swapped):
+       subf     r3,r5,r4
+       blr
+
+       .p2align 5
+L(32B_aligned_loop):
+       COMPARE_32(v14,v16,0,tail1,tail2)
+       COMPARE_32(v18,v20,32,tail3,tail4)
+       COMPARE_32(v22,v24,64,tail5,tail6)
+       COMPARE_32(v26,v28,96,tail7,tail8)
+       addi    r3,r3,128
+       addi    r4,r4,128
+       b       L(32B_aligned_loop)
+
+L(tail1): TAIL(v15,v17)
+L(tail2): TAIL(v14,v16)
+L(tail3): TAIL(v19,v21)
+L(tail4): TAIL(v18,v20)
+L(tail5): TAIL(v23,v25)
+L(tail6): TAIL(v22,v24)
+L(tail7): TAIL(v27,v29)
+L(tail8): TAIL(v26,v28)
+
+END (STRCMP)
+libc_hidden_builtin_def (strcmp)
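
The page-crossing bookkeeping above (the rlwinm/subfic sequence feeding
L(L1) and L(L3)) boils down to counting how many 16-byte quadwords of
s2 can still be loaded without touching the next 4 KiB page, running
the fast loop that many times, and then stepping over the boundary with
length-limited loads.  A hedged C model of just that counter arithmetic
(names invented for illustration):

    #include <stdint.h>

    /* Quadword offset of p within its 4 KiB page: (p >> 4) & 0xFF.
       A page holds 256 quadwords, so 255 - offset further 16-byte
       loads stay clear of the final, possibly-crossing quadword; the
       asm jumps straight to the boundary code (L(L3)) when the
       offset is already 255.  */
    static inline unsigned int
    safe_quadwords_after (const char *s2)
    {
      unsigned int qw_off = ((uintptr_t) s2 >> 4) & 0xFF;
      return 255 - qw_off;   /* Matches "subfic r11,r7,255".  */
    }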
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 27d8495503a5a1fefea19913d6758f69c50127de..d7824a922b0de4703682c36ab104fc9639ca79a6 100644 (file)
@@ -33,7 +33,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
                   rawmemchr-power9 rawmemchr-power10 \
-                  strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
+                  strcmp-power9 strcmp-power10 strncmp-power9 \
+                  strcpy-power9 stpcpy-power9 \
                   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index fc26dd0e17275f72b469b3c65d22d64f67c3066a..965dd17786346db830542ca67c7af4a0bff0e582 100644 (file)
@@ -377,6 +377,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c.  */
   IFUNC_IMPL (i, name, strcmp,
 #ifdef __LITTLE_ENDIAN__
+             IFUNC_IMPL_ADD (array, i, strcmp,
+                             (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+                             && (hwcap & PPC_FEATURE_HAS_VSX),
+                             __strcmp_power10)
              IFUNC_IMPL_ADD (array, i, strcmp,
                              hwcap2 & PPC_FEATURE2_ARCH_3_00
                              && hwcap & PPC_FEATURE_HAS_ALTIVEC,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S
new file mode 100644 (file)
index 0000000..c80067c
--- /dev/null
@@ -0,0 +1,26 @@
+/* Optimized strcmp implementation for POWER10/PPC64.
+   Copyright (C) 2021-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRCMP __strcmp_power10
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power10/strcmp.S>
+#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
index 31fcdee91672ba8e1027fb2ffc1d08118db9073a..f1dac99b660f8d963a5c44a4b0651ac3b80a0f73 100644 (file)
@@ -29,12 +29,16 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
 extern __typeof (strcmp) __strcmp_power8 attribute_hidden;
 # ifdef __LITTLE_ENDIAN__
 extern __typeof (strcmp) __strcmp_power9 attribute_hidden;
+extern __typeof (strcmp) __strcmp_power10 attribute_hidden;
 # endif
 
 # undef strcmp
 
 libc_ifunc_redirected (__redirect_strcmp, strcmp,
 # ifdef __LITTLE_ENDIAN__
+                       (hwcap2 & PPC_FEATURE2_ARCH_3_1
+                        && hwcap & PPC_FEATURE_HAS_VSX)
+                       ? __strcmp_power10 :
                        (hwcap2 & PPC_FEATURE2_ARCH_3_00
                         && hwcap & PPC_FEATURE_HAS_ALTIVEC)
                        ? __strcmp_power9 :
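
To see whether a given machine would take the new path, one can probe
the same AT_HWCAP/AT_HWCAP2 bits the selector tests above.  A small
standalone check (not part of the patch; the fallback macro values are
taken from the Linux UAPI headers):

    #include <stdio.h>
    #include <sys/auxv.h>

    #ifndef PPC_FEATURE2_ARCH_3_1
    # define PPC_FEATURE2_ARCH_3_1 0x00040000
    #endif
    #ifndef PPC_FEATURE_HAS_VSX
    # define PPC_FEATURE_HAS_VSX   0x00000080
    #endif

    int
    main (void)
    {
      unsigned long hwcap = getauxval (AT_HWCAP);
      unsigned long hwcap2 = getauxval (AT_HWCAP2);
      int eligible = (hwcap2 & PPC_FEATURE2_ARCH_3_1)
                     && (hwcap & PPC_FEATURE_HAS_VSX);
      printf ("__strcmp_power10 eligible: %s\n", eligible ? "yes" : "no");
      return 0;
    }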