x86-64: Optimize memrchr with AVX2

author H.J. Lu <hjl.tools@gmail.com>

Fri, 9 Jun 2017 12:44:30 +0000 (05:44 -0700)

committer H.J. Lu <hjl.tools@gmail.com>

Fri, 9 Jun 2017 12:44:41 +0000 (05:44 -0700)
author H.J. Lu <hjl.tools@gmail.com>
Fri, 9 Jun 2017 12:44:30 +0000 (05:44 -0700)
committer H.J. Lu <hjl.tools@gmail.com>
Fri, 9 Jun 2017 12:44:41 +0000 (05:44 -0700)
diff --git a/ChangeLog b/ChangeLog

index e8eb9e7b48fc22d2b3837687d509f67e0573d1d6..8fea821f91a0f93f7e5f20c4f2deae5daed70368 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2017-06-09  H.J. Lu  <hongjiu.lu@intel.com>
+
+       * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+       memrchr-sse2 and memrchr-avx2.
+       * sysdeps/x86_64/multiarch/ifunc-impl-list.c
+       (__libc_ifunc_impl_list): Add tests for __memrchr_avx2 and
+       __memrchr_sse2.
+       * sysdeps/x86_64/multiarch/memrchr-avx2.S: New file.
+       * sysdeps/x86_64/multiarch/memrchr-sse2.S: Likewise.
+       * sysdeps/x86_64/multiarch/memrchr.c: Likewise.
+
  2017-06-09  H.J. Lu  <hongjiu.lu@intel.com>
  
         * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile

index 4523f51095c822a6beefedf1469bcbbf64337f60..2fa390b3dd553f2cd895d81ff85a769fdcbe9652 100644 (file)
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,6 +7,7 @@ ifeq ($(subdir),string)
  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
                    strcmp-sse2-unaligned strncmp-ssse3 \
                    memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
+                  memrchr-sse2 memrchr-avx2 \
                    memcmp-avx2-movbe \
                    memcmp-sse4 memcpy-ssse3 \
                    memmove-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c

index 8dda1b040a5054351595fa5bc6e3717c1e630578..5670eb7e9eccd7c08f496737859c99a72c1717e1 100644 (file)
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -112,6 +112,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
               IFUNC_IMPL_ADD (array, i, memmove, 1,
                               __memmove_sse2_unaligned_erms))
  
+  /* Support sysdeps/x86_64/multiarch/memrchr.c.  */
+  IFUNC_IMPL (i, name, memrchr,
+             IFUNC_IMPL_ADD (array, i, memrchr,
+                             HAS_ARCH_FEATURE (AVX2_Usable),
+                             __memrchr_avx2)
+             IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
+
    /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
    IFUNC_IMPL (i, name, __memset_chk,
               IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S

new file mode 100644 (file)

index 0000000..3ee02e1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -0,0 +1,359 @@
+/* memrchr optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER   vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+       .section .text.avx,"ax",@progbits
+/* AVX2 implementation of memrchr: search backward for a byte.
+   As used below: RDI = start of the buffer, ESI = byte to find
+   (only its low 8 bits are broadcast), RDX = length in bytes.
+   Returns in RAX the address of the last matching byte, or 0 if
+   there is none.  */
+ENTRY (__memrchr_avx2)
+       /* Broadcast CHAR to YMM0.  */
+       vmovd   %esi, %xmm0
+       vpbroadcastb %xmm0, %ymm0
+
+       /* RDX = length - VEC_SIZE.  Lengths of at most one vector are
+          handled separately.  */
+       subq    $VEC_SIZE, %rdx
+       jbe     L(last_vec_or_less)
+
+       /* RDI now points at the last VEC_SIZE bytes of the buffer.  */
+       addq    %rdx, %rdi
+
+       /* Check the last VEC_SIZE bytes.  */
+       vpcmpeqb (%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x0)
+
+       /* Step back four vectors; ECX = misalignment of RDI within a
+          vector.  */
+       subq    $(VEC_SIZE * 4), %rdi
+       movl    %edi, %ecx
+       andl    $(VEC_SIZE - 1), %ecx
+       jz      L(aligned_more)
+
+       /* Align data for aligned loads in the loop.  RDI is rounded up
+          to the next VEC_SIZE boundary and RDX is adjusted so that it
+          still counts the bytes from the buffer start up to
+          RDI + 4 * VEC_SIZE (this overlaps part of the vector checked
+          above).  */
+       addq    $VEC_SIZE, %rdi
+       addq    $VEC_SIZE, %rdx
+       andq    $-VEC_SIZE, %rdi
+       subq    %rcx, %rdx
+
+       .p2align 4
+L(aligned_more):
+       /* Here RDI is VEC_SIZE-aligned and RDX is the number of buffer
+          bytes below RDI + 4 * VEC_SIZE.  If no more than four vectors
+          remain, finish in the bounded tail code.  */
+       subq    $(VEC_SIZE * 4), %rdx
+       jbe     L(last_4x_vec_or_less)
+
+       /* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+          since data is only aligned to VEC_SIZE.  */
+       vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x3)
+
+       vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+       vpmovmskb %ymm2, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x2)
+
+       vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+       vpmovmskb %ymm3, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x1)
+
+       vpcmpeqb (%rdi), %ymm0, %ymm4
+       vpmovmskb %ymm4, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x0)
+
+       /* Align data to 4 * VEC_SIZE for loop with fewer branches.
+          There are some overlaps with above if data isn't aligned
+          to 4 * VEC_SIZE.  */
+       movl    %edi, %ecx
+       andl    $(VEC_SIZE * 4 - 1), %ecx
+       jz      L(loop_4x_vec)
+
+       /* Round RDI up to the next 4 * VEC_SIZE boundary and grow RDX by
+          the same distance, so RDX keeps counting the buffer bytes
+          below RDI.  */
+       addq    $(VEC_SIZE * 4), %rdi
+       addq    $(VEC_SIZE * 4), %rdx
+       andq    $-(VEC_SIZE * 4), %rdi
+       subq    %rcx, %rdx
+
+       .p2align 4
+L(loop_4x_vec):
+       /* Compare 4 * VEC at a time, moving backward through the
+          buffer.  On entry RDI is 4 * VEC_SIZE-aligned and RDX is the
+          number of buffer bytes below RDI; leave the loop once at most
+          4 * VEC_SIZE bytes remain.  */
+       subq    $(VEC_SIZE * 4), %rdi
+       subq    $(VEC_SIZE * 4), %rdx
+       jbe     L(last_4x_vec_or_less)
+
+       vmovdqa (%rdi), %ymm1
+       vmovdqa VEC_SIZE(%rdi), %ymm2
+       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
+       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
+
+       vpcmpeqb %ymm1, %ymm0, %ymm1
+       vpcmpeqb %ymm2, %ymm0, %ymm2
+       vpcmpeqb %ymm3, %ymm0, %ymm3
+       vpcmpeqb %ymm4, %ymm0, %ymm4
+
+       /* OR the four compare results together so a single test decides
+          whether any of the 4 * VEC_SIZE bytes matched.  */
+       vpor    %ymm1, %ymm2, %ymm5
+       vpor    %ymm3, %ymm4, %ymm6
+       vpor    %ymm5, %ymm6, %ymm5
+
+       vpmovmskb %ymm5, %eax
+       testl   %eax, %eax
+       jz      L(loop_4x_vec)
+
+       /* There is a match.  Probe the vectors from the highest address
+          to the lowest so the last occurrence wins.  */
+       vpmovmskb %ymm4, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x3)
+
+       vpmovmskb %ymm3, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x2)
+
+       vpmovmskb %ymm2, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x1)
+
+       /* The match must be in the lowest vector, so its mask is
+          nonzero here.  */
+       vpmovmskb %ymm1, %eax
+       bsrl    %eax, %eax
+       addq    %rdi, %rax
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(last_4x_vec_or_less):
+       /* Undo the caller's subtraction: EDX (zero-extended into RDX) is
+          the number of buffer bytes below RDI + 4 * VEC_SIZE, at most
+          4 * VEC_SIZE.  The buffer therefore starts at
+          RDI + 4 * VEC_SIZE - RDX.  */
+       addl    $(VEC_SIZE * 4), %edx
+       cmpl    $(VEC_SIZE * 2), %edx
+       jbe     L(last_2x_vec)
+
+       /* More than 2 * VEC_SIZE bytes remain, so the two highest
+          vectors lie entirely inside the buffer.  */
+       vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x3)
+
+       vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+       vpmovmskb %ymm2, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x2)
+
+       /* The vector at RDI + VEC_SIZE may start before the buffer, so
+          a match in it needs a bounds check.  */
+       vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+       vpmovmskb %ymm3, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x1_check)
+       /* With at most 3 * VEC_SIZE bytes the lowest vector is entirely
+          before the buffer.  */
+       cmpl    $(VEC_SIZE * 3), %edx
+       jbe     L(zero)
+
+       vpcmpeqb (%rdi), %ymm0, %ymm4
+       vpmovmskb %ymm4, %eax
+       testl   %eax, %eax
+       jz      L(zero)
+       bsrl    %eax, %eax
+       /* Reject the match if it lies before the buffer start, i.e. if
+          match index + RDX - 4 * VEC_SIZE is negative.  */
+       subq    $(VEC_SIZE * 4), %rdx
+       addq    %rax, %rdx
+       jl      L(zero)
+       addq    %rdi, %rax
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(last_2x_vec):
+       /* At most 2 * VEC_SIZE buffer bytes remain below
+          RDI + 4 * VEC_SIZE (count in EDX/RDX).  Only the two highest
+          vectors can contain buffer bytes, and either may start before
+          the buffer, so every match is bounds-checked.  */
+       vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+       testl   %eax, %eax
+       jnz     L(last_vec_x3_check)
+       /* With at most VEC_SIZE bytes the next vector is entirely
+          before the buffer.  */
+       cmpl    $VEC_SIZE, %edx
+       jbe     L(zero)
+
+       vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+       testl   %eax, %eax
+       jz      L(zero)
+       bsrl    %eax, %eax
+       /* Reject the match if match index + RDX - 2 * VEC_SIZE is
+          negative, i.e. the match lies before the buffer start.  */
+       subq    $(VEC_SIZE * 2), %rdx
+       addq    %rax, %rdx
+       jl      L(zero)
+       addl    $(VEC_SIZE * 2), %eax
+       addq    %rdi, %rax
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(last_vec_x0):
+       /* EAX is the nonzero match mask of the vector at RDI.  Return
+          the address of the highest matching byte.  */
+       bsrl    %eax, %eax
+       addq    %rdi, %rax
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(last_vec_x1):
+       /* EAX is the nonzero match mask of the vector at
+          RDI + VEC_SIZE.  Return the address of the highest matching
+          byte.  */
+       bsrl    %eax, %eax
+       addl    $VEC_SIZE, %eax
+       addq    %rdi, %rax
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(last_vec_x2):
+       /* EAX is the nonzero match mask of the vector at
+          RDI + 2 * VEC_SIZE.  Return the address of the highest
+          matching byte.  */
+       bsrl    %eax, %eax
+       addl    $(VEC_SIZE * 2), %eax
+       addq    %rdi, %rax
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(last_vec_x3):
+       /* EAX is the nonzero match mask of the vector at
+          RDI + 3 * VEC_SIZE.  Return the address of the highest
+          matching byte.  */
+       bsrl    %eax, %eax
+       addl    $(VEC_SIZE * 3), %eax
+       addq    %rdi, %rax
+       /* YMM registers were written above; clear their upper halves
+          before returning, as every other return path that follows
+          vector work does.  */
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(last_vec_x1_check):
+       /* EAX is the nonzero match mask of the vector at
+          RDI + VEC_SIZE and RDX is the number of buffer bytes below
+          RDI + 4 * VEC_SIZE.  The highest match is valid only if
+          match index + RDX - 3 * VEC_SIZE is non-negative; otherwise
+          it lies before the buffer start.  */
+       bsrl    %eax, %eax
+       subq    $(VEC_SIZE * 3), %rdx
+       addq    %rax, %rdx
+       jl      L(zero)
+       addl    $VEC_SIZE, %eax
+       addq    %rdi, %rax
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(last_vec_x3_check):
+       /* EAX is the nonzero match mask of the vector at
+          RDI + 3 * VEC_SIZE and RDX is the number of buffer bytes
+          below RDI + 4 * VEC_SIZE.  The highest match is valid only if
+          match index + RDX - VEC_SIZE is non-negative; otherwise it
+          lies before the buffer start.  */
+       bsrl    %eax, %eax
+       subq    $VEC_SIZE, %rdx
+       addq    %rax, %rdx
+       jl      L(zero)
+       addl    $(VEC_SIZE * 3), %eax
+       addq    %rdi, %rax
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(zero):
+       /* No match: clear the upper YMM halves and return 0.  */
+       VZEROUPPER
+L(null):
+       /* Return 0 without VZEROUPPER.
+          NOTE(review): the only jump here in this file is the
+          zero-length check, which runs after YMM0 was written by the
+          broadcast at entry -- confirm that skipping VZEROUPPER on
+          this path is intended.  */
+       xorl    %eax, %eax
+       ret
+
+       .p2align 4
+L(last_vec_or_less_aligned):
+       /* The buffer starts at the VEC_SIZE-aligned address in RDI and
+          holds EDX bytes (1 to VEC_SIZE).  Keep the length in ECX as a
+          shift count.  */
+       movl    %edx, %ecx
+
+       vpcmpeqb (%rdi), %ymm0, %ymm1
+
+       /* RDX = (1 << length) - 1, a mask of the bytes inside the
+          buffer.  */
+       movl    $1, %edx
+       /* Support rdx << 32.  */
+       salq    %cl, %rdx
+       subq    $1, %rdx
+
+       vpmovmskb %ymm1, %eax
+
+       /* Remove the trailing bytes.  */
+       andl    %edx, %eax
+       testl   %eax, %eax
+       jz      L(zero)
+
+       bsrl    %eax, %eax
+       addq    %rdi, %rax
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(last_vec_or_less):
+       /* The length is at most VEC_SIZE.  Undo the subtraction done at
+          entry so that EDX holds the length again.  */
+       addl    $VEC_SIZE, %edx
+
+       /* Check for zero length.  */
+       testl   %edx, %edx
+       jz      L(null)
+
+       /* ECX = offset of the buffer start within its VEC_SIZE-aligned
+          vector.  */
+       movl    %edi, %ecx
+       andl    $(VEC_SIZE - 1), %ecx
+       jz      L(last_vec_or_less_aligned)
+
+       /* ESI = offset + length, R8D = offset, RDI = buffer start
+          rounded down to a VEC_SIZE boundary.  */
+       movl    %ecx, %esi
+       movl    %ecx, %r8d
+       addl    %edx, %esi
+       andq    $-VEC_SIZE, %rdi
+
+       /* If offset + length exceeds VEC_SIZE the buffer spans two
+          aligned vectors; ESI then holds the number of buffer bytes in
+          the second one.  */
+       subl    $VEC_SIZE, %esi
+       ja      L(last_vec_2x_aligned)
+
+       /* Check the last VEC.  */
+       vpcmpeqb (%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+
+       /* Remove the leading and trailing bytes.  The arithmetic shift
+          may copy the sign bit into the upper bits; the mask
+          (1 << length) - 1 applied below clears them again.  */
+       sarl    %cl, %eax
+       movl    %edx, %ecx
+
+       movl    $1, %edx
+       sall    %cl, %edx
+       subl    $1, %edx
+
+       andl    %edx, %eax
+       testl   %eax, %eax
+       jz      L(zero)
+
+       /* The bit index is relative to the buffer start, which is
+          aligned RDI + offset (R8).  */
+       bsrl    %eax, %eax
+       addq    %rdi, %rax
+       addq    %r8, %rax
+       VZEROUPPER
+       ret
+
+       .p2align 4
+L(last_vec_2x_aligned):
+       /* The buffer spans two aligned vectors starting at RDI.
+          ESI = number of buffer bytes in the second vector,
+          R8D = offset of the buffer start within the first one.  */
+       movl    %esi, %ecx
+
+       /* Check the last VEC.  */
+       vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+
+       /* EDX = (1 << ECX) - 1, a mask of the buffer bytes in the
+          second vector.  */
+       movl    $1, %edx
+       sall    %cl, %edx
+       subl    $1, %edx
+
+       vpmovmskb %ymm1, %eax
+
+       /* Remove the trailing bytes.  */
+       andl    %edx, %eax
+
+       testl   %eax, %eax
+       jnz     L(last_vec_x1)
+
+       /* Check the second last VEC.  */
+       vpcmpeqb (%rdi), %ymm0, %ymm1
+
+       movl    %r8d, %ecx
+
+       vpmovmskb %ymm1, %eax
+
+       /* Remove the leading bytes.  Must use unsigned right shift for
+          bsrl below.  */
+       shrl    %cl, %eax
+       testl   %eax, %eax
+       jz      L(zero)
+
+       /* The bit index is relative to the buffer start, which is
+          aligned RDI + offset (R8).  */
+       bsrl    %eax, %eax
+       addq    %rdi, %rax
+       addq    %r8, %rax
+       VZEROUPPER
+       ret
+END (__memrchr_avx2)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S

new file mode 100644 (file)

index 0000000..f518819
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
@@ -0,0 +1,26 @@
+/* memrchr optimized with SSE2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Build the implementation in ../memrchr.S under another name for
+   use inside libc.  */
+#if IS_IN (libc)
+/* Rename the symbol that the included file refers to as __memrchr.  */
+# define __memrchr __memrchr_sse2
+
+/* Make weak_alias expand to nothing while the file below is
+   included, so any weak_alias use in it emits no alias here.  */
+# undef weak_alias
+# define weak_alias(__memrchr, memrchr)
+#endif
+
+#include "../memrchr.S"
diff --git a/sysdeps/x86_64/multiarch/memrchr.c b/sysdeps/x86_64/multiarch/memrchr.c

new file mode 100644 (file)

index 0000000..003d403
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr.c
@@ -0,0 +1,31 @@
+/* Multiple versions of memrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+/* While <string.h> is included, the name memrchr is replaced by
+   __redirect_memrchr, so any declaration of memrchr in the header
+   is seen under that name instead.  */
+# define memrchr __redirect_memrchr
+# include <string.h>
+# undef memrchr
+
+/* NOTE(review): "ifunc-avx2.h" is not shown here; presumably it
+   uses SYMBOL_NAME to define IFUNC_SELECTOR, which chooses between
+   the AVX2 and SSE2 variants -- confirm against that header.  */
+# define SYMBOL_NAME memrchr
+# include "ifunc-avx2.h"
+
+/* Define __memrchr through the selector and make memrchr a weak
+   alias of it.  */
+libc_ifunc_redirected (__redirect_memrchr, __memrchr, IFUNC_SELECTOR ());
+weak_alias (__memrchr, memrchr)
+#endif
author	H.J. Lu <hjl.tools@gmail.com>
	Fri, 9 Jun 2017 12:44:30 +0000 (05:44 -0700)
committer	H.J. Lu <hjl.tools@gmail.com>
	Fri, 9 Jun 2017 12:44:41 +0000 (05:44 -0700)
ChangeLog		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/Makefile		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/ifunc-impl-list.c		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/memrchr-avx2.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/multiarch/memrchr-sse2.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/multiarch/memrchr.c	[new file with mode: 0644]	patch \| blob