>From 283100684b2644af982d5b86156cbdda4dae3e25 Mon Sep 17 00:00:00 2001 From: Ondrej Bilka Date: Wed, 6 Mar 2013 21:41:32 +0100 Subject: [PATCH] * sysdeps/x86_64/strlen.S: Replace with new SSE2 based implementation which is faster on all x86_64 architectures. Tested on AMD, Intel Nehalem, SNB, IVB. --- sysdeps/x86_64/multiarch/Makefile | 6 +- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 13 - sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 232 ++++++++- sysdeps/x86_64/multiarch/strcat-ssse3.S | 316 ++++++++++- sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S | 685 ---------------------- sysdeps/x86_64/multiarch/strlen-sse2-pminub.S | 259 -------- sysdeps/x86_64/multiarch/strlen-sse4.S | 84 --- sysdeps/x86_64/multiarch/strlen.S | 68 --- sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S | 3 - sysdeps/x86_64/multiarch/strnlen.S | 57 -- sysdeps/x86_64/strcat.S | 1 + sysdeps/x86_64/strlen.S | 272 +++++++--- sysdeps/x86_64/strnlen.S | 67 +-- 13 files changed, 755 insertions(+), 1308 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-pminub.S delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse4.S delete mode 100644 sysdeps/x86_64/multiarch/strlen.S delete mode 100644 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S delete mode 100644 sysdeps/x86_64/multiarch/strnlen.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index dd6c27d..67686ad 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -10,14 +10,12 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ - strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \ + strncase_l-ssse3 memset-x86-64 strcat-ssse3 strncat-ssse3\ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ - strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \ - strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \ - memcmp-ssse3 + strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3 ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 643cb2d..848991e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -187,11 +187,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) - /* Support sysdeps/x86_64/multiarch/strnlen.S. */ - IFUNC_IMPL (i, name, strnlen, - IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2_no_bsf) - IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) - /* Support sysdeps/x86_64/multiarch/strpbrk.S. */ IFUNC_IMPL (i, name, strpbrk, IFUNC_IMPL_ADD (array, i, strpbrk, HAS_SSE4_2, @@ -262,14 +257,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2)) - /* Support sysdeps/x86_64/multiarch/strlen.S. 
*/ - IFUNC_IMPL (i, name, strlen, - IFUNC_IMPL_ADD (array, i, strlen, HAS_SSE4_2, __strlen_sse42) - IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_pminub) - IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_no_bsf) - IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2) - IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) - /* Support sysdeps/x86_64/multiarch/strncmp.S. */ IFUNC_IMPL (i, name, strncmp, IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2, diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S index 72bb609..028c6d3 100644 --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -34,10 +34,236 @@ ENTRY (STRCAT) mov %rdx, %r8 # endif -# define RETURN jmp L(StartStrcpyPart) -# include "strlen-sse2-pminub.S" -# undef RETURN +/* Inline corresponding strlen file, temporary until new strcpy + implementation gets merged. */ + xor %rax, %rax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%rdi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %rdi, %rax + and $-16, %rax + jmp L(align16_start) +L(next): + mov %rdi, %rax + and $-16, %rax + pcmpeqb (%rax), %xmm0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + pmovmskb %xmm0, %edx + and %r10d, %edx + jnz L(exit) + +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pcmpeqb 16(%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 80(%rax), %xmm0 + add $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm1 + add $16, %rax + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm2 + add $16, %rax + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm3 + add $16, %rax + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit) + + add $16, %rax + .p2align 4 + L(align64_loop): + movaps (%rax), %xmm4 + pminub 
16(%rax), %xmm4 + movaps 32(%rax), %xmm5 + pminub 48(%rax), %xmm5 + add $64, %rax + pminub %xmm4, %xmm5 + pcmpeqb %xmm0, %xmm5 + pmovmskb %xmm5, %edx + test %edx, %edx + jz L(align64_loop) + + pcmpeqb -64(%rax), %xmm0 + sub $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_less16): + bsf %rdx, %rdx + add %rdx, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit16): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $16, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit32): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $32, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit48): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $48, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit64): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + + .p2align 4 L(StartStrcpyPart): lea (%r9, %rax), %rdi mov %rsi, %rcx diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S index fea9d11..8101b91 100644 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -33,11 +33,321 @@ ENTRY (STRCAT) mov %rdx, %r8 # endif -# define RETURN jmp L(StartStrcpyPart) -# include "strlen-sse2-no-bsf.S" -# undef RETURN +/* Inline corresponding strlen file, temporary until new strcpy + implementation gets merged. */ + + xor %eax, %eax + cmpb $0, (%rdi) + jz L(exit_tail0) + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmpb $0, 3(%rdi) + jz L(exit_tail3) + + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmpb $0, 7(%rdi) + jz L(exit_tail7) + + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmpb $0, 11(%rdi) + jz L(exit_tail11) + + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmpb $0, 14(%rdi) + jz L(exit_tail14) + cmpb $0, 15(%rdi) + jz L(exit_tail15) + pxor %xmm0, %xmm0 + lea 16(%rdi), %rcx + lea 16(%rdi), %rax + and $-16, %rax + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, 
%edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax + .p2align 4 +L(aligned_64): + pcmpeqb (%rax), %xmm0 + pcmpeqb 16(%rax), %xmm1 + pcmpeqb 32(%rax), %xmm2 + pcmpeqb 48(%rax), %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %r11d + pmovmskb %xmm2, %r10d + pmovmskb %xmm3, %r9d + or %edx, %r9d + or %r11d, %r9d + or %r10d, %r9d + lea 64(%rax), %rax + jz L(aligned_64) + + test %edx, %edx + jnz L(aligned_64_exit_16) + test %r11d, %r11d + jnz L(aligned_64_exit_32) + test %r10d, %r10d + jnz L(aligned_64_exit_48) + +L(aligned_64_exit_64): + pmovmskb %xmm3, %edx + jmp L(exit) + +L(aligned_64_exit_48): + lea -16(%rax), %rax + mov %r10d, %edx + jmp L(exit) + +L(aligned_64_exit_32): + lea -32(%rax), %rax + mov %r11d, %edx + jmp L(exit) + +L(aligned_64_exit_16): + lea -48(%rax), %rax + +L(exit): + sub %rcx, %rax + test %dl, %dl + jz L(exit_high) + test $0x01, %dl + jnz L(exit_tail0) + + test $0x02, %dl + jnz L(exit_tail1) + + test $0x04, %dl + jnz L(exit_tail2) + + test $0x08, %dl + jnz L(exit_tail3) + + test $0x10, %dl + jnz L(exit_tail4) + + test $0x20, %dl + jnz L(exit_tail5) + + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax +L(exit_tail0): + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_high): + add $8, %eax + test $0x01, %dh + jnz L(exit_tail0) + + test $0x02, %dh + jnz L(exit_tail1) + + test $0x04, %dh + jnz L(exit_tail2) + + test $0x08, %dh + jnz L(exit_tail3) + + test $0x10, %dh + jnz L(exit_tail4) + + test $0x20, %dh + jnz L(exit_tail5) + + test $0x40, %dh + jnz L(exit_tail6) + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail1): + add $1, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail2): + add $2, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail3): + add $3, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail4): + add $4, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail5): + add $5, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail6): + add $6, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail7): + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail8): + add $8, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail9): + add $9, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail10): + add $10, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail11): + add $11, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail12): + add $12, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail13): + add $13, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail14): + add $14, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail15): + add $15, %eax + + .p2align 4 L(StartStrcpyPart): mov %rsi, %rcx lea (%rdi, %rax), %rdx diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S deleted file mode 100644 index ff2ab70..0000000 --- a/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S +++ /dev/null @@ -1,685 +0,0 @@ -/* strlen SSE2 without bsf - Copyright (C) 2010-2013 Free Software Foundation, Inc. 
- Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -/* only for strlen case we don't use optimized version for STATIC build just for SHARED */ - -#if (defined SHARED || defined USE_AS_STRCAT || defined USE_AS_STRNLEN) && !defined NOT_IN_libc - -# ifndef USE_AS_STRCAT - -# include - -# define RETURN ret - -# ifndef STRLEN -# define STRLEN __strlen_sse2_no_bsf -# endif - - atom_text_section -ENTRY (STRLEN) -# endif - xor %eax, %eax -# ifdef USE_AS_STRNLEN - mov %rsi, %r8 - sub $4, %rsi - jbe L(len_less4_prolog) -# endif - cmpb $0, (%rdi) - jz L(exit_tail0) - cmpb $0, 1(%rdi) - jz L(exit_tail1) - cmpb $0, 2(%rdi) - jz L(exit_tail2) - cmpb $0, 3(%rdi) - jz L(exit_tail3) - -# ifdef USE_AS_STRNLEN - sub $4, %rsi - jbe L(len_less8_prolog) -# endif - - cmpb $0, 4(%rdi) - jz L(exit_tail4) - cmpb $0, 5(%rdi) - jz L(exit_tail5) - cmpb $0, 6(%rdi) - jz L(exit_tail6) - cmpb $0, 7(%rdi) - jz L(exit_tail7) - -# ifdef USE_AS_STRNLEN - sub $4, %rsi - jbe L(len_less12_prolog) -# endif - - cmpb $0, 8(%rdi) - jz L(exit_tail8) - cmpb $0, 9(%rdi) - jz L(exit_tail9) - cmpb $0, 10(%rdi) - jz L(exit_tail10) - cmpb $0, 11(%rdi) - jz L(exit_tail11) - -# ifdef USE_AS_STRNLEN - sub $4, %rsi - jbe L(len_less16_prolog) -# endif - - cmpb $0, 12(%rdi) - jz L(exit_tail12) - cmpb $0, 13(%rdi) - jz L(exit_tail13) - cmpb $0, 14(%rdi) - jz L(exit_tail14) - cmpb $0, 15(%rdi) - jz L(exit_tail15) - pxor %xmm0, %xmm0 - lea 16(%rdi), %rcx - lea 16(%rdi), %rax - and $-16, %rax - -# ifdef USE_AS_STRNLEN - and $15, %rdi - add %rdi, %rsi - sub $64, %rsi - jbe L(len_less64) -# endif - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - sub $64, %rsi - jbe L(len_less64) -# endif - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - sub $64, %rsi - jbe L(len_less64) -# endif - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz 
L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - sub $64, %rsi - jbe L(len_less64) -# endif - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - mov %rax, %rdx - and $63, %rdx - add %rdx, %rsi -# endif - - and $-0x40, %rax - - .p2align 4 -L(aligned_64): -# ifdef USE_AS_STRNLEN - sub $64, %rsi - jbe L(len_less64) -# endif - pcmpeqb (%rax), %xmm0 - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %r11d - pmovmskb %xmm2, %r10d - pmovmskb %xmm3, %r9d - or %edx, %r9d - or %r11d, %r9d - or %r10d, %r9d - lea 64(%rax), %rax - jz L(aligned_64) - - test %edx, %edx - jnz L(aligned_64_exit_16) - test %r11d, %r11d - jnz L(aligned_64_exit_32) - test %r10d, %r10d - jnz L(aligned_64_exit_48) -L(aligned_64_exit_64): - pmovmskb %xmm3, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_48): - lea -16(%rax), %rax - mov %r10d, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_32): - lea -32(%rax), %rax - mov %r11d, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_16): - lea -48(%rax), %rax -L(aligned_64_exit): -L(exit): - sub %rcx, %rax - test %dl, %dl - jz L(exit_high) - test $0x01, %dl - jnz L(exit_tail0) - - test $0x02, %dl - jnz L(exit_tail1) - - test $0x04, %dl - jnz L(exit_tail2) - - test $0x08, %dl - jnz L(exit_tail3) - - test $0x10, %dl - jnz L(exit_tail4) - - test $0x20, %dl - jnz L(exit_tail5) - - test $0x40, %dl - jnz L(exit_tail6) - add $7, %eax -L(exit_tail0): - RETURN - -L(exit_high): - add $8, %eax - test $0x01, %dh - jnz L(exit_tail0) - - test $0x02, %dh - jnz L(exit_tail1) - - test $0x04, %dh - jnz L(exit_tail2) - - test $0x08, %dh - jnz L(exit_tail3) - - test $0x10, %dh - jnz L(exit_tail4) - - test $0x20, %dh - jnz L(exit_tail5) - - test $0x40, %dh - jnz L(exit_tail6) - add $7, %eax - RETURN - -# ifdef USE_AS_STRNLEN - - .p2align 4 -L(len_less64): - pxor %xmm0, %xmm0 - add $64, %rsi - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - lea 16(%rax), %rax - test %edx, %edx - jnz L(strnlen_exit) - - sub $16, %rsi - jbe L(return_start_len) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - lea 16(%rax), %rax - test %edx, %edx - jnz L(strnlen_exit) - - sub $16, %rsi - jbe L(return_start_len) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - lea 16(%rax), %rax - test %edx, %edx - jnz L(strnlen_exit) - - sub $16, %rsi - jbe L(return_start_len) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - lea 16(%rax), %rax - test %edx, %edx - jnz L(strnlen_exit) - - mov %r8, %rax - ret - - .p2align 4 -L(strnlen_exit): - sub %rcx, %rax - - test %dl, %dl - jz L(strnlen_exit_high) - mov %dl, %cl - and $15, %cl - jz L(strnlen_exit_8) - test $0x01, %dl - jnz L(exit_tail0) - test $0x02, %dl - jnz L(strnlen_exit_tail1) - test $0x04, %dl - jnz L(strnlen_exit_tail2) - sub $4, %rsi - jb L(return_start_len) - lea 3(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_8): - test $0x10, %dl - jnz L(strnlen_exit_tail4) - test $0x20, %dl - jnz L(strnlen_exit_tail5) - test $0x40, %dl - jnz L(strnlen_exit_tail6) - sub $8, %rsi - jb L(return_start_len) - lea 7(%eax), %eax - ret - - 
.p2align 4 -L(strnlen_exit_high): - mov %dh, %ch - and $15, %ch - jz L(strnlen_exit_high_8) - test $0x01, %dh - jnz L(strnlen_exit_tail8) - test $0x02, %dh - jnz L(strnlen_exit_tail9) - test $0x04, %dh - jnz L(strnlen_exit_tail10) - sub $12, %rsi - jb L(return_start_len) - lea 11(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_high_8): - test $0x10, %dh - jnz L(strnlen_exit_tail12) - test $0x20, %dh - jnz L(strnlen_exit_tail13) - test $0x40, %dh - jnz L(strnlen_exit_tail14) - sub $16, %rsi - jb L(return_start_len) - lea 15(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail1): - sub $2, %rsi - jb L(return_start_len) - lea 1(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail2): - sub $3, %rsi - jb L(return_start_len) - lea 2(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail4): - sub $5, %rsi - jb L(return_start_len) - lea 4(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail5): - sub $6, %rsi - jb L(return_start_len) - lea 5(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail6): - sub $7, %rsi - jb L(return_start_len) - lea 6(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail8): - sub $9, %rsi - jb L(return_start_len) - lea 8(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail9): - sub $10, %rsi - jb L(return_start_len) - lea 9(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail10): - sub $11, %rsi - jb L(return_start_len) - lea 10(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail12): - sub $13, %rsi - jb L(return_start_len) - lea 12(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail13): - sub $14, %rsi - jb L(return_start_len) - lea 13(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail14): - sub $15, %rsi - jb L(return_start_len) - lea 14(%eax), %eax - ret - - .p2align 4 -L(return_start_len): - mov %r8, %rax - ret - -/* for prolog only */ - - .p2align 4 -L(len_less4_prolog): - add $4, %rsi - jz L(exit_tail0) - - cmpb $0, (%rdi) - jz L(exit_tail0) - cmp $1, %esi - je L(exit_tail1) - - cmpb $0, 1(%rdi) - jz L(exit_tail1) - cmp $2, %esi - je L(exit_tail2) - - cmpb $0, 2(%rdi) - jz L(exit_tail2) - cmp $3, %esi - je L(exit_tail3) - - cmpb $0, 3(%rdi) - jz L(exit_tail3) - mov $4, %eax - ret - - .p2align 4 -L(len_less8_prolog): - add $4, %rsi - - cmpb $0, 4(%rdi) - jz L(exit_tail4) - cmp $1, %esi - je L(exit_tail5) - - cmpb $0, 5(%rdi) - jz L(exit_tail5) - cmp $2, %esi - je L(exit_tail6) - - cmpb $0, 6(%rdi) - jz L(exit_tail6) - cmp $3, %esi - je L(exit_tail7) - - cmpb $0, 7(%rdi) - jz L(exit_tail7) - mov $8, %eax - ret - - .p2align 4 -L(len_less12_prolog): - add $4, %rsi - - cmpb $0, 8(%rdi) - jz L(exit_tail8) - cmp $1, %esi - je L(exit_tail9) - - cmpb $0, 9(%rdi) - jz L(exit_tail9) - cmp $2, %esi - je L(exit_tail10) - - cmpb $0, 10(%rdi) - jz L(exit_tail10) - cmp $3, %esi - je L(exit_tail11) - - cmpb $0, 11(%rdi) - jz L(exit_tail11) - mov $12, %eax - ret - - .p2align 4 -L(len_less16_prolog): - add $4, %rsi - - cmpb $0, 12(%rdi) - jz L(exit_tail12) - cmp $1, %esi - je L(exit_tail13) - - cmpb $0, 13(%rdi) - jz L(exit_tail13) - cmp $2, %esi - je L(exit_tail14) - - cmpb $0, 14(%rdi) - jz L(exit_tail14) - cmp $3, %esi - je L(exit_tail15) - - cmpb $0, 15(%rdi) - jz L(exit_tail15) - mov $16, %eax - ret -# endif - - .p2align 4 -L(exit_tail1): - add $1, %eax - RETURN - - .p2align 4 -L(exit_tail2): - add $2, %eax - RETURN - - .p2align 4 -L(exit_tail3): - add $3, %eax - RETURN - - .p2align 4 -L(exit_tail4): - add $4, %eax - RETURN - - .p2align 4 -L(exit_tail5): - add $5, %eax - RETURN - - .p2align 4 -L(exit_tail6): - add $6, %eax - RETURN - - .p2align 4 -L(exit_tail7): - add 
$7, %eax - RETURN - - .p2align 4 -L(exit_tail8): - add $8, %eax - RETURN - - .p2align 4 -L(exit_tail9): - add $9, %eax - RETURN - - .p2align 4 -L(exit_tail10): - add $10, %eax - RETURN - - .p2align 4 -L(exit_tail11): - add $11, %eax - RETURN - - .p2align 4 -L(exit_tail12): - add $12, %eax - RETURN - - .p2align 4 -L(exit_tail13): - add $13, %eax - RETURN - - .p2align 4 -L(exit_tail14): - add $14, %eax - RETURN - - .p2align 4 -L(exit_tail15): - add $15, %eax -# ifndef USE_AS_STRCAT - RETURN -END (STRLEN) -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S deleted file mode 100644 index cc4bb57..0000000 --- a/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S +++ /dev/null @@ -1,259 +0,0 @@ -/* strlen SSE2 - Copyright (C) 2011-2013 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#if !defined NOT_IN_libc && (defined SHARED || defined USE_AS_STRCAT) - -# ifndef USE_AS_STRCAT - -# include - -# define RETURN ret - - .section .text.sse2,"ax",@progbits -ENTRY (__strlen_sse2_pminub) - -# endif - xor %rax, %rax - mov %edi, %ecx - and $0x3f, %ecx - pxor %xmm0, %xmm0 - cmp $0x30, %ecx - ja L(next) - movdqu (%rdi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit_less16) - mov %rdi, %rax - and $-16, %rax - jmp L(align16_start) -L(next): - mov %rdi, %rax - and $-16, %rax - pcmpeqb (%rax), %xmm0 - mov $-1, %r10d - sub %rax, %rcx - shl %cl, %r10d - pmovmskb %xmm0, %edx - and %r10d, %edx - jnz L(exit) -L(align16_start): - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - pcmpeqb 16(%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit64) - - pcmpeqb 80(%rax), %xmm0 - add $64, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit64) - - pcmpeqb 80(%rax), %xmm0 - add $64, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit64) - - pcmpeqb 80(%rax), %xmm0 - add $64, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) 
- - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit64) - - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 80(%rax), %xmm0 - add $80, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit) - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 16(%rax), %xmm1 - add $16, %rax - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit) - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 16(%rax), %xmm2 - add $16, %rax - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit) - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 16(%rax), %xmm3 - add $16, %rax - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit) - - add $16, %rax - .p2align 4 - L(align64_loop): - movaps (%rax), %xmm4 - pminub 16(%rax), %xmm4 - movaps 32(%rax), %xmm5 - pminub 48(%rax), %xmm5 - add $64, %rax - pminub %xmm4, %xmm5 - pcmpeqb %xmm0, %xmm5 - pmovmskb %xmm5, %edx - test %edx, %edx - jz L(align64_loop) - - - pcmpeqb -64(%rax), %xmm0 - sub $80, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $64, %rax - RETURN - - .p2align 4 -L(exit): - sub %rdi, %rax -L(exit_less16): - bsf %rdx, %rdx - add %rdx, %rax - RETURN - .p2align 4 -L(exit16): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $16, %rax - RETURN - .p2align 4 -L(exit32): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $32, %rax - RETURN - .p2align 4 -L(exit48): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $48, %rax - RETURN - .p2align 4 -L(exit64): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $64, %rax -# ifndef USE_AS_STRCAT - RETURN - -END (__strlen_sse2_pminub) -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S deleted file mode 100644 index 8d685df..0000000 --- a/sysdeps/x86_64/multiarch/strlen-sse4.S +++ /dev/null @@ -1,84 +0,0 @@ -/* strlen with SSE4 - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Ulrich Drepper . - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . 
*/ - -#if defined SHARED && !defined NOT_IN_libc - -#include - - .section .text.sse4.2,"ax",@progbits -ENTRY (__strlen_sse42) - pxor %xmm1, %xmm1 - movl %edi, %ecx - movq %rdi, %r8 - andq $~15, %rdi - xor %edi, %ecx - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %edx - shrl %cl, %edx - shll %cl, %edx - andl %edx, %edx - jnz L(less16bytes) - pxor %xmm1, %xmm1 - - .p2align 4 -L(more64bytes_loop): - pcmpistri $0x08, 16(%rdi), %xmm1 - jz L(more32bytes) - - pcmpistri $0x08, 32(%rdi), %xmm1 - jz L(more48bytes) - - pcmpistri $0x08, 48(%rdi), %xmm1 - jz L(more64bytes) - - add $64, %rdi - pcmpistri $0x08, (%rdi), %xmm1 - jnz L(more64bytes_loop) - leaq (%rdi,%rcx), %rax - subq %r8, %rax - ret - - .p2align 4 -L(more32bytes): - leaq 16(%rdi,%rcx, 1), %rax - subq %r8, %rax - ret - - .p2align 4 -L(more48bytes): - leaq 32(%rdi,%rcx, 1), %rax - subq %r8, %rax - ret - - .p2align 4 -L(more64bytes): - leaq 48(%rdi,%rcx, 1), %rax - subq %r8, %rax - ret - - .p2align 4 -L(less16bytes): - subq %r8, %rdi - bsfl %edx, %eax - addq %rdi, %rax - ret - -END (__strlen_sse42) - -#endif diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S deleted file mode 100644 index ab29cef..0000000 --- a/sysdeps/x86_64/multiarch/strlen.S +++ /dev/null @@ -1,68 +0,0 @@ -/* Multiple versions of strlen(str) -- determine the length of the string STR. - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Ulrich Drepper . - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - - -/* Define multiple versions only for the definition in libc and for - the DSO. In static binaries we need strlen before the initialization - happened. */ -#if defined SHARED && !defined NOT_IN_libc - .text -ENTRY(strlen) - .type strlen, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __strlen_sse2_pminub(%rip), %rax - testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip) - jnz 2f - leaq __strlen_sse2(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jz 2f - leaq __strlen_sse42(%rip), %rax - ret -2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) - jz 3f - leaq __strlen_sse2_no_bsf(%rip), %rax -3: ret -END(strlen) - -# undef ENTRY -# define ENTRY(name) \ - .type __strlen_sse2, @function; \ - .align 16; \ - .globl __strlen_sse2; \ - .hidden __strlen_sse2; \ - __strlen_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __strlen_sse2, .-__strlen_sse2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strlen calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. 
*/ -# define libc_hidden_builtin_def(name) \ - .globl __GI_strlen; __GI_strlen = __strlen_sse2 -#endif - -#include "../strlen.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S deleted file mode 100644 index 248328d..0000000 --- a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STRNLEN -#define STRLEN __strnlen_sse2_no_bsf -#include "strlen-sse2-no-bsf.S" diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S deleted file mode 100644 index 124f845..0000000 --- a/sysdeps/x86_64/multiarch/strnlen.S +++ /dev/null @@ -1,57 +0,0 @@ -/* multiple version of strnlen - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2011-2013 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - - -/* Define multiple versions only for the definition in libc. */ -#ifndef NOT_IN_libc - - .text -ENTRY(__strnlen) - .type __strnlen, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __strnlen_sse2(%rip), %rax - testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) - jz 2f - leaq __strnlen_sse2_no_bsf(%rip), %rax -2: ret -END(__strnlen) - -# undef ENTRY -# define ENTRY(name) \ - .type __strnlen_sse2, @function; \ - .align 16; \ - .globl __strnlen_sse2; \ - .hidden __strnlen_sse2; \ - __strnlen_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2 - -# undef libc_hidden_def -# define libc_hidden_def(name) \ - .globl __GI_strnlen; __GI_strnlen = __strnlen_sse2 -#endif - -#include "../strnlen.S" diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S index 287ffd2..8bea6fb 100644 --- a/sysdeps/x86_64/strcat.S +++ b/sysdeps/x86_64/strcat.S @@ -21,6 +21,7 @@ #include #include "asm-syntax.h" +/* Will be removed when new strcpy implementation gets merged. */ .text ENTRY (strcat) diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index 4bdca0a..6abb3f0 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -1,6 +1,5 @@ -/* strlen(str) -- determine the length of the string STR. - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Ulrich Drepper . +/* SSE2 version of strlen. + Copyright (C) 2012-2013 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,83 +18,222 @@ #include +/* Long lived register in strlen(s), strnlen(s, n) are: - .text + %xmm11 - zero + %rdi - s + %r10 (s+n) & (~(64-1)) + %r11 s+n +*/ + + +.text ENTRY(strlen) + +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. 
*/ +#define FIND_ZERO \ + pcmpeqb (%rax), %xmm8; \ + pcmpeqb 16(%rax), %xmm9; \ + pcmpeqb 32(%rax), %xmm10; \ + pcmpeqb 48(%rax), %xmm11; \ + pmovmskb %xmm8, %esi; \ + pmovmskb %xmm9, %edx; \ + pmovmskb %xmm10, %r8d; \ + pmovmskb %xmm11, %ecx; \ + salq $16, %rdx; \ + salq $16, %rcx; \ + orq %rsi, %rdx; \ + orq %r8, %rcx; \ + salq $32, %rcx; \ + orq %rcx, %rdx; + +#ifdef AS_STRNLEN +/* Do not read anything when n==0. */ + test %rsi, %rsi + jne L(n_nonzero) xor %rax, %rax - mov %edi, %ecx - and $0x3f, %ecx - pxor %xmm0, %xmm0 - cmp $0x30, %ecx - ja L(next) - movdqu (%rdi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit_less16) - mov %rdi, %rax - and $-16, %rax - jmp L(align16_start) -L(next): - mov %rdi, %rax - and $-16, %rax - pcmpeqb (%rax), %xmm0 - mov $-1, %esi - sub %rax, %rcx - shl %cl, %esi - pmovmskb %xmm0, %edx - and %esi, %edx - jnz L(exit) -L(align16_start): - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - .p2align 4 -L(align16_loop): - pcmpeqb 16(%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) + ret +L(n_nonzero): - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) +/* Initialize long lived registers. */ - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) + add %rdi, %rsi + mov %rsi, %r10 + and $-64, %r10 + mov %rsi, %r11 +#endif - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - lea 64(%rax), %rax + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + movq %rdi, %rax + movq %rdi, %rcx + andq $4095, %rcx +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ + cmpq $4047, %rcx +/* We cannot unify this branching as it would be ~6 cycles slower. */ + ja L(cross_page) + +#ifdef AS_STRNLEN +/* Test if end is among first 64 bytes. */ +# define STRNLEN_PROLOG \ + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) +#else +# define STRNLEN_PROLOG andq $-64, %rax; +#endif + +/* Ignore bits in mask that come before start of string. */ +#define PROLOG(lab) \ + movq %rdi, %rcx; \ + xorq %rax, %rcx; \ + STRNLEN_PROLOG; \ + sarq %cl, %rdx; \ + test %rdx, %rdx; \ + je L(lab); \ + bsfq %rdx, %rax; \ + ret + +#ifdef AS_STRNLEN + andq $-16, %rax + FIND_ZERO +#else + /* Test first 16 bytes unaligned. */ + movdqu (%rax), %xmm12 + pcmpeqb %xmm8, %xmm12 + pmovmskb %xmm12, %edx test %edx, %edx - jz L(align16_loop) -L(exit): - sub %rdi, %rax -L(exit_less16): - bsf %rdx, %rdx - add %rdx, %rax + je L(next48_bytes) + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + ret + +L(next48_bytes): +/* Same as FIND_ZERO except we do not check first 16 bytes. */ + andq $-16, %rax + pcmpeqb 16(%rax), %xmm9 + pcmpeqb 32(%rax), %xmm10 + pcmpeqb 48(%rax), %xmm11 + pmovmskb %xmm9, %edx + pmovmskb %xmm10, %r8d + pmovmskb %xmm11, %ecx + salq $16, %rdx + salq $16, %rcx + orq %r8, %rcx + salq $32, %rcx + orq %rcx, %rdx +#endif + + /* When no zero byte is found xmm9-11 are zero so we do not have to + zero them. */ + PROLOG(loop) + + .p2align 4 +L(cross_page): + andq $-64, %rax + FIND_ZERO + PROLOG(loop_init) + +#ifdef AS_STRNLEN +/* We must do this check to correctly handle strnlen (s, -1). 
*/ +L(strnlen_ret): + bts %rsi, %rdx + sarq %cl, %rdx + test %rdx, %rdx + je L(loop_init) + bsfq %rdx, %rax ret +#endif + .p2align 4 +L(loop_init): + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 +#ifdef AS_STRNLEN + .p2align 4 +L(loop): + + addq $64, %rax + cmpq %rax, %r10 + je L(exit_end) + + movdqa (%rax), %xmm8 + pminub 16(%rax), %xmm8 + pminub 32(%rax), %xmm8 + pminub 48(%rax), %xmm8 + pcmpeqb %xmm11, %xmm8 + pmovmskb %xmm8, %edx + testl %edx, %edx + jne L(exit) + jmp L(loop) + .p2align 4 -L(exit16): - sub %rdi, %rax - bsf %rdx, %rdx - lea 16(%rdx,%rax), %rax +L(exit_end): + cmp %rax, %r11 + je L(first) /* Do not read when end is at page boundary. */ + pxor %xmm8, %xmm8 + FIND_ZERO + +L(first): + bts %r11, %rdx + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax ret + .p2align 4 -L(exit32): - sub %rdi, %rax - bsf %rdx, %rdx - lea 32(%rdx,%rax), %rax +L(exit): + pxor %xmm8, %xmm8 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax ret + +#else + + /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ + .p2align 4 +L(loop): + + movdqa 64(%rax), %xmm8 + pminub 80(%rax), %xmm8 + pminub 96(%rax), %xmm8 + pminub 112(%rax), %xmm8 + pcmpeqb %xmm11, %xmm8 + pmovmskb %xmm8, %edx + testl %edx, %edx + jne L(exit64) + + subq $-128, %rax + + movdqa (%rax), %xmm8 + pminub 16(%rax), %xmm8 + pminub 32(%rax), %xmm8 + pminub 48(%rax), %xmm8 + pcmpeqb %xmm11, %xmm8 + pmovmskb %xmm8, %edx + testl %edx, %edx + jne L(exit0) + jmp L(loop) + .p2align 4 -L(exit48): - sub %rdi, %rax - bsf %rdx, %rdx - lea 48(%rdx,%rax), %rax +L(exit64): + addq $64, %rax +L(exit0): + pxor %xmm8, %xmm8 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax ret + +#endif + END(strlen) libc_hidden_builtin_def (strlen) diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S index 6e53503..d3c43ac 100644 --- a/sysdeps/x86_64/strnlen.S +++ b/sysdeps/x86_64/strnlen.S @@ -1,63 +1,6 @@ -/* strnlen(str,maxlen) -- determine the length of the string STR up to MAXLEN. - Copyright (C) 2010-2013 Free Software Foundation, Inc. - Contributed by Ulrich Drepper . - This file is part of the GNU C Library. +#define AS_STRNLEN +#define strlen __strnlen +#include "strlen.S" - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . 
*/ - -#include - - - .text -ENTRY(__strnlen) - movq %rsi, %rax - testq %rsi, %rsi - jz 3f - pxor %xmm2, %xmm2 - movq %rdi, %rcx - movq %rdi, %r8 - movq $16, %r9 - andq $~15, %rdi - movdqa %xmm2, %xmm1 - pcmpeqb (%rdi), %xmm2 - orl $0xffffffff, %r10d - subq %rdi, %rcx - shll %cl, %r10d - subq %rcx, %r9 - pmovmskb %xmm2, %edx - andl %r10d, %edx - jnz 1f - subq %r9, %rsi - jbe 3f - -2: movdqa 16(%rdi), %xmm0 - leaq 16(%rdi), %rdi - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jnz 1f - subq $16, %rsi - jnbe 2b -3: ret - -1: subq %r8, %rdi - bsfl %edx, %edx - addq %rdi, %rdx - cmpq %rdx, %rax - cmovnbq %rdx, %rax - ret -END(__strnlen) -weak_alias (__strnlen, strnlen) -libc_hidden_def (strnlen) +weak_alias (__strnlen, strnlen); +libc_hidden_builtin_def (strnlen) -- 1.7.4.4
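
For reviewers who prefer to read the idea in C before the assembly, here is a rough intrinsics sketch of the core algorithm in the new sysdeps/x86_64/strlen.S: fold the four 16-byte vectors of each 64-byte block with PMINUB so a single PCMPEQB/PMOVMSKB per block detects a NUL, then rebuild the 64-bit zero mask for the hit block and take its lowest set bit. This is not part of the patch; the function names are made up, it uses the aligned "cross_page"-style prologue for every input (the real code also has an unaligned first-16-bytes fast path and an AS_STRNLEN variant), and it assumes a GCC/Clang-style __builtin_ctzll.

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stddef.h>
#include <stdint.h>

/* Zero-byte bitmask for a 64-byte aligned block: bit i set iff p[i] == 0.  */
static inline uint64_t
find_zero_mask (const char *p)
{
  const __m128i zero = _mm_setzero_si128 ();
  uint64_t m0 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p), zero));
  uint64_t m1 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 16)), zero));
  uint64_t m2 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 32)), zero));
  uint64_t m3 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 48)), zero));
  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
}

/* Illustrative only; name and structure are not from the patch.  */
size_t
strlen_sse2_sketch (const char *s)
{
  const __m128i zero = _mm_setzero_si128 ();
  /* Round down to the containing 64-byte block.  The over-read stays
     within one page, and bits for bytes before S are shifted out.  */
  const char *block = (const char *) ((uintptr_t) s & ~(uintptr_t) 63);
  uint64_t mask = find_zero_mask (block) >> ((uintptr_t) s & 63);
  if (mask != 0)
    return (size_t) __builtin_ctzll (mask);

  for (;;)
    {
      block += 64;
      /* PMINUB of the four vectors has a zero byte iff any of them does,
         so one compare + movemask suffices per 64-byte block.  */
      __m128i min = _mm_min_epu8 (_mm_load_si128 ((const __m128i *) block),
                                  _mm_load_si128 ((const __m128i *) (block + 16)));
      min = _mm_min_epu8 (min, _mm_load_si128 ((const __m128i *) (block + 32)));
      min = _mm_min_epu8 (min, _mm_load_si128 ((const __m128i *) (block + 48)));
      if (_mm_movemask_epi8 (_mm_cmpeq_epi8 (min, zero)) != 0)
        {
          uint64_t m = find_zero_mask (block);   /* nonzero here */
          return (size_t) (block - s) + __builtin_ctzll (m);
        }
    }
}

The cmpq $4047 test in the assembly only selects between this aligned prologue and the cheaper unaligned fast path: page offsets up to 4047 round down to at most 4032 after the andq $-16, so every load made by the fast path ends at or before the 4 KiB page boundary.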
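
The AS_STRNLEN variant bounds the scan by pretending there is a NUL byte at the limit: it sets the corresponding bit in the 64-bit zero mask (the bts instructions) before taking the lowest set bit. A minimal sketch of that clamping step follows, under the assumption that the caller passes the limit's offset within the current 64-byte block; the name first_stop is made up and not from the patch.

#include <stddef.h>
#include <stdint.h>

/* Given the zero-byte bitmask of a 64-byte block and the position of the
   S+N limit inside that block, return the first "stop" offset: the first
   real NUL or the limit, whichever comes first.  Mirrors the
   "bts %r11, %rdx" idea in the patch; illustrative only.  */
static inline size_t
first_stop (uint64_t zero_mask, uintptr_t limit)
{
  zero_mask |= (uint64_t) 1 << (limit & 63);   /* fake NUL at the limit */
  return (size_t) __builtin_ctzll (zero_mask); /* lowest set bit wins */
}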