This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PING][PATCH v3 neleai/string-x64] Optimize strcmp, strncmp, strcasecmp, strncasecmp
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Thu, 2 Jul 2015 01:38:12 +0200
- Subject: [PING][PATCH v3 neleai/string-x64] Optimize strcmp, strncmp, strcasecmp, strncasecmp
- Authentication-results: sourceware.org; auth=none
- References: <20150624111855 dot GA15322 at domone>
On Wed, Jun 24, 2015 at 01:18:55PM +0200, Ondřej Bílka wrote:
> Hi,
>
> As I wrote previous patches and was about to write strncasecmp I
> realized that it would be easier to write and review that using macros
> like the existing ones.
>
> So here is a condensed version of the previous patches. I will use it so
> we don't have to write the same optimization twice in the future.
>
> Also with these an sse42 implementation could finally be removed.
>
>
> * sysdeps/x86_64/locale-defines.sym: Add LOCALE_TOLOWER.
> * sysdeps/x86_64/multiarch/Makefile (routines):
> Add strcmp-avx2, strncmp-avx2, strcasecmp-avx2,
> strncasecmp-avx2
> * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update.
> * sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Implement
> strncasecmp, strncmp, strcasecmp.
> * sysdeps/x86_64/multiarch/strcmp-sse42.S: Remove.
> * sysdeps/x86_64/multiarch/strcmp-avx2.S: New file.
> * sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S: Likewise.
> * sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S: Likewise.
> * sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S: Likewise.
> * sysdeps/x86_64/multiarch/strncase_l-avx2.S: Likewise.
> * sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S: Likewise.
> * sysdeps/x86_64/multiarch/strncmp-avx2.S: Likewise.
> ---
> sysdeps/x86_64/locale-defines.sym | 1 +
> sysdeps/x86_64/multiarch/Makefile | 5 +-
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 37 +-
> sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 5 +
> .../x86_64/multiarch/strcasecmp_l-sse2-unaligned.S | 3 +
> sysdeps/x86_64/multiarch/strcmp-avx2.S | 3 +
> sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 469 ++++-
> sysdeps/x86_64/multiarch/strcmp-sse42.S | 1792 --------------------
> sysdeps/x86_64/multiarch/strcmp.S | 87 +-
> sysdeps/x86_64/multiarch/strncase_l-avx2.S | 6 +
> .../x86_64/multiarch/strncase_l-sse2-unaligned.S | 4 +
> sysdeps/x86_64/multiarch/strncmp-avx2.S | 4 +
> sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S | 3 +
> 13 files changed, 467 insertions(+), 1952 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
> create mode 100644 sysdeps/x86_64/multiarch/strcmp-avx2.S
> delete mode 100644 sysdeps/x86_64/multiarch/strcmp-sse42.S
> create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
> create mode 100644 sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S
> create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2.S
> create mode 100644 sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S
>
> diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym
> index aebff9a..804debb 100644
> --- a/sysdeps/x86_64/locale-defines.sym
> +++ b/sysdeps/x86_64/locale-defines.sym
> @@ -8,4 +8,5 @@ LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales)
> LC_CTYPE
> _NL_CTYPE_NONASCII_CASE
> LOCALE_DATA_VALUES offsetof (struct __locale_data, values)
> +LOCALE_TOLOWER offsetof (struct __locale_struct, __ctype_tolower)
> SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0])
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 679db2a..8094162 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -7,12 +7,13 @@ endif
> ifeq ($(subdir),string)
>
> sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
> - strcmp-sse2-unaligned strncmp-ssse3 \
> + strcmp-sse2-unaligned strncmp-sse2-unaligned strncmp-ssse3 \
> memcpy-ssse3 \
> memcpy-sse2-unaligned mempcpy-ssse3 \
> memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
> memmove-ssse3-back strcasecmp_l-ssse3 \
> + strcasecmp_l-sse2-unaligned strncase_l-sse2-unaligned \
> strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
> strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
> strcpy-sse2-unaligned strncpy-sse2-unaligned \
> @@ -29,7 +30,7 @@ CFLAGS-strspn-c.c += -msse4
> endif
>
> ifeq (yes,$(config-cflags-avx2))
> -sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2
> +sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 strcmp-avx2 strncmp-avx2 strcasecmp_l-avx2 strncase_l-avx2
> endif
> endif
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index b3dbe65..8c71030 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -94,20 +94,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */
> IFUNC_IMPL (i, name, strcasecmp,
> - IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_AVX,
> - __strcasecmp_avx)
> - IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
> - __strcasecmp_sse42)
> + IFUNC_IMPL_ADD (array, i, strcasecmp, 1,
> + __strcasecmp_sse2_unaligned)
> + IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_AVX2,
> + __strcasecmp_avx2)
> IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
> __strcasecmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */
> IFUNC_IMPL (i, name, strcasecmp_l,
> - IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_AVX,
> - __strcasecmp_l_avx)
> - IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSE4_2,
> - __strcasecmp_l_sse42)
> + IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
> + __strcasecmp_sse2_unaligned_l)
> IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSSE3,
> __strcasecmp_l_ssse3)
> IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
> @@ -130,7 +128,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcmp.S. */
> IFUNC_IMPL (i, name, strcmp,
> - IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42)
> + IFUNC_IMPL_ADD (array, i, strcmp, HAS_AVX2, __strcmp_avx2)
> IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
> @@ -150,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.S. */
> IFUNC_IMPL (i, name, strncasecmp,
> - IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_AVX,
> - __strncasecmp_avx)
> - IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSE4_2,
> - __strncasecmp_sse42)
> + IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_AVX2,
> + __strncasecmp_avx2)
> + IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
> + __strncasecmp_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSSE3,
> __strncasecmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
> @@ -161,10 +159,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.S. */
> IFUNC_IMPL (i, name, strncasecmp_l,
> - IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_AVX,
> - __strncasecmp_l_avx)
> - IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSE4_2,
> - __strncasecmp_l_sse42)
> + IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_AVX2,
> + __strncasecmp_avx2_l)
> + IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
> + __strncasecmp_sse2_unaligned_l)
> IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSSE3,
> __strncasecmp_l_ssse3)
> IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
> @@ -261,8 +259,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncmp.S. */
> IFUNC_IMPL (i, name, strncmp,
> - IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2,
> - __strncmp_sse42)
> + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2_unaligned)
> + IFUNC_IMPL_ADD (array, i, strncmp, HAS_AVX2, __strncmp_avx2)
> +
> IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSSE3,
> __strncmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> new file mode 100644
> index 0000000..d10379f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> @@ -0,0 +1,5 @@
> +#define AS_STRCASECMP
> +#define USE_AVX2
> +#define __strcasecmp_sse2_unaligned __strcasecmp_avx2
> +#define STRCMP __strcasecmp_avx2_l
> +#include "strcmp-sse2-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
> new file mode 100644
> index 0000000..e2ed03f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
> @@ -0,0 +1,3 @@
> +#define AS_STRCASECMP
> +#define STRCMP __strcasecmp_sse2_unaligned_l
> +#include "strcmp-sse2-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> new file mode 100644
> index 0000000..606df63
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> @@ -0,0 +1,3 @@
> +#define USE_AVX2
> +#define STRCMP __strcmp_avx2
> +#include "strcmp-sse2-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
> index 20b65fa..ef67fb0 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
> @@ -18,29 +18,127 @@
>
> #include "sysdep.h"
>
> -ENTRY ( __strcmp_sse2_unaligned)
> - movl %edi, %eax
> - xorl %edx, %edx
> +#ifndef STRCMP
> +# define STRCMP __strcmp_sse2_unaligned
> +#endif
> +
> +#ifdef AS_STRCASECMP
> +# include "locale-defines.h"
> +
> +# ifdef AS_STRNCMP
> +ENTRY (__strncasecmp_sse2_unaligned)
> + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> + mov %fs:(%rax), %rcx
> + // XXX 5 byte should be before the function
> + /* 5-byte NOP. */
> + .byte 0x0f,0x1f,0x44,0x00,0x00
> +
> +END (__strncasecmp_sse2_unaligned)
> +
> +ENTRY (STRCMP)
> + test %rdx, %rdx
> + je L(ret_zero)
> + mov LOCALE_TOLOWER(%rcx), %r11
> +# else
> +ENTRY (__strcasecmp_sse2_unaligned)
> + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> + mov %fs:(%rax), %rdx
> + // XXX 5 byte should be before the function
> + /* 5-byte NOP. */
> + .byte 0x0f,0x1f,0x44,0x00,0x00
> +
> +END (__strcasecmp_sse2_unaligned)
> +
> +ENTRY (STRCMP)
> + mov LOCALE_TOLOWER(%rdx), %r11
> +# endif
> + movzbl (%rdi), %eax
> + movzbl (%rsi), %ecx
> + movl (%r11,%rax,4), %eax
> + subl (%r11,%rcx,4), %eax
> + je L(next)
> +L(return):
> + ret
> +L(next):
> + test %ecx, %ecx
> + je L(return)
> + leaq 1(%rsi), %rsi
> + leaq 1(%rdi), %rdi
> +#ifdef AS_STRNCMP
> + sub $1, %rdx
> +#endif
> +
> +#else
> +ENTRY (STRCMP)
> +#endif
> +
> +#ifdef AS_STRNCMP
> + lea -1(%rdx), %r10
> + test %rdx, %rdx
> + je L(ret_zero)
> +L(back_to_start):
> + xor %rdx, %rdx
> +#endif
> +
> pxor %xmm7, %xmm7
> - orl %esi, %eax
> + movl %esi, %eax
> + andl $4095, %eax
> + cmpl $4032, %eax
> + jg L(cross_page)
> +
> + movl %edi, %eax
> andl $4095, %eax
> cmpl $4032, %eax
> jg L(cross_page)
> +#ifdef AS_STRNCMP
> + cmp $64, %r10
> + jae L(dont_set_mask)
> + bts %r10, %rdx
> +L(dont_set_mask):
> +#endif
> +
> movdqu (%rdi), %xmm1
> movdqu (%rsi), %xmm0
> pcmpeqb %xmm1, %xmm0
> pminub %xmm1, %xmm0
> - pxor %xmm1, %xmm1
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %eax
> - testq %rax, %rax
> + pcmpeqb %xmm7, %xmm0
> + pmovmskb %xmm0, %ecx
> +#ifdef AS_STRNCMP
> + or %dx, %cx
> +#else
> + test %ecx, %ecx
> +#endif
> je L(next_48_bytes)
> -L(return):
> - bsfq %rax, %rdx
> +#ifdef AS_STRCASECMP
> +L(caseloop1):
> + bsf %ecx, %r9d
> + movzbl (%rdi,%r9), %eax
> + movzbl (%rsi,%r9), %r8d
> + movl (%r11,%rax,4), %eax
> + subl (%r11,%r8,4), %eax
> + jne L(return)
> + test %r8d, %r8d
> + je L(return)
> +# ifdef AS_STRNCMP
> + cmp %r9, %r10
> + je L(return)
> +# endif
> + leaq -1(%rcx), %rax
> + andq %rax, %rcx
> + je L(next_48_bytes)
> + jmp L(caseloop1)
> +#else
> + bsf %ecx, %edx
> movzbl (%rdi, %rdx), %eax
> movzbl (%rsi, %rdx), %edx
> subl %edx, %eax
> ret
> +#endif
> +#ifdef AS_STRNCMP
> + L(ret_zero):
> + xor %eax, %eax
> + ret
> +#endif
>
> .p2align 4
> L(next_48_bytes):
> @@ -50,49 +148,108 @@ L(next_48_bytes):
> pcmpeqb %xmm6, %xmm3
> movdqu 32(%rsi), %xmm2
> pminub %xmm6, %xmm3
> - pcmpeqb %xmm1, %xmm3
> + pcmpeqb %xmm7, %xmm3
> movdqu 48(%rdi), %xmm4
> pcmpeqb %xmm5, %xmm2
> - pmovmskb %xmm3, %edx
> movdqu 48(%rsi), %xmm0
> pminub %xmm5, %xmm2
> - pcmpeqb %xmm1, %xmm2
> + pcmpeqb %xmm7, %xmm2
> pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm2, %eax
> - salq $16, %rdx
> - pminub %xmm4, %xmm0
> - pcmpeqb %xmm1, %xmm0
> + pmovmskb %xmm2, %eax
> salq $32, %rax
> +#ifdef AS_STRNCMP
> + or %rdx, %rax
> +#endif
> + pmovmskb %xmm3, %edx
> + sal $16, %edx
> + pminub %xmm4, %xmm0
> + pcmpeqb %xmm7, %xmm0
> orq %rdx, %rax
> - pmovmskb %xmm0, %ecx
> - movq %rcx, %rdx
> - salq $48, %rdx
> - orq %rdx, %rax
> + pmovmskb %xmm0, %ecx
> + salq $48, %rcx
> + orq %rax, %rcx
> + je L(main_loop_header)
> +#ifdef AS_STRCASECMP
> +L(caseloop2):
> + bsf %rcx, %r9
> + movzbl (%rdi,%r9), %eax
> + movzbl (%rsi,%r9), %r8d
> + movl (%r11,%rax,4), %eax
> + subl (%r11,%r8,4), %eax
> jne L(return)
> + test %r8d, %r8d
> + je L(return)
> +# ifdef AS_STRNCMP
> + cmp %r9, %r10
> + je L(return)
> +# endif
> + leaq -1(%rcx), %rax
> + andq %rax, %rcx
> + je L(main_loop_header)
> + jmp L(caseloop2)
> +#else
> + bsf %rcx, %rdx
> + movzbl (%rdi, %rdx), %eax
> + movzbl (%rsi, %rdx), %edx
> + subl %edx, %eax
> + ret
> +#endif
> +
> L(main_loop_header):
> +#ifdef USE_AVX2
> + vpxor %xmm7, %xmm7, %xmm7
> +#endif
> leaq 64(%rdi), %rdx
> - movl $4096, %ecx
> - pxor %xmm9, %xmm9
> andq $-64, %rdx
> +# ifdef AS_STRNCMP
> + addq %rdi, %r10
> + subq %rdx, %r10
> +# endif
> subq %rdi, %rdx
> leaq (%rdi, %rdx), %rax
> addq %rsi, %rdx
> - movq %rdx, %rsi
> - andl $4095, %esi
> - subq %rsi, %rcx
> - shrq $6, %rcx
> - movq %rcx, %rsi
> - jmp L(loop_start)
> + movl $4096, %esi
> + mov %edx, %ecx
> + andl $4095, %ecx
> + sub %ecx, %esi
> + shr $6, %esi
> +#ifdef AS_STRNCMP
> + mov %r10, %r9
> + addq %rdx, %r10
> + shr $6, %r9
> + cmp %r9, %rsi
> + jb L(dont_set_page_bound)
> + mov %r9, %rsi
> +L(dont_set_page_bound):
> +#endif
>
> .p2align 4
> L(loop):
> + add $-1, %rsi
> + ja L(loop_cross_page)
> +L(back_to_loop):
> +#ifdef USE_AVX2
> + vmovdqa (%rax), %ymm4
> + vmovdqa 32(%rax), %ymm5
> + vmovdqu (%rdx), %ymm0
> + vmovdqu 32(%rdx), %ymm1
> + vpcmpeqb %ymm4, %ymm0, %ymm0
> + vpminub %ymm4, %ymm0, %ymm0
> + vpcmpeqb %ymm5, %ymm1, %ymm1
> + vpminub %ymm5, %ymm1, %ymm1
> + vpminub %ymm0, %ymm1, %ymm2
> + vpcmpeqb %ymm7, %ymm2, %ymm2
> addq $64, %rax
> addq $64, %rdx
> -L(loop_start):
> - testq %rsi, %rsi
> - leaq -1(%rsi), %rsi
> - je L(loop_cross_page)
> -L(back_to_loop):
> + vpmovmskb %ymm2, %edi
> + test %edi, %edi
> + je L(loop)
> + shl $32, %rdi
> + vpcmpeqb %ymm7, %ymm0, %ymm0
> + vpmovmskb %ymm0, %ecx
> + or %rdi, %rcx
> + vzeroupper
> +#else
> movdqu (%rdx), %xmm0
> movdqu 16(%rdx), %xmm1
> movdqa (%rax), %xmm2
> @@ -104,61 +261,99 @@ L(back_to_loop):
> movdqu 48(%rdx), %xmm6
> pminub %xmm3, %xmm1
> movdqa 32(%rax), %xmm2
> - pminub %xmm1, %xmm0
> movdqa 48(%rax), %xmm3
> pcmpeqb %xmm2, %xmm5
> pcmpeqb %xmm3, %xmm6
> + addq $64, %rax
> pminub %xmm2, %xmm5
> pminub %xmm3, %xmm6
> - pminub %xmm5, %xmm0
> - pminub %xmm6, %xmm0
> - pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm0, %ecx
> + addq $64, %rdx
> + pminub %xmm5, %xmm6
> + pminub %xmm1, %xmm6
> + pminub %xmm0, %xmm6
> + pcmpeqb %xmm7, %xmm6
> + pmovmskb %xmm6, %ecx
> testl %ecx, %ecx
> je L(loop)
> - pcmpeqb %xmm7, %xmm5
> - movdqu (%rdx), %xmm0
> - pcmpeqb %xmm7, %xmm1
> - movdqa (%rax), %xmm2
> - pcmpeqb %xmm2, %xmm0
> - pminub %xmm2, %xmm0
> - pcmpeqb %xmm7, %xmm6
> pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm1, %ecx
> - pmovmskb %xmm5, %r8d
> - pmovmskb %xmm0, %edi
> - salq $16, %rcx
> + pcmpeqb %xmm7, %xmm1
> + pcmpeqb %xmm7, %xmm5
> + pmovmskb %xmm0, %edi
> + pmovmskb %xmm1, %r9d
> + pmovmskb %xmm5, %r8d
> + salq $48, %rcx
> salq $32, %r8
> - pmovmskb %xmm6, %esi
> orq %r8, %rcx
> orq %rdi, %rcx
> - salq $48, %rsi
> - orq %rsi, %rcx
> + sal $16, %r9d
> + orq %r9, %rcx
> +#endif
> +#ifdef AS_STRCASECMP
> +L(caseloop3):
> + bsf %rcx, %r9
> + movzbl -64(%rax,%r9), %edi
> + movzbl -64(%rdx,%r9), %r8d
> + movl (%r11,%rdi,4), %edi
> + subl (%r11,%r8,4), %edi
> + jne L(return2)
> + test %r8d, %r8d
> + je L(return2)
> + leaq -1(%rcx), %rdi
> + andq %rdi, %rcx
> + je L(loop)
> + jmp L(caseloop3)
> +L(return2):
> + mov %rdi, %rax
> + ret
> +#else
> bsfq %rcx, %rcx
> - movzbl (%rax, %rcx), %eax
> - movzbl (%rdx, %rcx), %edx
> + movzbl -64(%rax, %rcx), %eax
> + movzbl -64(%rdx, %rcx), %edx
> subl %edx, %eax
> ret
> +#endif
>
> .p2align 4
> L(loop_cross_page):
> - xor %r10, %r10
> - movq %rdx, %r9
> - and $63, %r9
> - subq %r9, %r10
> -
> - movdqa (%rdx, %r10), %xmm0
> - movdqa 16(%rdx, %r10), %xmm1
> - movdqu (%rax, %r10), %xmm2
> - movdqu 16(%rax, %r10), %xmm3
> +#ifdef AS_STRNCMP
> + mov %r10, %r9
> + sub %rdx, %r9
> + cmp $64, %r9
> + jb L(prepare_back_to_start)
> +#endif
> +
> + mov %edx, %ecx
> + and $63, %ecx
> + neg %rcx
> +#ifdef USE_AVX2
> + vmovdqu (%rax, %rcx), %ymm4
> + vmovdqu 32(%rax, %rcx), %ymm5
> + vmovdqa (%rdx, %rcx), %ymm0
> + vmovdqa 32(%rdx, %rcx), %ymm1
> + vpcmpeqb %ymm4, %ymm0, %ymm0
> + vpminub %ymm4, %ymm0, %ymm0
> + vpcmpeqb %ymm5, %ymm1, %ymm1
> + vpminub %ymm5, %ymm1, %ymm1
> + vpminub %ymm0, %ymm1, %ymm2
> + vpcmpeqb %ymm7, %ymm2, %ymm2
> + vpmovmskb %ymm2, %esi
> + shl $32, %rsi
> + vpcmpeqb %ymm7, %ymm0, %ymm0
> + vpmovmskb %ymm0, %edi
> + or %rsi, %rdi
> +#else
> + movdqa (%rdx, %rcx), %xmm0
> + movdqa 16(%rdx, %rcx), %xmm1
> + movdqu (%rax, %rcx), %xmm2
> + movdqu 16(%rax, %rcx), %xmm3
> pcmpeqb %xmm2, %xmm0
> - movdqa 32(%rdx, %r10), %xmm5
> + movdqa 32(%rdx, %rcx), %xmm5
> pcmpeqb %xmm3, %xmm1
> pminub %xmm2, %xmm0
> - movdqa 48(%rdx, %r10), %xmm6
> + movdqa 48(%rdx, %rcx), %xmm6
> pminub %xmm3, %xmm1
> - movdqu 32(%rax, %r10), %xmm2
> - movdqu 48(%rax, %r10), %xmm3
> + movdqu 32(%rax, %rcx), %xmm2
> + movdqu 48(%rax, %rcx), %xmm3
> pcmpeqb %xmm2, %xmm5
> pcmpeqb %xmm3, %xmm6
> pminub %xmm2, %xmm5
> @@ -169,41 +364,143 @@ L(loop_cross_page):
> pcmpeqb %xmm7, %xmm5
> pcmpeqb %xmm7, %xmm6
>
> - pmovmskb %xmm1, %ecx
> - pmovmskb %xmm5, %r8d
> - pmovmskb %xmm0, %edi
> - salq $16, %rcx
> + pmovmskb %xmm1, %ecx
> + pmovmskb %xmm5, %r8d
> + pmovmskb %xmm0, %edi
> + sal $16, %ecx
> salq $32, %r8
> - pmovmskb %xmm6, %esi
> + pmovmskb %xmm6, %esi
> orq %r8, %rdi
> orq %rcx, %rdi
> salq $48, %rsi
> orq %rsi, %rdi
> - movq %r9, %rcx
> - movq $63, %rsi
> +#endif
> + mov %edx, %ecx
> + mov $63, %esi
> +#ifdef AS_STRNCMP
> + shr $6, %r9
> + sub $1, %r9
> + cmp %r9, %rsi
> + jb L(dont_set_bound2)
> + mov %r9, %rsi
> +L(dont_set_bound2):
> +#endif
> shrq %cl, %rdi
> test %rdi, %rdi
> je L(back_to_loop)
> +#ifdef USE_AVX2
> + vzeroupper
> +#endif
> +
> +#ifdef AS_STRCASECMP
> + mov %rdi, %rcx
> +L(caseloop4):
> + bsf %rcx, %r9
> + movzbl (%rax,%r9), %edi
> + movzbl (%rdx,%r9), %r8d
> + movl (%r11,%rdi,4), %edi
> + subl (%r11,%r8,4), %edi
> + jne L(return2)
> + test %r8d, %r8d
> + je L(return2)
> + leaq -1(%rcx), %rdi
> + andq %rdi, %rcx
> + je L(back_to_loop)
> + jmp L(caseloop4)
> +#else
> bsfq %rdi, %rcx
> movzbl (%rax, %rcx), %eax
> movzbl (%rdx, %rcx), %edx
> subl %edx, %eax
> ret
> +#endif
> +#ifdef AS_STRNCMP
> +L(prepare_back_to_start):
> +# ifdef USE_AVX2
> + vzeroupper
> +# endif
> + mov %r9, %r10
> + mov %rdx, %rsi
> + mov %rax, %rdi
> + jmp L(back_to_start)
> +#endif
>
> +
> +L(cross_page):
> + xorl %edx, %edx
> .p2align 4
> L(cross_page_loop):
> - cmpb %cl, %al
> - jne L(different)
> - addq $1, %rdx
> - cmpq $64, %rdx
> - je L(main_loop_header)
> -L(cross_page):
> movzbl (%rdi, %rdx), %eax
> movzbl (%rsi, %rdx), %ecx
> - testb %al, %al
> - jne L(cross_page_loop)
> - xorl %eax, %eax
> -L(different):
> +#ifdef AS_STRCASECMP
> + movl (%r11,%rax,4), %eax
> + subl (%r11,%rcx,4), %eax
> +#else
> + subl %ecx, %eax
> +#endif
> + jne L(different)
> +#ifdef AS_STRNCMP
> + cmp %rdx, %r10
> + je L(different)
> +#endif
> + test %ecx, %ecx
> + je L(different)
> +
> + movzbl 1(%rdi, %rdx), %eax
> + movzbl 1(%rsi, %rdx), %ecx
> +#ifdef AS_STRCASECMP
> + movl (%r11,%rax,4), %eax
> + subl (%r11,%rcx,4), %eax
> +#else
> subl %ecx, %eax
> +#endif
> + jne L(different)
> +#ifdef AS_STRNCMP
> + lea 1(%rdx), %r9
> + cmp %r9, %r10
> + je L(different)
> +#endif
> + test %ecx, %ecx
> + je L(different)
> +
> + movzbl 2(%rdi, %rdx), %eax
> + movzbl 2(%rsi, %rdx), %ecx
> +#ifdef AS_STRCASECMP
> + movl (%r11,%rax,4), %eax
> + subl (%r11,%rcx,4), %eax
> +#else
> + subl %ecx, %eax
> +#endif
> + jne L(different)
> +#ifdef AS_STRNCMP
> + lea 2(%rdx), %r9
> + cmp %r9, %r10
> + je L(different)
> +#endif
> + test %ecx, %ecx
> + je L(different)
> +
> + movzbl 3(%rdi, %rdx), %eax
> + movzbl 3(%rsi, %rdx), %ecx
> +#ifdef AS_STRCASECMP
> + movl (%r11,%rax,4), %eax
> + subl (%r11,%rcx,4), %eax
> +#else
> + subl %ecx, %eax
> +#endif
> + jne L(different)
> +#ifdef AS_STRNCMP
> + lea 3(%rdx), %r9
> + cmp %r9, %r10
> + je L(different)
> +#endif
> + test %ecx, %ecx
> + je L(different)
> +
> + add $4, %edx
> + cmp $64, %edx
> + je L(main_loop_header)
> + jmp L(cross_page_loop)
> +L(different):
> ret
> -END (__strcmp_sse2_unaligned)
> +END (STRCMP)
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> deleted file mode 100644
> index 4dff0a5..0000000
> --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> +++ /dev/null
> @@ -1,1792 +0,0 @@
> -/* strcmp with SSE4.2
> - Copyright (C) 2009-2015 Free Software Foundation, Inc.
> - Contributed by Intel Corporation.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <http://www.gnu.org/licenses/>. */
> -
> -
> -/* We use 0x1a:
> - _SIDD_SBYTE_OPS
> - | _SIDD_CMP_EQUAL_EACH
> - | _SIDD_NEGATIVE_POLARITY
> - | _SIDD_LEAST_SIGNIFICANT
> - on pcmpistri to find out if two 16byte data elements are the same
> - and the offset of the first different byte. There are 4 cases:
> -
> - 1. Both 16byte data elements are valid and identical.
> - 2. Both 16byte data elements have EOS and identical.
> - 3. Both 16byte data elements are valid and they differ at offset X.
> - 4. At least one 16byte data element has EOS at offset X. Two 16byte
> - data elements must differ at or before offset X.
> -
> - Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
> -
> - case ECX CFlag ZFlag SFlag
> - 1 16 0 0 0
> - 2 16 0 1 1
> - 3 X 1 0 0
> - 4 0 <= X 1 0/1 0/1
> -
> - We exit from the loop for cases 2, 3 and 4 with jbe which branches
> - when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
> - case 2. */
> -
> - /* Put all SSE 4.2 functions together. */
> - .section .text.SECTION,"ax",@progbits
> - .align 16
> - .type STRCMP_SSE42, @function
> - .globl STRCMP_SSE42
> - .hidden STRCMP_SSE42
> -#ifdef USE_AS_STRCASECMP_L
> -ENTRY (GLABEL(__strcasecmp))
> - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> - mov %fs:(%rax),%RDX_LP
> -
> - // XXX 5 byte should be before the function
> - /* 5-byte NOP. */
> - .byte 0x0f,0x1f,0x44,0x00,0x00
> -END (GLABEL(__strcasecmp))
> - /* FALLTHROUGH to strcasecmp_l. */
> -#endif
> -#ifdef USE_AS_STRNCASECMP_L
> -ENTRY (GLABEL(__strncasecmp))
> - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> - mov %fs:(%rax),%RCX_LP
> -
> - // XXX 5 byte should be before the function
> - /* 5-byte NOP. */
> - .byte 0x0f,0x1f,0x44,0x00,0x00
> -END (GLABEL(__strncasecmp))
> - /* FALLTHROUGH to strncasecmp_l. */
> -#endif
> -
> -
> -#ifdef USE_AVX
> -# define movdqa vmovdqa
> -# define movdqu vmovdqu
> -# define pmovmskb vpmovmskb
> -# define pcmpistri vpcmpistri
> -# define psubb vpsubb
> -# define pcmpeqb vpcmpeqb
> -# define psrldq vpsrldq
> -# define pslldq vpslldq
> -# define palignr vpalignr
> -# define pxor vpxor
> -# define D(arg) arg, arg
> -#else
> -# define D(arg) arg
> -#endif
> -
> -STRCMP_SSE42:
> - cfi_startproc
> - CALL_MCOUNT
> -
> -/*
> - * This implementation uses SSE to compare up to 16 bytes at a time.
> - */
> -#ifdef USE_AS_STRCASECMP_L
> - /* We have to fall back on the C implementation for locales
> - with encodings not matching ASCII for single bytes. */
> -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
> -# else
> - mov (%rdx), %RAX_LP
> -# endif
> - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
> - jne __strcasecmp_l_nonascii
> -#endif
> -#ifdef USE_AS_STRNCASECMP_L
> - /* We have to fall back on the C implementation for locales
> - with encodings not matching ASCII for single bytes. */
> -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
> -# else
> - mov (%rcx), %RAX_LP
> -# endif
> - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
> - jne __strncasecmp_l_nonascii
> -#endif
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - test %rdx, %rdx
> - je LABEL(strcmp_exitz)
> - cmp $1, %rdx
> - je LABEL(Byte0)
> - mov %rdx, %r11
> -#endif
> - mov %esi, %ecx
> - mov %edi, %eax
> -/* Use 64bit AND here to avoid long NOP padding. */
> - and $0x3f, %rcx /* rsi alignment in cache line */
> - and $0x3f, %rax /* rdi alignment in cache line */
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - .section .rodata.cst16,"aM",@progbits,16
> - .align 16
> -LABEL(belowupper):
> - .quad 0x4040404040404040
> - .quad 0x4040404040404040
> -LABEL(topupper):
> -# ifdef USE_AVX
> - .quad 0x5a5a5a5a5a5a5a5a
> - .quad 0x5a5a5a5a5a5a5a5a
> -# else
> - .quad 0x5b5b5b5b5b5b5b5b
> - .quad 0x5b5b5b5b5b5b5b5b
> -# endif
> -LABEL(touppermask):
> - .quad 0x2020202020202020
> - .quad 0x2020202020202020
> - .previous
> - movdqa LABEL(belowupper)(%rip), %xmm4
> -# define UCLOW_reg %xmm4
> - movdqa LABEL(topupper)(%rip), %xmm5
> -# define UCHIGH_reg %xmm5
> - movdqa LABEL(touppermask)(%rip), %xmm6
> -# define LCQWORD_reg %xmm6
> -#endif
> - cmp $0x30, %ecx
> - ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
> - cmp $0x30, %eax
> - ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
> - movdqu (%rdi), %xmm1
> - movdqu (%rsi), %xmm2
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# ifdef USE_AVX
> -# define TOLOWER(reg1, reg2) \
> - vpcmpgtb UCLOW_reg, reg1, %xmm7; \
> - vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
> - vpcmpgtb UCLOW_reg, reg2, %xmm9; \
> - vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
> - vpandn %xmm7, %xmm8, %xmm8; \
> - vpandn %xmm9, %xmm10, %xmm10; \
> - vpand LCQWORD_reg, %xmm8, %xmm8; \
> - vpand LCQWORD_reg, %xmm10, %xmm10; \
> - vpor reg1, %xmm8, reg1; \
> - vpor reg2, %xmm10, reg2
> -# else
> -# define TOLOWER(reg1, reg2) \
> - movdqa reg1, %xmm7; \
> - movdqa UCHIGH_reg, %xmm8; \
> - movdqa reg2, %xmm9; \
> - movdqa UCHIGH_reg, %xmm10; \
> - pcmpgtb UCLOW_reg, %xmm7; \
> - pcmpgtb reg1, %xmm8; \
> - pcmpgtb UCLOW_reg, %xmm9; \
> - pcmpgtb reg2, %xmm10; \
> - pand %xmm8, %xmm7; \
> - pand %xmm10, %xmm9; \
> - pand LCQWORD_reg, %xmm7; \
> - pand LCQWORD_reg, %xmm9; \
> - por %xmm7, reg1; \
> - por %xmm9, reg2
> -# endif
> - TOLOWER (%xmm1, %xmm2)
> -#else
> -# define TOLOWER(reg1, reg2)
> -#endif
> - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
> - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
> - pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
> - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
> - jnz LABEL(less16bytes)/* If not, find different value or null char */
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)/* finish comparison */
> -#endif
> - add $16, %rsi /* prepare to search next 16 bytes */
> - add $16, %rdi /* prepare to search next 16 bytes */
> -
> - /*
> - * Determine source and destination string offsets from 16-byte
> - * alignment. Use relative offset difference between the two to
> - * determine which case below to use.
> - */
> - .p2align 4
> -LABEL(crosscache):
> - and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
> - and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
> - mov $0xffff, %edx /* for equivalent offset */
> - xor %r8d, %r8d
> - and $0xf, %ecx /* offset of rsi */
> - and $0xf, %eax /* offset of rdi */
> - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
> - cmp %eax, %ecx
> - je LABEL(ashr_0) /* rsi and rdi relative offset same */
> - ja LABEL(bigger)
> - mov %edx, %r8d /* r8d is offset flag for exit tail */
> - xchg %ecx, %eax
> - xchg %rsi, %rdi
> -LABEL(bigger):
> - movdqa (%rdi), %xmm2
> - movdqa (%rsi), %xmm1
> - lea 15(%rax), %r9
> - sub %rcx, %r9
> - lea LABEL(unaligned_table)(%rip), %r10
> - movslq (%r10, %r9,4), %r9
> - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
> - lea (%r10, %r9), %r10
> - jmp *%r10 /* jump to corresponding case */
> -
> -/*
> - * The following cases will be handled by ashr_0
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(0~15) n(0~15) 15(15+ n-n) ashr_0
> - */
> - .p2align 4
> -LABEL(ashr_0):
> -
> - movdqa (%rsi), %xmm1
> - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
> -#else
> - movdqa (%rdi), %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
> -#endif
> - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
> - pmovmskb %xmm1, %r9d
> - shr %cl, %edx /* adjust 0xffff for offset */
> - shr %cl, %r9d /* adjust for 16-byte offset */
> - sub %r9d, %edx
> - /*
> - * edx must be the same with r9d if in left byte (16-rcx) is equal to
> - * the start from (16-rax) and no null char was seen.
> - */
> - jne LABEL(less32bytes) /* mismatch or null char */
> - UPDATE_STRNCMP_COUNTER
> - mov $16, %rcx
> - mov $16, %r9
> -
> - /*
> - * Now both strings are aligned at 16-byte boundary. Loop over strings
> - * checking 32-bytes per iteration.
> - */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> - .p2align 4
> -LABEL(ashr_0_use):
> - movdqa (%rdi,%rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - lea 16(%rdx), %rdx
> - jbe LABEL(ashr_0_exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - movdqa (%rdi,%rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - lea 16(%rdx), %rdx
> - jbe LABEL(ashr_0_exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - jmp LABEL(ashr_0_use)
> -
> -
> - .p2align 4
> -LABEL(ashr_0_exit_use):
> - jnc LABEL(strcmp_exitz)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub %rcx, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - lea -16(%rdx, %rcx), %rcx
> - movzbl (%rdi, %rcx), %eax
> - movzbl (%rsi, %rcx), %edx
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
> - movl (%rcx,%rax,4), %eax
> - movl (%rcx,%rdx,4), %edx
> -#endif
> - sub %edx, %eax
> - ret
> -
> -
> -
> -/*
> - * The following cases will be handled by ashr_1
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(15) n -15 0(15 +(n-15) - n) ashr_1
> - */
> - .p2align 4
> -LABEL(ashr_1):
> - pslldq $15, D(%xmm2) /* shift first string to align with second */
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
> - psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx /* adjust 0xffff for offset */
> - shr %cl, %r9d /* adjust for 16-byte offset */
> - sub %r9d, %edx
> - jnz LABEL(less32bytes) /* mismatch or null char seen */
> - movdqa (%rdi), %xmm3
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads*/
> - mov $1, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 1(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_1_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_1_use)
> -
> -LABEL(nibble_ashr_1_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $1, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_1_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $1, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_1_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_1_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $1, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $14, %ecx
> - ja LABEL(nibble_ashr_1_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_2
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
> - */
> - .p2align 4
> -LABEL(ashr_2):
> - pslldq $14, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $2, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 2(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_2_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_2_use)
> -
> -LABEL(nibble_ashr_2_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $2, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_2_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $2, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_2_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_2_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $2, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $13, %ecx
> - ja LABEL(nibble_ashr_2_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_3
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
> - */
> - .p2align 4
> -LABEL(ashr_3):
> - pslldq $13, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $3, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 3(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> -LABEL(loop_ashr_3_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_3_use)
> -
> -LABEL(nibble_ashr_3_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $3, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_3_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $3, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_3_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_3_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $3, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $12, %ecx
> - ja LABEL(nibble_ashr_3_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_4
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
> - */
> - .p2align 4
> -LABEL(ashr_4):
> - pslldq $12, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $4, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 4(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_4_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_4_use)
> -
> -LABEL(nibble_ashr_4_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $4, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_4_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $4, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_4_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_4_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $4, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $11, %ecx
> - ja LABEL(nibble_ashr_4_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_5
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
> - */
> - .p2align 4
> -LABEL(ashr_5):
> - pslldq $11, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $5, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 5(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_5_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_5_use)
> -
> -LABEL(nibble_ashr_5_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $5, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_5_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> -
> - palignr $5, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_5_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_5_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $5, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $10, %ecx
> - ja LABEL(nibble_ashr_5_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_6
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
> - */
> - .p2align 4
> -LABEL(ashr_6):
> - pslldq $10, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $6, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 6(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_6_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_6_use)
> -
> -LABEL(nibble_ashr_6_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $6, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_6_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $6, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_6_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_6_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $6, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $9, %ecx
> - ja LABEL(nibble_ashr_6_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_7
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
> - */
> - .p2align 4
> -LABEL(ashr_7):
> - pslldq $9, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $7, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 7(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_7_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_7_use)
> -
> -LABEL(nibble_ashr_7_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $7, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_7_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $7, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_7_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_7_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $7, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $8, %ecx
> - ja LABEL(nibble_ashr_7_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_8
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
> - */
> - .p2align 4
> -LABEL(ashr_8):
> - pslldq $8, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $8, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 8(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_8_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_8_use)
> -
> -LABEL(nibble_ashr_8_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $8, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_8_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $8, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_8_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_8_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $8, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $7, %ecx
> - ja LABEL(nibble_ashr_8_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_9
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
> - */
> - .p2align 4
> -LABEL(ashr_9):
> - pslldq $7, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $9, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 9(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_9_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_9_use)
> -
> -LABEL(nibble_ashr_9_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> -
> - palignr $9, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_9_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $9, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_9_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_9_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $9, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $6, %ecx
> - ja LABEL(nibble_ashr_9_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_10
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
> - */
> - .p2align 4
> -LABEL(ashr_10):
> - pslldq $6, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $10, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 10(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_10_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_10_use)
> -
> -LABEL(nibble_ashr_10_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $10, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_10_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $10, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_10_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_10_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $10, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $5, %ecx
> - ja LABEL(nibble_ashr_10_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_11
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
> - */
> - .p2align 4
> -LABEL(ashr_11):
> - pslldq $5, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $11, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 11(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_11_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_11_use)
> -
> -LABEL(nibble_ashr_11_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $11, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_11_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $11, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_11_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_11_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $11, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $4, %ecx
> - ja LABEL(nibble_ashr_11_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_12
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
> - */
> - .p2align 4
> -LABEL(ashr_12):
> - pslldq $4, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $12, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 12(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_12_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_12_use)
> -
> -LABEL(nibble_ashr_12_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $12, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_12_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $12, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_12_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_12_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $12, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $3, %ecx
> - ja LABEL(nibble_ashr_12_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_13
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
> - */
> - .p2align 4
> -LABEL(ashr_13):
> - pslldq $3, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $13, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 13(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_13_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_13_use)
> -
> -LABEL(nibble_ashr_13_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $13, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_13_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $13, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_13_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_13_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $13, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $2, %ecx
> - ja LABEL(nibble_ashr_13_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_14
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
> - */
> - .p2align 4
> -LABEL(ashr_14):
> - pslldq $2, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $14, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 14(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_14_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_14_use)
> -
> -LABEL(nibble_ashr_14_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $14, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_14_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $14, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_14_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_14_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $14, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $1, %ecx
> - ja LABEL(nibble_ashr_14_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_15
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
> - */
> - .p2align 4
> -LABEL(ashr_15):
> - pslldq $1, D(%xmm2)
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, D(%xmm2)
> - psubb %xmm0, D(%xmm2)
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> -
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $15, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 15(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> -
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_15_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_15_use)
> -
> -LABEL(nibble_ashr_15_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $15, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_15_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $15, -16(%rdi, %rdx), D(%xmm0)
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_15_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_15_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $15, D(%xmm0)
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $0, %ecx
> - ja LABEL(nibble_ashr_15_restart_use)
> -
> -LABEL(nibble_ashr_exit_use):
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - .p2align 4
> -LABEL(exit_use):
> - jnc LABEL(strcmp_exitz)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub %rcx, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add %rcx, %rdx
> - lea -16(%rdi, %r9), %rdi
> - movzbl (%rdi, %rdx), %eax
> - movzbl (%rsi, %rdx), %edx
> - test %r8d, %r8d
> - jz LABEL(ret_use)
> - xchg %eax, %edx
> -LABEL(ret_use):
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
> - movl (%rcx,%rdx,4), %edx
> - movl (%rcx,%rax,4), %eax
> -#endif
> -
> - sub %edx, %eax
> - ret
> -
> -LABEL(less32bytes):
> - lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
> - lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
> - test %r8d, %r8d
> - jz LABEL(ret)
> - xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
> -
> - .p2align 4
> -LABEL(ret):
> -LABEL(less16bytes):
> - bsf %rdx, %rdx /* find and store bit index in %rdx */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub %rdx, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - movzbl (%rsi, %rdx), %ecx
> - movzbl (%rdi, %rdx), %eax
> -
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
> - movl (%rdx,%rcx,4), %ecx
> - movl (%rdx,%rax,4), %eax
> -#endif
> -
> - sub %ecx, %eax
> - ret
> -
> -LABEL(strcmp_exitz):
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> - // XXX Same as code above
> -LABEL(Byte0):
> - movzx (%rsi), %ecx
> - movzx (%rdi), %eax
> -
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
> - movl (%rdx,%rcx,4), %ecx
> - movl (%rdx,%rax,4), %eax
> -#endif
> -
> - sub %ecx, %eax
> - ret
> - cfi_endproc
> - .size STRCMP_SSE42, .-STRCMP_SSE42
> -
> -#undef UCLOW_reg
> -#undef UCHIGH_reg
> -#undef LCQWORD_reg
> -#undef TOLOWER
> -
> - /* Put all SSE 4.2 functions together. */
> - .section .rodata.SECTION,"a",@progbits
> - .p2align 3
> -LABEL(unaligned_table):
> - .int LABEL(ashr_1) - LABEL(unaligned_table)
> - .int LABEL(ashr_2) - LABEL(unaligned_table)
> - .int LABEL(ashr_3) - LABEL(unaligned_table)
> - .int LABEL(ashr_4) - LABEL(unaligned_table)
> - .int LABEL(ashr_5) - LABEL(unaligned_table)
> - .int LABEL(ashr_6) - LABEL(unaligned_table)
> - .int LABEL(ashr_7) - LABEL(unaligned_table)
> - .int LABEL(ashr_8) - LABEL(unaligned_table)
> - .int LABEL(ashr_9) - LABEL(unaligned_table)
> - .int LABEL(ashr_10) - LABEL(unaligned_table)
> - .int LABEL(ashr_11) - LABEL(unaligned_table)
> - .int LABEL(ashr_12) - LABEL(unaligned_table)
> - .int LABEL(ashr_13) - LABEL(unaligned_table)
> - .int LABEL(ashr_14) - LABEL(unaligned_table)
> - .int LABEL(ashr_15) - LABEL(unaligned_table)
> - .int LABEL(ashr_0) - LABEL(unaligned_table)
> -
> -#undef LABEL
> -#undef GLABEL
> -#undef SECTION
> -#undef movdqa
> -#undef movdqu
> -#undef pmovmskb
> -#undef pcmpistri
> -#undef psubb
> -#undef pcmpeqb
> -#undef psrldq
> -#undef pslldq
> -#undef palignr
> -#undef pxor
> -#undef D
> diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
> index f50f26c..63aa62e 100644
> --- a/sysdeps/x86_64/multiarch/strcmp.S
> +++ b/sysdeps/x86_64/multiarch/strcmp.S
> @@ -31,8 +31,8 @@
> test %r9, %r9; \
> je LABEL(strcmp_exitz); \
> mov %r9, %r11
> -
> -# define STRCMP_SSE42 __strncmp_sse42
> +# define STRCMP_AVX2 __strncmp_avx2
> +# define STRCMP_SSE2_UNALIGNED __strncmp_sse2_unaligned
> # define STRCMP_SSSE3 __strncmp_ssse3
> # define STRCMP_SSE2 __strncmp_sse2
> # define __GI_STRCMP __GI_strncmp
> @@ -40,9 +40,8 @@
> # include "locale-defines.h"
>
> # define UPDATE_STRNCMP_COUNTER
> -
> -# define STRCMP_AVX __strcasecmp_l_avx
> -# define STRCMP_SSE42 __strcasecmp_l_sse42
> +# define STRCMP_AVX2 __strcasecmp_avx2_l
> +# define STRCMP_SSE2_UNALIGNED __strcasecmp_sse2_unaligned_l
> # define STRCMP_SSSE3 __strcasecmp_l_ssse3
> # define STRCMP_SSE2 __strcasecmp_l_sse2
> # define __GI_STRCMP __GI___strcasecmp_l
> @@ -60,8 +59,8 @@
> je LABEL(strcmp_exitz); \
> mov %r9, %r11
>
> -# define STRCMP_AVX __strncasecmp_l_avx
> -# define STRCMP_SSE42 __strncasecmp_l_sse42
> +# define STRCMP_AVX2 __strncasecmp_avx2_l
> +# define STRCMP_SSE2_UNALIGNED __strncasecmp_sse2_unaligned_l
> # define STRCMP_SSSE3 __strncasecmp_l_ssse3
> # define STRCMP_SSE2 __strncasecmp_l_sse2
> # define __GI_STRCMP __GI___strncasecmp_l
> @@ -69,8 +68,9 @@
> # define USE_AS_STRCMP
> # define UPDATE_STRNCMP_COUNTER
> # ifndef STRCMP
> +# define STRCMP_AVX2 __strcmp_avx2
> +# define STRCMP_SSE2_UNALIGNED __strcmp_sse2_unaligned
> # define STRCMP strcmp
> -# define STRCMP_SSE42 __strcmp_sse42
> # define STRCMP_SSSE3 __strcmp_ssse3
> # define STRCMP_SSE2 __strcmp_sse2
> # define __GI_STRCMP __GI_strcmp
> @@ -89,17 +89,16 @@ ENTRY(STRCMP)
> jne 1f
> call __init_cpu_features
> 1:
> -#ifdef USE_AS_STRCMP
> - leaq __strcmp_sse2_unaligned(%rip), %rax
> +# ifdef HAVE_AVX2_SUPPORT
> +
> + leaq STRCMP_AVX2(%rip), %rax
> + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
> + jnz 3f
> +# endif
> + leaq STRCMP_SSE2_UNALIGNED(%rip), %rax
> testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
> jnz 3f
> -#else
> - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
> - jnz 2f
> - leaq STRCMP_SSE42(%rip), %rax
> - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> - jnz 3f
> -#endif
> +
> 2: leaq STRCMP_SSSE3(%rip), %rax
> testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> jnz 3f
> @@ -115,21 +114,22 @@ ENTRY(__strcasecmp)
> jne 1f
> call __init_cpu_features
> 1:
> -# ifdef HAVE_AVX_SUPPORT
> - leaq __strcasecmp_avx(%rip), %rax
> - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +# ifdef HAVE_AVX2_SUPPORT
> +
> + leaq __strcasecmp_avx2(%rip), %rax
> + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
> jnz 3f
> # endif
> - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
> - jnz 2f
> - leaq __strcasecmp_sse42(%rip), %rax
> - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> - jnz 3f
> + leaq __strcasecmp_sse2_unaligned(%rip), %rax
> + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
> + jnz 3f
> +
> 2: leaq __strcasecmp_ssse3(%rip), %rax
> testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> jnz 3f
> leaq __strcasecmp_sse2(%rip), %rax
> 3: ret
> +
> END(__strcasecmp)
> weak_alias (__strcasecmp, strcasecmp)
> # endif
> @@ -141,45 +141,26 @@ ENTRY(__strncasecmp)
> jne 1f
> call __init_cpu_features
> 1:
> -# ifdef HAVE_AVX_SUPPORT
> - leaq __strncasecmp_avx(%rip), %rax
> - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +# ifdef HAVE_AVX2_SUPPORT
> +
> + leaq __strncasecmp_avx2(%rip), %rax
> + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
> jnz 3f
> # endif
> - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
> - jnz 2f
> - leaq __strncasecmp_sse42(%rip), %rax
> - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> - jnz 3f
> + leaq __strncasecmp_sse2_unaligned(%rip), %rax
> + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
> + jnz 3f
> +
> 2: leaq __strncasecmp_ssse3(%rip), %rax
> testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> jnz 3f
> leaq __strncasecmp_sse2(%rip), %rax
> 3: ret
> +
> END(__strncasecmp)
> weak_alias (__strncasecmp, strncasecmp)
> # endif
>
> -# undef LABEL
> -# define LABEL(l) .L##l##_sse42
> -# define GLABEL(l) l##_sse42
> -# define SECTION sse4.2
> -# include "strcmp-sse42.S"
> -
> -
> -# ifdef HAVE_AVX_SUPPORT
> -# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# define LABEL(l) .L##l##_avx
> -# define GLABEL(l) l##_avx
> -# define USE_AVX 1
> -# undef STRCMP_SSE42
> -# define STRCMP_SSE42 STRCMP_AVX
> -# define SECTION avx
> -# include "strcmp-sse42.S"
> -# endif
> -# endif
> -
> -
> # undef ENTRY
> # define ENTRY(name) \
> .type STRCMP_SSE2, @function; \
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> new file mode 100644
> index 0000000..809b966
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> @@ -0,0 +1,6 @@
> +#define AS_STRCASECMP
> +#define AS_STRNCMP
> +#define USE_AVX2
> +#define __strncasecmp_sse2_unaligned __strncasecmp_avx2
> +#define STRCMP __strncasecmp_avx2_l
> +#include "strcmp-sse2-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S
> new file mode 100644
> index 0000000..a372ed4
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S
> @@ -0,0 +1,4 @@
> +#define AS_STRCASECMP
> +#define AS_STRNCMP
> +#define STRCMP __strncasecmp_sse2_unaligned_l
> +#include "strcmp-sse2-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
> new file mode 100644
> index 0000000..2d9a032
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
> @@ -0,0 +1,4 @@
> +#define USE_AVX2
> +#define AS_STRNCMP
> +#define STRCMP __strncmp_avx2
> +#include "strcmp-sse2-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S
> new file mode 100644
> index 0000000..7f9a5fd
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S
> @@ -0,0 +1,3 @@
> +#define AS_STRNCMP
> +#define STRCMP __strncmp_sse2_unaligned
> +#include "strcmp-sse2-unaligned.S"
> --
> 1.8.4.rc3
--
Budget cuts forced us to sell all the power cords for the servers.