This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 1/3] Adding strcasecmp/strncasecmp functionality to unaligned strcmp
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 7 Oct 2013 15:06:51 +0200
- Subject: Re: [PATCH 1/3] Adding strcasecmp/strncasecmp functionality to unaligned strcmp
- Authentication-results: sourceware.org; auth=none
- References: <20130913200552 dot GA31992 at domone> <20130913205303 dot GA3620 at domone> <20130916123234 dot GA24928 at domone>
ping
On Mon, Sep 16, 2013 at 02:32:34PM +0200, Ondřej Bílka wrote:
> On Fri, Sep 13, 2013 at 10:53:03PM +0200, Ondřej Bílka wrote:
> > Hi,
> > I also tried to gather data for strcasecmp/strncasecmp and found
> > that they are rarely used on my system.
> >
> Thanks to Andreas I have an implementation ready.
>
> It works by first finding differing characters with the strcmp code, then
> checking whether they differ only in case. As it is likely that these
> characters really are different, performance should be similar to that of
> strcmp. I checked this property on my computer with the following code, and
> the number of case comparisons needed is mostly 1 in my test:
>
> #include <stdio.h>
> int strcasecmp(unsigned char *x,unsigned char *y)
> {
> int casecmp=0;
> int i=0;
> while(1) {
> if (x[i]!=y[i])
> if (tolower(x[i])==tolower(y[i]))
> casecmp++;
> else
> {
> fprintf(stderr,"dif chars %i tolower_needed %i\n", i, casecmp+1);
> return tolower(x[i])-tolower(y[i]);
> }
> if (!x[i])
> {
> fprintf(stderr,"same chars %i tolower_needed %i \n",i, casecmp);
> return 0;
> }
> i++;
> }
> return 0;
> }
>
> The downside of this implementation is that comparing aaaa vs AAAA will be
> slower; as this looks like an unlikely case, we could make this tradeoff.
>
> I added it in a generic way, as I also plan to add an ssse3 loop version,
> which will come in a separate patch.
>
> * sysdeps/x86_64/locale-defines.sym (LOCALE_TOLOWER): Add.
> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
> Add strcasecmp_l-sse2-unaligned.
> * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
> strcasecmp_sse2_unaligned.
> * sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S: New file.
> * sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add strcasecmp
> implementation.
> * sysdeps/x86_64/multiarch/strcmp.S: Update ifunc.
>
> ---
> sysdeps/x86_64/locale-defines.sym | 1 +
> sysdeps/x86_64/multiarch/Makefile | 1 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 2 +
> .../x86_64/multiarch/strcasecmp_l-sse2-unaligned.S | 2 +
> sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 117 +++++++++++++++++++++
> sysdeps/x86_64/multiarch/strcmp.S | 9 +-
> 6 files changed, 127 insertions(+), 5 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
>
> diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym
> index aebff9a..804debb 100644
> --- a/sysdeps/x86_64/locale-defines.sym
> +++ b/sysdeps/x86_64/locale-defines.sym
> @@ -8,4 +8,5 @@ LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales)
> LC_CTYPE
> _NL_CTYPE_NONASCII_CASE
> LOCALE_DATA_VALUES offsetof (struct __locale_data, values)
> +LOCALE_TOLOWER offsetof (struct __locale_struct, __ctype_tolower)
> SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0])
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 5ab950a..551923c 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
> memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
> + strcasecmp_l-sse2-unaligned \
> strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
> strcpy-sse2-unaligned strncpy-sse2-unaligned \
> stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 1a65ac0..40f8895 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -81,6 +81,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __strcasecmp_avx)
> IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
> __strcasecmp_sse42)
> + IFUNC_IMPL_ADD (array, i, strcasecmp, 1,
> + __strcasecmp_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
> __strcasecmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
> new file mode 100644
> index 0000000..62ce37e
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
> @@ -0,0 +1,2 @@
> +#define AS_STRCASECMP
> +#include "strcmp-sse2-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
> index eed8432..c93d2f5 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
> @@ -16,10 +16,33 @@
> License along with the GNU C Library; if not, see
> <http://www.gnu.org/licenses/>. */
>
> +#ifndef NOT_IN_libc
> +
> #include "sysdep.h"
> #define ALIGN(x) .p2align x
>
> +#ifdef AS_STRCASECMP
> +# include "locale-defines.h"
> +
> +#define __strcasecmp_sse2_unaligned strcasecmp_new
> +
> +# define __strcmp_sse2_unaligned __strcasecmp_sse2_unaligned_l
> +ENTRY (__strcasecmp_sse2_unaligned)
> + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> + mov %fs:(%rax), %rdx
> + // XXX 5 byte should be before the function
> + /* 5-byte NOP. */
> + .byte 0x0f,0x1f,0x44,0x00,0x00
> +
> +END (__strcasecmp_sse2_unaligned)
> +
> +#endif
> +
> ENTRY ( __strcmp_sse2_unaligned)
> +
> +#ifdef AS_STRCASECMP
> + mov LOCALE_TOLOWER(%rdx), %r11
> +#endif
> movl %edi, %eax
> xorl %edx, %edx
> pxor %xmm7, %xmm7
> @@ -36,12 +59,16 @@ ENTRY ( __strcmp_sse2_unaligned)
> pmovmskb %xmm0, %eax
> testq %rax, %rax
> je L(next_48_bytes)
> +#ifndef AS_STRCASECMP
> L(return):
> bsfq %rax, %rdx
> movzbl (%rdi, %rdx), %eax
> movzbl (%rsi, %rdx), %edx
> subl %edx, %eax
> ret
> +#else
> + jmp L(caseloop1)
> +#endif
>
> ALIGN (4)
> L(next_48_bytes):
> @@ -85,6 +112,76 @@ L(main_loop_header):
> movq %rcx, %rsi
> jmp L(loop_start)
>
> +#ifdef AS_STRCASECMP
> +L(caseloop1):
> + bsfq %rax, %rdx
> + leaq -1(%rax), %rcx
> + andq %rax, %rcx
> + movzbl (%rdi, %rdx), %eax
> + movzbl (%rsi, %rdx), %edx
> + movl (%r11, %rax, 4), %eax
> + movl (%r11, %rdx, 4), %edx
> + testl %eax, %eax
> + je L(zero1)
> + cmpl %edx, %eax
> + je L(casecnt1)
> +L(zero1):
> + subl %edx, %eax
> + ret
> +L(casecnt1):
> + testq %rcx, %rcx
> + je L(next_48_bytes)
> + movq %rcx, %rax
> + jmp L(caseloop1)
> +
> +L(return):
> +L(caseloop2):
> + bsfq %rax, %rdx
> + leaq -1(%rax), %rcx
> + andq %rax, %rcx
> + movzbl (%rdi, %rdx), %eax
> + movzbl (%rsi, %rdx), %edx
> + movl (%r11, %rax, 4), %eax
> + movl (%r11, %rdx, 4), %edx
> + testl %eax, %eax
> + je L(zero2)
> + cmpl %edx, %eax
> + je L(casecnt2)
> +L(zero2):
> + subl %edx, %eax
> + ret
> +L(casecnt2):
> + testq %rcx, %rcx
> + je L(main_loop_header)
> + movq %rcx, %rax
> + jmp L(caseloop2)
> +
> +L(caseloop3):
> + bsfq %rax, %rdx
> + leaq -1(%rax), %r10
> + andq %rax, %r10
> + movzbl (%rdi, %rdx), %eax
> + movzbl (%rsi, %rdx), %edx
> + movl (%r11, %rax, 4), %eax
> + movl (%r11, %rdx, 4), %edx
> + testl %eax, %eax
> + je L(zero3)
> + cmpl %edx, %eax
> + je L(casecnt3)
> +L(zero3):
> + subl %edx, %eax
> + ret
> +L(casecnt3):
> + movq %rdi, %rax
> + movq %rsi, %rdx
> + testq %r10, %r10
> + je L(back_to_loop)
> + movq %r10, %rax
> + jmp L(caseloop3)
> +
> +#endif
> +
> +
> ALIGN (4)
> L(loop):
> addq $64, %rax
> @@ -135,11 +232,18 @@ L(back_to_loop):
> orq %rdi, %rcx
> salq $48, %rsi
> orq %rsi, %rcx
> +#ifndef AS_STRCASECMP
> bsfq %rcx, %rcx
> movzbl (%rax, %rcx), %eax
> movzbl (%rdx, %rcx), %edx
> subl %edx, %eax
> ret
> +#else
> + movq %rax, %rdi
> + movq %rdx, %rsi
> + movq %rcx, %rax
> + jmp L(return)
> +#endif
>
> ALIGN (4)
> L(loop_cross_page):
> @@ -185,11 +289,19 @@ L(loop_cross_page):
> shrq %cl, %rdi
> test %rdi, %rdi
> je L(back_to_loop)
> +#ifndef AS_STRCASECMP
> bsfq %rdi, %rcx
> movzbl (%rax, %rcx), %eax
> movzbl (%rdx, %rcx), %edx
> subl %edx, %eax
> ret
> +#else
> + movq %rdi, %r10
> + movq %rax, %rdi
> + movq %rdx, %rsi
> + movq %r10, %rax
> + jmp L(caseloop3)
> +#endif
>
> ALIGN (4)
> L(cross_page_loop):
> @@ -201,6 +313,10 @@ L(cross_page_loop):
> L(cross_page):
> movzbl (%rdi, %rdx), %eax
> movzbl (%rsi, %rdx), %ecx
> +#ifdef AS_STRCASECMP
> + movl (%r11, %rax, 4), %eax
> + movl (%r11, %rcx, 4), %ecx
> +#endif
> testb %al, %al
> jne L(cross_page_loop)
> xorl %eax, %eax
> @@ -208,3 +324,4 @@ L(different):
> subl %ecx, %eax
> ret
> END (__strcmp_sse2_unaligned)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
> index c5dcd1a..818aa31 100644
> --- a/sysdeps/x86_64/multiarch/strcmp.S
> +++ b/sysdeps/x86_64/multiarch/strcmp.S
> @@ -115,16 +115,15 @@ ENTRY(__strcasecmp)
> jne 1f
> call __init_cpu_features
> 1:
> + leaq __strcasecmp_sse2_unaligned(%rip), %rax
> + testl $bit_Fast_Unaligned_Load, __cpu_features+CPUID_OFFSET+index_Fast_Unaligned_Load(%rip)
> + jnz 3f
> +
> # ifdef HAVE_AVX_SUPPORT
> leaq __strcasecmp_avx(%rip), %rax
> testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> jnz 3f
> # endif
> - testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
> - jnz 2f
> - leaq __strcasecmp_sse42(%rip), %rax
> - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> - jnz 3f
> 2: leaq __strcasecmp_ssse3(%rip), %rax
> testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> jnz 3f
> --
> 1.8.3.2
--
Too much radiation coming from the soil.