
[PING][PATCH] Improve memcmp with unaligned loads.


ping
On Wed, Sep 18, 2013 at 11:58:08PM +0200, Ondřej Bílka wrote:
> Hi,
> 
> The sse4_1 version of memcmp was already optimized relatively well, and it
> took me quite some time to come up with an improvement.  When memcmp does
> its checks, we need to integrate the exit caused by a mismatch with the
> exit caused by exceeding the size.  I tried several approaches that were
> slower than the current memcmp.
> 
> The solution came from a somewhat unexpected direction: when I tried to
> implement strcmp I noticed that the same idea applies to memcmp as well,
> and it gains us 10% in real workloads.
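
(A minimal standalone illustration, not part of the patch, of how the size
limit can be folded into the mismatch test: compute a bitmask with one bit
per differing byte and AND it with a mask covering the first n bit
positions, so a single branch handles both the "bytes differ" and the
"size exceeded" exits.  The byte loop here is a stand-in for the vectorized
comparison used in the skeleton below.)

#include <stdio.h>
#include <strings.h>

int
main (void)
{
  const unsigned char a[] = "abcdefgh", b[] = "abcdefgX";
  unsigned long n = 7;          /* Compare only the first 7 bytes.  */
  unsigned long diff = 0;

  /* Stand-in for the vector compare: bit i is set when a[i] != b[i].  */
  for (int i = 0; i < 8; i++)
    diff |= (unsigned long) (a[i] != b[i]) << i;

  unsigned long in_range = (2UL << (n - 1)) - 1;   /* Bits 0 .. n-1.  */
  if (diff & in_range)
    printf ("mismatch at byte %d\n", ffs ((int) (diff & in_range)) - 1);
  else
    printf ("equal in the first %lu bytes\n", n);
  return 0;
}

(The only mismatch is at index 7, outside the 7-byte range, so the masked
test reports equality without a separate length check.)
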
> 
> There are a few optimizations that I left for future patches.
> 
> The first one is wmemcmp support, which is straightforward to add, but I
> did not have time to do so.
> 
> The second is whether a separate check that the first byte matches helps
> us (strcmp is the same case).  For that I would need to collect more data,
> as it depends on the profile.
> 
> The code is generated from the following C skeleton; there is some room
> for improvement by having better control flow.
> 
> int memcmp (unsigned char *a, unsigned char *b, unsigned long n)
> {
>   unsigned long i;
>  finish:
>   if (!n)
>     return 0;
>   /* if (a[0] != b[0]) return a[0] - b[0]; */
>   int or1 = ((unsigned long) a) & 4095;
>   int or2 = ((unsigned long) b) & 4095;
>   if ((or1 <= 4096 - 64) & (or2 <= 4096 - 64))
>     {
>       /* Neither 64-byte read crosses a page boundary.  */
>       unsigned long xm = (2UL << (n - 1)) - 1; /* Bits 0 .. n-1; written as
>                                                   2 << (n - 1) so that
>                                                   n == 64 gives all ones.  */
> 
>       /* Check the first 16 bytes and return a bitmask with 1 on positions
>          where the bytes differ.  */
>       unsigned long ma = get_diff16 (a, b);
>       if (ma & xm)
>         {
>           int f = ffsl (ma & xm) - 1; /* ffsl is 1-based.  */
>           return a[f] - b[f];
>         }
>       if (n <= 16) /* Profiling has shown that sizes of at most 16 bytes
>                       that compare equal are relatively common, and this
>                       check improves performance by around 10%.  */
>         return 0;
>       ma = ma | get_diff48 (a + 16, b + 16);
>       if (ma & xm)
>         {
>           int f = ffsl (ma & xm) - 1;
>           return a[f] - b[f];
>         }
>       if (n <= 64)
>         return 0;
>       if (ma)
>         {
>           int f = ffsl (ma) - 1;
>           return a[f] - b[f];
>         }
>     }
>   else
>     {
>       for (i = 0; i <= 64; i++)
>         {
>           if (i >= n)
>             return 0;
>           if (a[i] != b[i])
>             return a[i] - b[i];
>         }
>     }
>   unsigned char *a_al = ALIGN_DOWN (a + 64, 64);
>   int shift = a_al - a;
>   a += shift;
>   b += shift;
>   n -= shift; /* Possible to replace with a counter.  */
>   while (1)
>     {
>       if (n <= 64)
>         goto finish;
>       if (get_diff_loop64 (a, b)) /* Nonzero if any of the 64 bytes differ.  */
>         goto finish;
>       a += 64;
>       b += 64;
>       n -= 64;
>     }
> }
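
(The helpers get_diff16, get_diff48 and get_diff_loop64 are left abstract in
the skeleton.  As a sketch only, not part of the patch, get_diff16 could be
rendered with SSE2 intrinsics as follows, assuming bit i of the result is
set when a[i] != b[i], matching the pcmpeqb/pmovmskb sequence in the
assembly below.)

#include <emmintrin.h>

/* Return a 16-bit mask (in an unsigned long) with bit i set when
   a[i] != b[i], for the first 16 bytes.  */
static unsigned long
get_diff16 (const unsigned char *a, const unsigned char *b)
{
  __m128i va = _mm_loadu_si128 ((const __m128i *) a);
  __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
  __m128i eq = _mm_cmpeq_epi8 (va, vb);                    /* 0xff where bytes are equal.  */
  __m128i ne = _mm_cmpeq_epi8 (eq, _mm_setzero_si128 ());  /* 0xff where bytes differ.  */
  return (unsigned long) _mm_movemask_epi8 (ne);
}

(get_diff48 would repeat this for bytes 16..63, shifting each 16-bit mask
into bits 16..63 before ORing them together, and get_diff_loop64 only needs
to say whether any of the 64 bytes differ, which the assembly does by
folding four pcmpeqb results with pminub followed by a single
pcmpeqb/pmovmskb.)
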
> 
> Passes the testsuite.  OK to commit?
> 
> 	* sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S: New file.
> 	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
> 	memcmp-sse2-unaligned.
> 	* sysdeps/x86_64/multiarch/memcmp.S: Add __memcmp_sse2_unaligned
> 	ifunc selection.
> 	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
> 	__memcmp_sse2_unaligned.
> 
> ---
>  sysdeps/x86_64/multiarch/Makefile                |   2 +-
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c       |   1 +
>  sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S | 190 +++++++++++++++++++++++
>  sysdeps/x86_64/multiarch/memcmp.S                |   4 +-
>  4 files changed, 194 insertions(+), 3 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
> 
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 5ab950a..44976c7 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -8,7 +8,7 @@ ifeq ($(subdir),string)
>  
>  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>  		   strcmp-sse2-unaligned strncmp-ssse3 \
> -		   strend-sse4 memcmp-sse4 memcpy-ssse3 \
> +		   strend-sse4 memcmp-sse2-unaligned memcpy-ssse3 \
>  		   memcpy-sse2-unaligned mempcpy-ssse3 \
>  		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>  		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 1a65ac0..a6d0c11 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -42,6 +42,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
>  			      __memcmp_sse4_1)
>  	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
> +	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned)
>  	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>  
>    /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
> diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
> new file mode 100644
> index 0000000..8d11b47
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
> @@ -0,0 +1,190 @@
> +/* memcmp with SSE2 and unaligned loads
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef NOT_IN_libc
> +
> +# include <sysdep.h>
> +
> +# ifndef MEMCMP
> +#  define MEMCMP	__memcmp_sse2_unaligned
> +# endif
> +
> +# ifndef ALIGN
> +#  define ALIGN(n)	.p2align n
> +# endif
> +
> +ENTRY (MEMCMP)
> +	testq	%rdx, %rdx
> +	je	L(return_zero)
> +	pxor	%xmm4, %xmm4
> +L(handle_end):
> +	movl	%esi, %eax
> +	andl	$4095, %eax
> +	cmpl	$4032, %eax
> +	jg	L(cross_page)
> +	movl	%edi, %eax
> +	andl	$4095, %eax
> +	cmpl	$4032, %eax
> +	jg	L(cross_page)
> +	movdqu	(%rdi), %xmm0
> +	lea	-1(%edx), %ecx
> +	movl	$2, %eax
> +	movdqu	(%rsi), %xmm1
> +	salq	%cl, %rax
> +	leaq	-1(%rax), %rcx
> +	pcmpeqb	%xmm1, %xmm0
> +	pcmpeqb	%xmm4, %xmm0
> +	pmovmskb	%xmm0, %r8d
> +	movq	%r8, %rax
> +	andq	%rcx, %rax
> +	jne	L(different)
> +	cmpq	$16, %rdx
> +	jbe	L(return_zero)
> +	movdqu	16(%rdi), %xmm2
> +	movdqu	16(%rsi), %xmm6
> +	movdqu	32(%rdi), %xmm1
> +	pcmpeqb	%xmm6, %xmm2
> +	movdqu	32(%rsi), %xmm5
> +	pcmpeqb	%xmm4, %xmm2
> +	pcmpeqb	%xmm5, %xmm1
> +	movdqu	48(%rdi), %xmm0
> +	pmovmskb	%xmm2, %eax
> +	movdqu	48(%rsi), %xmm3
> +	pcmpeqb	%xmm4, %xmm1
> +	pmovmskb	%xmm1, %r9d
> +	salq	$16, %rax
> +	pcmpeqb	%xmm3, %xmm0
> +	salq	$32, %r9
> +	pcmpeqb	%xmm4, %xmm0
> +	orq	%r9, %rax
> +	orq	%r8, %rax
> +	pmovmskb	%xmm0, %r8d
> +	salq	$48, %r8
> +	orq	%r8, %rax
> +	andq	%rax, %rcx
> +	jne	L(different2)
> +	cmpq	$64, %rdx
> +	jbe	L(return_zero)
> +	testq	%rax, %rax
> +	jne	L(different)
> +L(align_loop):
> +	leaq	64(%rdi), %rax
> +	andq	$-64, %rax
> +	subq	%rdi, %rax
> +	subq	%rax, %rdx
> +	addq	%rax, %rdi
> +	addq	%rax, %rsi
> +	cmpq	$64, %rdx
> +	ja	L(loop_start)
> +	testq	%rdx, %rdx
> +	jne	L(handle_end)
> +	xorl	%eax, %eax
> +	ret
> +
> +
> +	ALIGN (4)
> +L(different):
> +	bsfq	%rax, %rdx
> +	movzbl	(%rdi,%rdx), %eax
> +	movzbl	(%rsi,%rdx), %edx
> +	subl	%edx, %eax
> +	ret
> +L(different2):
> +	bsfq	%rcx, %rdx
> +	movzbl	(%rdi,%rdx), %eax
> +	movzbl	(%rsi,%rdx), %edx
> +	subl	%edx, %eax
> +	ret
> +
> +	ALIGN (4)
> +L(loop):
> +	subq	$64, %rdx
> +	addq	$64, %rdi
> +	addq	$64, %rsi
> +	cmpq	$64, %rdx
> +	jbe	L(less_64_bytes)
> +L(loop_start):
> +	movdqu	(%rsi), %xmm0
> +	movdqu	16(%rsi), %xmm3
> +	pcmpeqb	(%rdi), %xmm0
> +	movdqu	32(%rsi), %xmm2
> +	pcmpeqb	16(%rdi), %xmm3
> +	movdqu	48(%rsi), %xmm1
> +	pcmpeqb	32(%rdi), %xmm2
> +	pminub	%xmm3, %xmm0
> +	pcmpeqb	48(%rdi), %xmm1
> +	pminub	%xmm2, %xmm0
> +	pminub	%xmm1, %xmm0
> +	pcmpeqb	%xmm4, %xmm0
> +	pmovmskb	%xmm0, %eax
> +	testl	%eax, %eax
> +	je	L(loop)
> +	jmp	L(handle_end)
> +
> +	ALIGN (4)
> +L(less_64_bytes):
> +	testq	%rdx, %rdx
> +	jne	L(handle_end)
> +	xorl	%eax, %eax
> +	ret
> +
> +
> +
> +	/* Byte by byte loop; this should be fast enough, as we expect a
> +           mismatch in the first 5 bytes.
> +
> +           for (i=0;i<64;i++)
> +             {
> +               if (i==n)
> +                 return 0;
> +               if (a[i]!=b[i])
> +                 return a[i]-b[i];
> +             }
> +         */ 
> +	ALIGN (4)
> +L(cross_page):
> +	testq	%rdx, %rdx
> +	je	L(return_zero)
> +	movzbl	(%rdi), %eax
> +	movzbl	(%rsi), %ecx
> +	cmpb	%cl, %al
> +	jne	L(cross_page_different)
> +	movl	$1, %r8d
> +	jmp	L(cross_page_loop_start)
> +
> +	ALIGN (4)
> +L(cross_page_loop):
> +	movzbl	(%rdi,%r8), %eax
> +	movzbl	(%rsi,%r8), %ecx
> +	cmpb	%cl, %al
> +	jne	L(cross_page_different)
> +	addq	$1, %r8
> +	cmpq	$65, %r8
> +	je	L(align_loop)
> +L(cross_page_loop_start):
> +	cmpq	%rdx, %r8
> +	jne	L(cross_page_loop)
> +L(return_zero):
> +	xorl	%eax, %eax
> +	ret
> +L(cross_page_different):
> +	subl	%ecx, %eax
> +	ret
> +
> +END (MEMCMP)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
> index da88af2..9095f00 100644
> --- a/sysdeps/x86_64/multiarch/memcmp.S
> +++ b/sysdeps/x86_64/multiarch/memcmp.S
> @@ -35,9 +35,9 @@ ENTRY(memcmp)
>  	leaq	__memcmp_sse2(%rip), %rax
>  	ret
>  
> -2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
> +2:	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
>  	jz	3f
> -	leaq	__memcmp_sse4_1(%rip), %rax
> +	leaq	__memcmp_sse2_unaligned(%rip), %rax
>  	ret
>  
>  3:	leaq	__memcmp_ssse3(%rip), %rax
> -- 
> 1.8.3.2
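
(For context on the memcmp.S change: an ifunc resolver runs once, when the
symbol is resolved, and returns the implementation to use for all later
calls.  Below is a minimal, self-contained C sketch of the same dispatch
pattern; the feature test and both implementations are dummies standing in
for glibc's bit_Fast_Unaligned_Load check and the assembly routines.)

#include <stddef.h>
#include <stdio.h>

typedef int memcmp_fn (const void *, const void *, size_t);

/* Dummy "scalar" implementation.  */
static int
my_memcmp_byte (const void *a, const void *b, size_t n)
{
  const unsigned char *p = a, *q = b;
  for (size_t i = 0; i < n; i++)
    if (p[i] != q[i])
      return p[i] - q[i];
  return 0;
}

/* Dummy stand-in for __memcmp_sse2_unaligned.  */
static int
my_memcmp_unaligned (const void *a, const void *b, size_t n)
{
  return my_memcmp_byte (a, b, n);
}

/* Stand-in for testing bit_Fast_Unaligned_Load in __cpu_features.  */
static int
cpu_has_fast_unaligned_load (void)
{
  return 1;
}

/* The resolver: returns the function the dynamic linker should bind
   my_memcmp to.  */
static memcmp_fn *
my_memcmp_resolver (void)
{
  return cpu_has_fast_unaligned_load ()
	 ? my_memcmp_unaligned : my_memcmp_byte;
}

/* GCC/ELF ifunc: calls to my_memcmp go through the chosen implementation.  */
int my_memcmp (const void *a, const void *b, size_t n)
     __attribute__ ((ifunc ("my_memcmp_resolver")));

int
main (void)
{
  printf ("%d\n", my_memcmp ("abc", "abd", 3));  /* Prints a negative value.  */
  return 0;
}
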

-- 

You must've hit the wrong any key.

