This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PING][PATCH v3 neleai/string-x64] Improve memcmp performance and fix regression.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Fri, 3 Jul 2015 09:45:49 +0200
- Subject: [PING][PATCH v3 neleai/string-x64] Improve memcmp performance and fix regression.
- Authentication-results: sourceware.org; auth=none
- References: <20150618080910 dot GA27306 at domone> <20150619155304 dot GA26278 at domone> <20150621104732 dot GA16055 at domone>
On Sun, Jun 21, 2015 at 12:47:32PM +0200, Ondřej Bílka wrote:
> On Fri, Jun 19, 2015 at 05:53:04PM +0200, Ondřej Bílka wrote:
> > On Thu, Jun 18, 2015 at 10:09:10AM +0200, Ondřej Bílka wrote:
> > > Hi,
> > >
> > > As with the memcmp improvement I submitted back in 2013, here is a new
> > > version that improves performance a bit more.
> > >
> > > Also, when I browsed the results I found that memcmp-sse4 is in fact a
> > > regression on the i7 Nehalem, Ivy Bridge and Haswell architectures. There
> > > it is beaten by the old sse2 code by more than 10%.
> > >
>
> Also, when I tried different headers to see if I could improve the avx2
> version, it turned out that the byte-by-byte loop that I use for the
> cross-page case is best. If I always use that, it beats the sse4 version
> on the gcc workload.
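>
> The cross-page check itself is cheap. Roughly, as a C sketch of the
> predicate (4096-byte pages and 64-byte reads, matching the assembly
> below):
>
>   #include <stdint.h>
>
>   /* True if a 64-byte unaligned load at p could touch the next page.  */
>   static int
>   cross_page (const void *p)
>   {
>     return ((uintptr_t) p & 4095) > 4096 - 64;
>   }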
>
> The main problem is that branch misprediction kills performance, and I
> couldn't make the decision about n fast enough.
>
> > > The main idea of the new implementation is the same; the performance
> > > problem is that a lot of inputs were identical with small n.
> > > For that I found that the following approach gives the best performance
> > > when n<64 is likely.
> > >
> > > if (!cross_page (s1) && !cross_page (s2))
> > >   {
> > >     mask = get_mask (EQ (EQ (LOAD (s1), LOAD (s2)), zero));
> > >     mask2 = mask & ((2 << (n - 1)) - 1);
> > >     if (mask2)
> > >       return s1[first_byte (mask2)] - s2[first_byte (mask2)];
> > >     if (n <= 16)
> > >       return 0;
> > >     mask |= get_mask (EQ (EQ (LOAD (s1 + 16), LOAD (s2 + 16)), zero)) << 16;
> > >     mask |= get_mask (EQ (EQ (LOAD (s1 + 32), LOAD (s2 + 32)), zero)) << 32;
> > >     mask |= get_mask (EQ (EQ (LOAD (s1 + 48), LOAD (s2 + 48)), zero)) << 48;
> > >     mask2 = mask & ((2 << (n - 1)) - 1);
> > >     if (mask2)
> > >       return s1[first_byte (mask2)] - s2[first_byte (mask2)];
> > >     if (n <= 64)
> > >       return 0;
> > >     if (mask)
> > >       return s1[first_byte (mask)] - s2[first_byte (mask)];
> > >   }
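> > >
> > > For reference, the helpers above map onto SSE2 as follows (a sketch;
> > > LOAD/EQ/get_mask/first_byte are just shorthand, not names from the
> > > patch):
> > >
> > >   #include <stdint.h>
> > >   #include <emmintrin.h>
> > >
> > >   #define LOAD(p)       _mm_loadu_si128 ((const __m128i *) (p)) /* movdqu */
> > >   #define EQ(a, b)      _mm_cmpeq_epi8 ((a), (b))               /* pcmpeqb */
> > >   #define get_mask(x)   ((uint64_t) _mm_movemask_epi8 (x))      /* pmovmskb */
> > >   #define first_byte(m) __builtin_ctzl (m)                      /* bsf */
> > >
> > >   /* zero is _mm_setzero_si128 (); EQ (EQ (a, b), zero) marks the bytes
> > >      where a and b differ, so get_mask yields one bit per differing
> > >      byte.  */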
> > >
> > > I haven't yet checked using just registers and byteswap to eliminate the
> > > need to get the exact byte position, as I wrote in a related thread.
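> > >
> > > The byteswap idea would look roughly like this (my sketch of it, not
> > > code from the patch):
> > >
> > >   #include <stdint.h>
> > >   #include <string.h>
> > >
> > >   /* Compare 8-byte chunks as big-endian integers: after bswap the first
> > >      byte is most significant, so an unsigned comparison gives the memcmp
> > >      order without locating the differing byte.  */
> > >   static int
> > >   cmp8 (const unsigned char *s1, const unsigned char *s2)
> > >   {
> > >     uint64_t a, b;
> > >     memcpy (&a, s1, 8);
> > >     memcpy (&b, s2, 8);
> > >     a = __builtin_bswap64 (a);
> > >     b = __builtin_bswap64 (b);
> > >     return a > b ? 1 : a < b ? -1 : 0;
> > >   }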
> > >
> > > I could improve this a bit more; I lose a lot of cycles in the loop
> > > ending conditions. The problem is that I need to handle that an
> > > unaligned s2 may read from the next page, so I would need more
> > > complicated logic to compute the number of loop iterations.
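> > >
> > > Something like the following (the kind of logic meant, untested):
> > >
> > >   /* 64-byte iterations that are safe before the unaligned s2 would
> > >      read past its current 4096-byte page.  */
> > >   size_t page_left = 4096 - ((uintptr_t) s2 & 4095);
> > >   size_t iters = (n < page_left ? n : page_left) / 64;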
> > >
> > > That's related to avx2. I included it as an RFC, but it harms
> > > performance on Haswell.
> > >
> > > Last is wmemcmp, which I would also need to convert; for now I just
> > > moved memcmp-sse4 there.
> > >
> > > A profile can be found here:
> > >
> > > http://kam.mff.cuni.cz/~ondra/benchmark_string/memcmp_profile.html
> > >
> > I updated to a new version. I removed avx2 for now; I will submit it
> > when I find out how it could improve performance.
> >
> > The second change is that I added wmemcmp conditionals, so now I could
> > delete memcmp-sse4 and wmemcmp-sse4.
> >
> >
> After finding the bts trick for strncmp I also tried to use it in
> memcmp. The problem is that in memcmp my previous control flow was
> better: for memcmp it is likely that the arguments are equal, so I save
> the cost of bsf and of comparing bytes.
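>
> The trick, as the assembly below does it: build the length mask with
> bts instead of a variable-count shift (a sketch, valid for 0 < n < 64):
>
>   #include <stdint.h>
>
>   static uint64_t
>   length_mask (unsigned n)
>   {
>     uint64_t m = 0;
>     m |= (uint64_t) 1 << n;  /* xor %ecx,%ecx; bts %rdx,%rcx */
>     return m - 1;            /* sub $1,%rcx -> (1 << n) - 1 */
>   }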
>
> The only improvement was that using bts with the same control flow
> saves a few cycles, giving around a 2% improvement on the gcc workload.
>
> Also, in the cross-page case the only optimization was to unroll the
> byte-by-byte loop, as switching to wider comparisons caused more
> overhead than it saved.
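>
> A sketch of what L(cross_page) in the patch does, with s1 and s2 as
> unsigned char pointers:
>
>   /* Four byte-compares per iteration; no wide loads, since any byte may
>      be the last one before the page boundary.  */
>   while (n)
>     {
>       if (s1[0] != s2[0]) return s1[0] - s2[0];
>       if (n == 1) return 0;
>       if (s1[1] != s2[1]) return s1[1] - s2[1];
>       if (n == 2) return 0;
>       if (s1[2] != s2[2]) return s1[2] - s2[2];
>       if (n == 3) return 0;
>       if (s1[3] != s2[3]) return s1[3] - s2[3];
>       n -= 4; s1 += 4; s2 += 4;
>     }
>   return 0;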
>
> So what about the following version?
>
> * sysdeps/x86_64/memcmp.S: New implementation.
> * sysdeps/x86_64/multiarch/ifunc-impl-list.c
> (__libc_ifunc_impl_list): Remove memcmp-sse4.
> * sysdeps/x86_64/multiarch/Makefile (routines): Remove memcmp-sse4.
> * sysdeps/x86_64/multiarch/memcmp.S: Likewise.
> * sysdeps/x86_64/multiarch/memcmp-sse4.S: Removed.
> * sysdeps/x86_64/multiarch/wmemcmp-sse4.S: Likewise.
> >
>
> ---
> sysdeps/x86_64/memcmp.S | 512 +++----
> sysdeps/x86_64/multiarch/Makefile | 6 +-
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 9 +-
> sysdeps/x86_64/multiarch/memcmp-avx2.S | 3 +
> sysdeps/x86_64/multiarch/memcmp-sse4.S | 1776 ----------------------
> sysdeps/x86_64/multiarch/memcmp.S | 25 +-
> sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S | 9 +-
> sysdeps/x86_64/multiarch/wmemcmp-sse4.S | 4 -
> sysdeps/x86_64/multiarch/wmemcmp.S | 12 +-
> 9 files changed, 221 insertions(+), 2135 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2.S
> delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S
> delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse4.S
>
> diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
> index f636716..88c0c4a 100644
> --- a/sysdeps/x86_64/memcmp.S
> +++ b/sysdeps/x86_64/memcmp.S
> @@ -19,340 +19,204 @@
>
> #include <sysdep.h>
>
> +#ifndef MEMCMP
> +# define MEMCMP memcmp
> +#endif
> +
> .text
> -ENTRY (memcmp)
> - test %rdx, %rdx
> - jz L(finz)
> - cmpq $1, %rdx
> - jle L(finr1b)
> - subq %rdi, %rsi
> - movq %rdx, %r10
> - cmpq $32, %r10
> - jge L(gt32)
> - /* Handle small chunks and last block of less than 32 bytes. */
> -L(small):
> - testq $1, %r10
> - jz L(s2b)
> - movzbl (%rdi), %eax
> - movzbl (%rdi, %rsi), %edx
> - subq $1, %r10
> - je L(finz1)
> - addq $1, %rdi
> - subl %edx, %eax
> - jnz L(exit)
> -L(s2b):
> - testq $2, %r10
> - jz L(s4b)
> - movzwl (%rdi), %eax
> - movzwl (%rdi, %rsi), %edx
> - subq $2, %r10
> - je L(fin2_7)
> - addq $2, %rdi
> - cmpl %edx, %eax
> - jnz L(fin2_7)
> -L(s4b):
> - testq $4, %r10
> - jz L(s8b)
> - movl (%rdi), %eax
> - movl (%rdi, %rsi), %edx
> - subq $4, %r10
> - je L(fin2_7)
> - addq $4, %rdi
> - cmpl %edx, %eax
> - jnz L(fin2_7)
> -L(s8b):
> - testq $8, %r10
> - jz L(s16b)
> - movq (%rdi), %rax
> - movq (%rdi, %rsi), %rdx
> - subq $8, %r10
> - je L(fin2_7)
> - addq $8, %rdi
> - cmpq %rdx, %rax
> - jnz L(fin2_7)
> -L(s16b):
> - movdqu (%rdi), %xmm1
> - movdqu (%rdi, %rsi), %xmm0
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - xorl %eax, %eax
> - subl $0xffff, %edx
> - jz L(finz)
> - bsfl %edx, %ecx
> - leaq (%rdi, %rcx), %rcx
> - movzbl (%rcx), %eax
> - movzbl (%rsi, %rcx), %edx
> - jmp L(finz1)
> +ENTRY (MEMCMP)
> + testq %rdx, %rdx
> + je L(return_zero)
> +#ifdef AS_WMEMCMP
> + shl $2, %rdx
> +#endif
> + pxor %xmm4, %xmm4
> + movl %edi, %eax
> + andl $4095, %eax
> + cmpl $4032, %eax
> + ja L(cross_page_start)
> +L(handle_end):
> + movl %esi, %eax
> + andl $4095, %eax
> + cmpl $4032, %eax
> + ja L(cross_page_start)
> +L(back_header):
> + xor %ecx, %ecx
> + bts %rdx, %rcx
> + sub $1, %rcx
> + movdqu (%rdi), %xmm0
> + movdqu (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm4, %xmm0
> + pmovmskb %xmm0, %eax
> + and %ecx, %eax
> + jne L(different)
> + cmpq $16, %rdx
> + ja L(next)
> + ret
> +L(next):
> + pmovmskb %xmm0, %r8d
> + movdqu 16(%rdi), %xmm2
> + movdqu 16(%rsi), %xmm6
> + movdqu 32(%rdi), %xmm1
> + pcmpeqb %xmm6, %xmm2
> + movdqu 32(%rsi), %xmm5
> + pcmpeqb %xmm4, %xmm2
> + pcmpeqb %xmm5, %xmm1
> + movdqu 48(%rdi), %xmm7
> + pmovmskb %xmm2, %eax
> + movdqu 48(%rsi), %xmm3
> + pcmpeqb %xmm4, %xmm1
> + pmovmskb %xmm1, %r9d
> + sal $16, %eax
> + pcmpeqb %xmm3, %xmm7
> + salq $32, %r9
> + pcmpeqb %xmm4, %xmm7
> + orq %r9, %rax
> + orq %r8, %rax
> + pmovmskb %xmm7, %r8d
> + salq $48, %r8
> + orq %r8, %rax
> + movq %rax, %r8
> + andq %rcx, %rax
> + jne L(different)
> + cmpq $64, %rdx
> + jb L(return_zero)
> + movq %r8, %rax
> + testq %rax, %rax
> + jne L(different)
> +L(align_loop):
> + leaq 64(%rdi), %rax
> + andq $-64, %rax
> + subq %rdi, %rax
> + subq %rax, %rdx
> + addq %rax, %rdi
> + addq %rax, %rsi
> + cmpq $64, %rdx
> + ja L(loop_start)
> + testq %rdx, %rdx
> + jne L(handle_end)
> + xorl %eax, %eax
> + ret
>
> - .p2align 4,, 4
> -L(finr1b):
> - movzbl (%rdi), %eax
> - movzbl (%rsi), %edx
> -L(finz1):
> + .p2align 4
> +L(different):
> + bsfq %rax, %rdx
> +#ifdef AS_WMEMCMP
> + and $-4, %rdx
> + mov (%rdi,%rdx), %eax
> + mov (%rsi,%rdx), %edx
> subl %edx, %eax
> -L(exit):
> + jg L(ret1)
> + jl L(ret_neg_1)
> ret
> -
> - .p2align 4,, 4
> -L(fin2_7):
> - cmpq %rdx, %rax
> - jz L(finz)
> - movq %rax, %r11
> - subq %rdx, %r11
> - bsfq %r11, %rcx
> - sarq $3, %rcx
> - salq $3, %rcx
> - sarq %cl, %rax
> - movzbl %al, %eax
> - sarq %cl, %rdx
> - movzbl %dl, %edx
> +L(ret1):
> + mov $1, %eax
> + ret
> +L(ret_neg_1):
> + mov $-1, %eax
> + ret
> +#else
> + movzbl (%rdi,%rdx), %eax
> + movzbl (%rsi,%rdx), %edx
> subl %edx, %eax
> ret
> -
> - .p2align 4,, 4
> -L(finz):
> +#endif
> +L(return_zero):
> + xor %eax, %eax
> + ret
> + .p2align 4
> +L(loop):
> + subq $64, %rdx
> + addq $64, %rdi
> + addq $64, %rsi
> + cmpq $64, %rdx
> + jbe L(less_64_bytes)
> +L(loop_start):
> + movdqu (%rsi), %xmm0
> + movdqu 16(%rsi), %xmm1
> + pcmpeqb (%rdi), %xmm0
> + movdqu 32(%rsi), %xmm2
> + pcmpeqb 16(%rdi), %xmm1
> + movdqu 48(%rsi), %xmm3
> + pcmpeqb 32(%rdi), %xmm2
> + pcmpeqb 48(%rdi), %xmm3
> + pminub %xmm0, %xmm3
> + pminub %xmm1, %xmm3
> + pminub %xmm2, %xmm3
> + pcmpeqb %xmm4, %xmm3
> + pmovmskb %xmm3, %eax
> + testl %eax, %eax
> + je L(loop)
> + shl $48, %rax
> + pcmpeqb %xmm4, %xmm0
> + pcmpeqb %xmm4, %xmm1
> + pcmpeqb %xmm4, %xmm2
> + pmovmskb %xmm0, %r8
> + pmovmskb %xmm1, %rcx
> + pmovmskb %xmm2, %r9
> + shl $16, %ecx
> + shl $32, %r9
> + or %r8, %rax
> + or %r9, %rax
> + or %rcx, %rax
> + jmp L(different)
> +
> + .p2align 4
> +L(less_64_bytes):
> + testq %rdx, %rdx
> + jne L(handle_end)
> xorl %eax, %eax
> ret
>
> - /* For blocks bigger than 32 bytes
> - 1. Advance one of the addr pointer to be 16B aligned.
> - 2. Treat the case of both addr pointers aligned to 16B
> - separately to avoid movdqu.
> - 3. Handle any blocks of greater than 64 consecutive bytes with
> - unrolling to reduce branches.
> - 4. At least one addr pointer is 16B aligned, use memory version
> - of pcmbeqb.
> - */
> - .p2align 4,, 4
> -L(gt32):
> - movq %rdx, %r11
> - addq %rdi, %r11
> - movq %rdi, %r8
> -
> - andq $15, %r8
> - jz L(16am)
> - /* Both pointers may be misaligned. */
> - movdqu (%rdi), %xmm1
> - movdqu (%rdi, %rsi), %xmm0
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - neg %r8
> - leaq 16(%rdi, %r8), %rdi
> -L(16am):
> - /* Handle two 16B aligned pointers separately. */
> - testq $15, %rsi
> - jz L(ATR)
> - testq $16, %rdi
> - jz L(A32)
> - movdqu (%rdi, %rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -L(A32):
> - movq %r11, %r10
> - andq $-32, %r10
> - cmpq %r10, %rdi
> - jge L(mt16)
> - /* Pre-unroll to be ready for unrolled 64B loop. */
> - testq $32, %rdi
> - jz L(A64)
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> -L(A64):
> - movq %r11, %r10
> - andq $-64, %r10
> - cmpq %r10, %rdi
> - jge L(mt32)
> -
> -L(A64main):
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - cmpq %rdi, %r10
> - jne L(A64main)
> -
> -L(mt32):
> - movq %r11, %r10
> - andq $-32, %r10
> - cmpq %r10, %rdi
> - jge L(mt16)
>
> -L(A32main):
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - cmpq %rdi, %r10
> - jne L(A32main)
> -L(mt16):
> - subq %rdi, %r11
> - je L(finz)
> - movq %r11, %r10
> - jmp L(small)
> -
> - .p2align 4,, 4
> -L(neq):
> - bsfl %edx, %ecx
> - movzbl (%rdi, %rcx), %eax
> - addq %rdi, %rsi
> - movzbl (%rsi,%rcx), %edx
> - jmp L(finz1)
> -
> - .p2align 4,, 4
> -L(ATR):
> - movq %r11, %r10
> - andq $-32, %r10
> - cmpq %r10, %rdi
> - jge L(mt16)
> - testq $16, %rdi
> - jz L(ATR32)
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> - cmpq %rdi, %r10
> - je L(mt16)
> -
> -L(ATR32):
> - movq %r11, %r10
> - andq $-64, %r10
> - testq $32, %rdi
> - jz L(ATR64)
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> -L(ATR64):
> - cmpq %rdi, %r10
> - je L(mt32)
> -
> -L(ATR64main):
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> - cmpq %rdi, %r10
> - jne L(ATR64main)
> -
> - movq %r11, %r10
> - andq $-32, %r10
> - cmpq %r10, %rdi
> - jge L(mt16)
> -
> -L(ATR32res):
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - cmpq %r10, %rdi
> - jne L(ATR32res)
> -
> - subq %rdi, %r11
> - je L(finz)
> - movq %r11, %r10
> - jmp L(small)
> - /* Align to 16byte to improve instruction fetch. */
> - .p2align 4,, 4
> -END(memcmp)
> + .p2align 4
> +L(cross_page_start):
> + cmp $64, %rdx
> + ja L(back_header)
> +
> + .p2align 4
> +L(cross_page):
> + test %edx, %edx
> + je L(return_zero)
> +#ifdef AS_WMEMCMP
> + mov (%rdi), %eax
> + mov (%rsi), %ecx
> + subl %ecx, %eax
> + jg L(ret1)
> + jl L(ret_neg_1)
> +#else
> + movzbl (%rdi), %eax
> + movzbl (%rsi), %ecx
> + subl %ecx, %eax
> + jne L(return)
> + cmp $1, %edx
> + je L(return)
> + movzbl 1(%rdi), %eax
> + movzbl 1(%rsi), %ecx
> + subl %ecx, %eax
> + jne L(return)
> + cmp $2, %edx
> + je L(return)
> + movzbl 2(%rdi), %eax
> + movzbl 2(%rsi), %ecx
> + subl %ecx, %eax
> + jne L(return)
> + cmp $3, %edx
> + je L(return)
> + movzbl 3(%rdi), %eax
> + movzbl 3(%rsi), %ecx
> + subl %ecx, %eax
> + jne L(return)
> +#endif
> + sub $4, %edx
> + add $4, %rdi
> + add $4, %rsi
> + jmp L(cross_page)
> +L(return):
> + ret
> +END(MEMCMP)
>
> -#undef bcmp
> +#undef bcmp
> weak_alias (memcmp, bcmp)
> libc_hidden_builtin_def (memcmp)
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index c573744..679db2a 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -8,7 +8,7 @@ ifeq ($(subdir),string)
>
> sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
> strcmp-sse2-unaligned strncmp-ssse3 \
> - memcmp-sse4 memcpy-ssse3 \
> + memcpy-ssse3 \
> memcpy-sse2-unaligned mempcpy-ssse3 \
> memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
> @@ -29,10 +29,10 @@ CFLAGS-strspn-c.c += -msse4
> endif
>
> ifeq (yes,$(config-cflags-avx2))
> -sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2
> +sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2
> endif
> endif
>
> ifeq ($(subdir),wcsmbs)
> -sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
> +sysdep_routines += wmemcmp-sse2-unaligned wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
> endif
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index d398e43..b3dbe65 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -39,10 +39,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/memcmp.S. */
> IFUNC_IMPL (i, name, memcmp,
> - IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
> - __memcmp_sse4_1)
> + IFUNC_IMPL_ADD (array, i, memcmp, HAS_AVX2, __memcmp_avx2)
> IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
> - IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
> + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned))
>
> /* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
> IFUNC_IMPL (i, name, __memmove_chk,
> @@ -211,8 +210,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/wmemcmp.S. */
> IFUNC_IMPL (i, name, wmemcmp,
> - IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_1,
> - __wmemcmp_sse4_1)
> + IFUNC_IMPL_ADD (array, i, wmemcmp, 1,
> + __wmemcmp_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3,
> __wmemcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
> diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S
> new file mode 100644
> index 0000000..60483bf
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
> @@ -0,0 +1,3 @@
> +#define USE_AVX2
> +#define MEMCMP __memcmp_avx2
> +#include "../memcmp.S"
> diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
> deleted file mode 100644
> index 533fece..0000000
> --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
> +++ /dev/null
> @@ -1,1776 +0,0 @@
> -/* memcmp with SSE4.1, wmemcmp with SSE4.1
> - Copyright (C) 2010-2015 Free Software Foundation, Inc.
> - Contributed by Intel Corporation.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <http://www.gnu.org/licenses/>. */
> -
> -#if IS_IN (libc)
> -
> -# include <sysdep.h>
> -
> -# ifndef MEMCMP
> -# define MEMCMP __memcmp_sse4_1
> -# endif
> -
> -# define JMPTBL(I, B) (I - B)
> -
> -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
> - lea TABLE(%rip), %r11; \
> - movslq (%r11, INDEX, SCALE), %rcx; \
> - add %r11, %rcx; \
> - jmp *%rcx; \
> - ud2
> -
> -/* Warning!
> - wmemcmp has to use SIGNED comparison for elements.
> - memcmp has to use UNSIGNED comparison for elemnts.
> -*/
> -
> - .section .text.sse4.1,"ax",@progbits
> -ENTRY (MEMCMP)
> -# ifdef USE_AS_WMEMCMP
> - shl $2, %rdx
> -# endif
> - pxor %xmm0, %xmm0
> - cmp $79, %rdx
> - ja L(79bytesormore)
> -# ifndef USE_AS_WMEMCMP
> - cmp $1, %rdx
> - je L(firstbyte)
> -# endif
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -# ifndef USE_AS_WMEMCMP
> - .p2align 4
> -L(firstbyte):
> - movzbl (%rdi), %eax
> - movzbl (%rsi), %ecx
> - sub %ecx, %eax
> - ret
> -# endif
> -
> - .p2align 4
> -L(79bytesormore):
> - movdqu (%rsi), %xmm1
> - movdqu (%rdi), %xmm2
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(16bytesin256)
> - mov %rsi, %rcx
> - and $-16, %rsi
> - add $16, %rsi
> - sub %rsi, %rcx
> -
> - sub %rcx, %rdi
> - add %rcx, %rdx
> - test $0xf, %rdi
> - jz L(2aligned)
> -
> - cmp $128, %rdx
> - ja L(128bytesormore)
> -L(less128bytes):
> - sub $64, %rdx
> -
> - movdqu (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(16bytesin256)
> -
> - movdqu 16(%rdi), %xmm2
> - pxor 16(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(32bytesin256)
> -
> - movdqu 32(%rdi), %xmm2
> - pxor 32(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(48bytesin256)
> -
> - movdqu 48(%rdi), %xmm2
> - pxor 48(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(64bytesin256)
> - cmp $32, %rdx
> - jb L(less32bytesin64)
> -
> - movdqu 64(%rdi), %xmm2
> - pxor 64(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(80bytesin256)
> -
> - movdqu 80(%rdi), %xmm2
> - pxor 80(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(96bytesin256)
> - sub $32, %rdx
> - add $32, %rdi
> - add $32, %rsi
> -L(less32bytesin64):
> - add $64, %rdi
> - add $64, %rsi
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -L(128bytesormore):
> - cmp $512, %rdx
> - ja L(512bytesormore)
> - cmp $256, %rdx
> - ja L(less512bytes)
> -L(less256bytes):
> - sub $128, %rdx
> -
> - movdqu (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(16bytesin256)
> -
> - movdqu 16(%rdi), %xmm2
> - pxor 16(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(32bytesin256)
> -
> - movdqu 32(%rdi), %xmm2
> - pxor 32(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(48bytesin256)
> -
> - movdqu 48(%rdi), %xmm2
> - pxor 48(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(64bytesin256)
> -
> - movdqu 64(%rdi), %xmm2
> - pxor 64(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(80bytesin256)
> -
> - movdqu 80(%rdi), %xmm2
> - pxor 80(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(96bytesin256)
> -
> - movdqu 96(%rdi), %xmm2
> - pxor 96(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(112bytesin256)
> -
> - movdqu 112(%rdi), %xmm2
> - pxor 112(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(128bytesin256)
> -
> - add $128, %rsi
> - add $128, %rdi
> -
> - cmp $64, %rdx
> - jae L(less128bytes)
> -
> - cmp $32, %rdx
> - jb L(less32bytesin128)
> -
> - movdqu (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(16bytesin256)
> -
> - movdqu 16(%rdi), %xmm2
> - pxor 16(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(32bytesin256)
> - sub $32, %rdx
> - add $32, %rdi
> - add $32, %rsi
> -L(less32bytesin128):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -L(less512bytes):
> - sub $256, %rdx
> - movdqu (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(16bytesin256)
> -
> - movdqu 16(%rdi), %xmm2
> - pxor 16(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(32bytesin256)
> -
> - movdqu 32(%rdi), %xmm2
> - pxor 32(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(48bytesin256)
> -
> - movdqu 48(%rdi), %xmm2
> - pxor 48(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(64bytesin256)
> -
> - movdqu 64(%rdi), %xmm2
> - pxor 64(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(80bytesin256)
> -
> - movdqu 80(%rdi), %xmm2
> - pxor 80(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(96bytesin256)
> -
> - movdqu 96(%rdi), %xmm2
> - pxor 96(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(112bytesin256)
> -
> - movdqu 112(%rdi), %xmm2
> - pxor 112(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(128bytesin256)
> -
> - movdqu 128(%rdi), %xmm2
> - pxor 128(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(144bytesin256)
> -
> - movdqu 144(%rdi), %xmm2
> - pxor 144(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(160bytesin256)
> -
> - movdqu 160(%rdi), %xmm2
> - pxor 160(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(176bytesin256)
> -
> - movdqu 176(%rdi), %xmm2
> - pxor 176(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(192bytesin256)
> -
> - movdqu 192(%rdi), %xmm2
> - pxor 192(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(208bytesin256)
> -
> - movdqu 208(%rdi), %xmm2
> - pxor 208(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(224bytesin256)
> -
> - movdqu 224(%rdi), %xmm2
> - pxor 224(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(240bytesin256)
> -
> - movdqu 240(%rdi), %xmm2
> - pxor 240(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(256bytesin256)
> -
> - add $256, %rsi
> - add $256, %rdi
> -
> - cmp $128, %rdx
> - jae L(less256bytes)
> -
> - cmp $64, %rdx
> - jae L(less128bytes)
> -
> - cmp $32, %rdx
> - jb L(less32bytesin256)
> -
> - movdqu (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(16bytesin256)
> -
> - movdqu 16(%rdi), %xmm2
> - pxor 16(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(32bytesin256)
> - sub $32, %rdx
> - add $32, %rdi
> - add $32, %rsi
> -L(less32bytesin256):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> - .p2align 4
> -L(512bytesormore):
> -# ifdef DATA_CACHE_SIZE_HALF
> - mov $DATA_CACHE_SIZE_HALF, %R8_LP
> -# else
> - mov __x86_data_cache_size_half(%rip), %R8_LP
> -# endif
> - mov %r8, %r9
> - shr $1, %r8
> - add %r9, %r8
> - cmp %r8, %rdx
> - ja L(L2_L3_cache_unaglined)
> - sub $64, %rdx
> - .p2align 4
> -L(64bytesormore_loop):
> - movdqu (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - movdqa %xmm2, %xmm1
> -
> - movdqu 16(%rdi), %xmm3
> - pxor 16(%rsi), %xmm3
> - por %xmm3, %xmm1
> -
> - movdqu 32(%rdi), %xmm4
> - pxor 32(%rsi), %xmm4
> - por %xmm4, %xmm1
> -
> - movdqu 48(%rdi), %xmm5
> - pxor 48(%rsi), %xmm5
> - por %xmm5, %xmm1
> -
> - ptest %xmm1, %xmm0
> - jnc L(64bytesormore_loop_end)
> - add $64, %rsi
> - add $64, %rdi
> - sub $64, %rdx
> - jae L(64bytesormore_loop)
> -
> - add $64, %rdx
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -L(L2_L3_cache_unaglined):
> - sub $64, %rdx
> - .p2align 4
> -L(L2_L3_unaligned_128bytes_loop):
> - prefetchnta 0x1c0(%rdi)
> - prefetchnta 0x1c0(%rsi)
> - movdqu (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - movdqa %xmm2, %xmm1
> -
> - movdqu 16(%rdi), %xmm3
> - pxor 16(%rsi), %xmm3
> - por %xmm3, %xmm1
> -
> - movdqu 32(%rdi), %xmm4
> - pxor 32(%rsi), %xmm4
> - por %xmm4, %xmm1
> -
> - movdqu 48(%rdi), %xmm5
> - pxor 48(%rsi), %xmm5
> - por %xmm5, %xmm1
> -
> - ptest %xmm1, %xmm0
> - jnc L(64bytesormore_loop_end)
> - add $64, %rsi
> - add $64, %rdi
> - sub $64, %rdx
> - jae L(L2_L3_unaligned_128bytes_loop)
> -
> - add $64, %rdx
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -/*
> - * This case is for machines which are sensitive for unaligned instructions.
> - */
> - .p2align 4
> -L(2aligned):
> - cmp $128, %rdx
> - ja L(128bytesormorein2aligned)
> -L(less128bytesin2aligned):
> - sub $64, %rdx
> -
> - movdqa (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(16bytesin256)
> -
> - movdqa 16(%rdi), %xmm2
> - pxor 16(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(32bytesin256)
> -
> - movdqa 32(%rdi), %xmm2
> - pxor 32(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(48bytesin256)
> -
> - movdqa 48(%rdi), %xmm2
> - pxor 48(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(64bytesin256)
> - cmp $32, %rdx
> - jb L(less32bytesin64in2alinged)
> -
> - movdqa 64(%rdi), %xmm2
> - pxor 64(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(80bytesin256)
> -
> - movdqa 80(%rdi), %xmm2
> - pxor 80(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(96bytesin256)
> - sub $32, %rdx
> - add $32, %rdi
> - add $32, %rsi
> -L(less32bytesin64in2alinged):
> - add $64, %rdi
> - add $64, %rsi
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> - .p2align 4
> -L(128bytesormorein2aligned):
> - cmp $512, %rdx
> - ja L(512bytesormorein2aligned)
> - cmp $256, %rdx
> - ja L(256bytesormorein2aligned)
> -L(less256bytesin2alinged):
> - sub $128, %rdx
> -
> - movdqa (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(16bytesin256)
> -
> - movdqa 16(%rdi), %xmm2
> - pxor 16(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(32bytesin256)
> -
> - movdqa 32(%rdi), %xmm2
> - pxor 32(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(48bytesin256)
> -
> - movdqa 48(%rdi), %xmm2
> - pxor 48(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(64bytesin256)
> -
> - movdqa 64(%rdi), %xmm2
> - pxor 64(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(80bytesin256)
> -
> - movdqa 80(%rdi), %xmm2
> - pxor 80(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(96bytesin256)
> -
> - movdqa 96(%rdi), %xmm2
> - pxor 96(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(112bytesin256)
> -
> - movdqa 112(%rdi), %xmm2
> - pxor 112(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(128bytesin256)
> -
> - add $128, %rsi
> - add $128, %rdi
> -
> - cmp $64, %rdx
> - jae L(less128bytesin2aligned)
> -
> - cmp $32, %rdx
> - jb L(less32bytesin128in2aligned)
> -
> - movdqu (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(16bytesin256)
> -
> - movdqu 16(%rdi), %xmm2
> - pxor 16(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(32bytesin256)
> - sub $32, %rdx
> - add $32, %rdi
> - add $32, %rsi
> -L(less32bytesin128in2aligned):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> - .p2align 4
> -L(256bytesormorein2aligned):
> -
> - sub $256, %rdx
> - movdqa (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(16bytesin256)
> -
> - movdqa 16(%rdi), %xmm2
> - pxor 16(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(32bytesin256)
> -
> - movdqa 32(%rdi), %xmm2
> - pxor 32(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(48bytesin256)
> -
> - movdqa 48(%rdi), %xmm2
> - pxor 48(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(64bytesin256)
> -
> - movdqa 64(%rdi), %xmm2
> - pxor 64(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(80bytesin256)
> -
> - movdqa 80(%rdi), %xmm2
> - pxor 80(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(96bytesin256)
> -
> - movdqa 96(%rdi), %xmm2
> - pxor 96(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(112bytesin256)
> -
> - movdqa 112(%rdi), %xmm2
> - pxor 112(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(128bytesin256)
> -
> - movdqa 128(%rdi), %xmm2
> - pxor 128(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(144bytesin256)
> -
> - movdqa 144(%rdi), %xmm2
> - pxor 144(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(160bytesin256)
> -
> - movdqa 160(%rdi), %xmm2
> - pxor 160(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(176bytesin256)
> -
> - movdqa 176(%rdi), %xmm2
> - pxor 176(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(192bytesin256)
> -
> - movdqa 192(%rdi), %xmm2
> - pxor 192(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(208bytesin256)
> -
> - movdqa 208(%rdi), %xmm2
> - pxor 208(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(224bytesin256)
> -
> - movdqa 224(%rdi), %xmm2
> - pxor 224(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(240bytesin256)
> -
> - movdqa 240(%rdi), %xmm2
> - pxor 240(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(256bytesin256)
> -
> - add $256, %rsi
> - add $256, %rdi
> -
> - cmp $128, %rdx
> - jae L(less256bytesin2alinged)
> -
> - cmp $64, %rdx
> - jae L(less128bytesin2aligned)
> -
> - cmp $32, %rdx
> - jb L(less32bytesin256in2alinged)
> -
> - movdqa (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(16bytesin256)
> -
> - movdqa 16(%rdi), %xmm2
> - pxor 16(%rsi), %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(32bytesin256)
> - sub $32, %rdx
> - add $32, %rdi
> - add $32, %rsi
> -L(less32bytesin256in2alinged):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> - .p2align 4
> -L(512bytesormorein2aligned):
> -# ifdef DATA_CACHE_SIZE_HALF
> - mov $DATA_CACHE_SIZE_HALF, %R8_LP
> -# else
> - mov __x86_data_cache_size_half(%rip), %R8_LP
> -# endif
> - mov %r8, %r9
> - shr $1, %r8
> - add %r9, %r8
> - cmp %r8, %rdx
> - ja L(L2_L3_cache_aglined)
> -
> - sub $64, %rdx
> - .p2align 4
> -L(64bytesormore_loopin2aligned):
> - movdqa (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - movdqa %xmm2, %xmm1
> -
> - movdqa 16(%rdi), %xmm3
> - pxor 16(%rsi), %xmm3
> - por %xmm3, %xmm1
> -
> - movdqa 32(%rdi), %xmm4
> - pxor 32(%rsi), %xmm4
> - por %xmm4, %xmm1
> -
> - movdqa 48(%rdi), %xmm5
> - pxor 48(%rsi), %xmm5
> - por %xmm5, %xmm1
> -
> - ptest %xmm1, %xmm0
> - jnc L(64bytesormore_loop_end)
> - add $64, %rsi
> - add $64, %rdi
> - sub $64, %rdx
> - jae L(64bytesormore_loopin2aligned)
> -
> - add $64, %rdx
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -L(L2_L3_cache_aglined):
> - sub $64, %rdx
> -
> - .p2align 4
> -L(L2_L3_aligned_128bytes_loop):
> - prefetchnta 0x1c0(%rdi)
> - prefetchnta 0x1c0(%rsi)
> - movdqa (%rdi), %xmm2
> - pxor (%rsi), %xmm2
> - movdqa %xmm2, %xmm1
> -
> - movdqa 16(%rdi), %xmm3
> - pxor 16(%rsi), %xmm3
> - por %xmm3, %xmm1
> -
> - movdqa 32(%rdi), %xmm4
> - pxor 32(%rsi), %xmm4
> - por %xmm4, %xmm1
> -
> - movdqa 48(%rdi), %xmm5
> - pxor 48(%rsi), %xmm5
> - por %xmm5, %xmm1
> -
> - ptest %xmm1, %xmm0
> - jnc L(64bytesormore_loop_end)
> - add $64, %rsi
> - add $64, %rdi
> - sub $64, %rdx
> - jae L(L2_L3_aligned_128bytes_loop)
> -
> - add $64, %rdx
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -
> - .p2align 4
> -L(64bytesormore_loop_end):
> - add $16, %rdi
> - add $16, %rsi
> - ptest %xmm2, %xmm0
> - jnc L(16bytes)
> -
> - add $16, %rdi
> - add $16, %rsi
> - ptest %xmm3, %xmm0
> - jnc L(16bytes)
> -
> - add $16, %rdi
> - add $16, %rsi
> - ptest %xmm4, %xmm0
> - jnc L(16bytes)
> -
> - add $16, %rdi
> - add $16, %rsi
> - jmp L(16bytes)
> -
> -L(256bytesin256):
> - add $256, %rdi
> - add $256, %rsi
> - jmp L(16bytes)
> -L(240bytesin256):
> - add $240, %rdi
> - add $240, %rsi
> - jmp L(16bytes)
> -L(224bytesin256):
> - add $224, %rdi
> - add $224, %rsi
> - jmp L(16bytes)
> -L(208bytesin256):
> - add $208, %rdi
> - add $208, %rsi
> - jmp L(16bytes)
> -L(192bytesin256):
> - add $192, %rdi
> - add $192, %rsi
> - jmp L(16bytes)
> -L(176bytesin256):
> - add $176, %rdi
> - add $176, %rsi
> - jmp L(16bytes)
> -L(160bytesin256):
> - add $160, %rdi
> - add $160, %rsi
> - jmp L(16bytes)
> -L(144bytesin256):
> - add $144, %rdi
> - add $144, %rsi
> - jmp L(16bytes)
> -L(128bytesin256):
> - add $128, %rdi
> - add $128, %rsi
> - jmp L(16bytes)
> -L(112bytesin256):
> - add $112, %rdi
> - add $112, %rsi
> - jmp L(16bytes)
> -L(96bytesin256):
> - add $96, %rdi
> - add $96, %rsi
> - jmp L(16bytes)
> -L(80bytesin256):
> - add $80, %rdi
> - add $80, %rsi
> - jmp L(16bytes)
> -L(64bytesin256):
> - add $64, %rdi
> - add $64, %rsi
> - jmp L(16bytes)
> -L(48bytesin256):
> - add $16, %rdi
> - add $16, %rsi
> -L(32bytesin256):
> - add $16, %rdi
> - add $16, %rsi
> -L(16bytesin256):
> - add $16, %rdi
> - add $16, %rsi
> -L(16bytes):
> - mov -16(%rdi), %rax
> - mov -16(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> -L(8bytes):
> - mov -8(%rdi), %rax
> - mov -8(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(12bytes):
> - mov -12(%rdi), %rax
> - mov -12(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> -L(4bytes):
> - mov -4(%rsi), %ecx
> -# ifndef USE_AS_WMEMCMP
> - mov -4(%rdi), %eax
> - cmp %eax, %ecx
> -# else
> - cmp -4(%rdi), %ecx
> -# endif
> - jne L(diffin4bytes)
> -L(0bytes):
> - xor %eax, %eax
> - ret
> -
> -# ifndef USE_AS_WMEMCMP
> -/* unreal case for wmemcmp */
> - .p2align 4
> -L(65bytes):
> - movdqu -65(%rdi), %xmm1
> - movdqu -65(%rsi), %xmm2
> - mov $-65, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(49bytes):
> - movdqu -49(%rdi), %xmm1
> - movdqu -49(%rsi), %xmm2
> - mov $-49, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(33bytes):
> - movdqu -33(%rdi), %xmm1
> - movdqu -33(%rsi), %xmm2
> - mov $-33, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(17bytes):
> - mov -17(%rdi), %rax
> - mov -17(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> -L(9bytes):
> - mov -9(%rdi), %rax
> - mov -9(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - movzbl -1(%rdi), %eax
> - movzbl -1(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(13bytes):
> - mov -13(%rdi), %rax
> - mov -13(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - mov -8(%rdi), %rax
> - mov -8(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(5bytes):
> - mov -5(%rdi), %eax
> - mov -5(%rsi), %ecx
> - cmp %eax, %ecx
> - jne L(diffin4bytes)
> - movzbl -1(%rdi), %eax
> - movzbl -1(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(66bytes):
> - movdqu -66(%rdi), %xmm1
> - movdqu -66(%rsi), %xmm2
> - mov $-66, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(50bytes):
> - movdqu -50(%rdi), %xmm1
> - movdqu -50(%rsi), %xmm2
> - mov $-50, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(34bytes):
> - movdqu -34(%rdi), %xmm1
> - movdqu -34(%rsi), %xmm2
> - mov $-34, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(18bytes):
> - mov -18(%rdi), %rax
> - mov -18(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> -L(10bytes):
> - mov -10(%rdi), %rax
> - mov -10(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - movzwl -2(%rdi), %eax
> - movzwl -2(%rsi), %ecx
> - cmp %cl, %al
> - jne L(end)
> - and $0xffff, %eax
> - and $0xffff, %ecx
> - sub %ecx, %eax
> - ret
> -
> - .p2align 4
> -L(14bytes):
> - mov -14(%rdi), %rax
> - mov -14(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - mov -8(%rdi), %rax
> - mov -8(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(6bytes):
> - mov -6(%rdi), %eax
> - mov -6(%rsi), %ecx
> - cmp %eax, %ecx
> - jne L(diffin4bytes)
> -L(2bytes):
> - movzwl -2(%rsi), %ecx
> - movzwl -2(%rdi), %eax
> - cmp %cl, %al
> - jne L(end)
> - and $0xffff, %eax
> - and $0xffff, %ecx
> - sub %ecx, %eax
> - ret
> -
> - .p2align 4
> -L(67bytes):
> - movdqu -67(%rdi), %xmm2
> - movdqu -67(%rsi), %xmm1
> - mov $-67, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(51bytes):
> - movdqu -51(%rdi), %xmm2
> - movdqu -51(%rsi), %xmm1
> - mov $-51, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(35bytes):
> - movdqu -35(%rsi), %xmm1
> - movdqu -35(%rdi), %xmm2
> - mov $-35, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(19bytes):
> - mov -19(%rdi), %rax
> - mov -19(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> -L(11bytes):
> - mov -11(%rdi), %rax
> - mov -11(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - mov -4(%rdi), %eax
> - mov -4(%rsi), %ecx
> - cmp %eax, %ecx
> - jne L(diffin4bytes)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(15bytes):
> - mov -15(%rdi), %rax
> - mov -15(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - mov -8(%rdi), %rax
> - mov -8(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(7bytes):
> - mov -7(%rdi), %eax
> - mov -7(%rsi), %ecx
> - cmp %eax, %ecx
> - jne L(diffin4bytes)
> - mov -4(%rdi), %eax
> - mov -4(%rsi), %ecx
> - cmp %eax, %ecx
> - jne L(diffin4bytes)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(3bytes):
> - movzwl -3(%rdi), %eax
> - movzwl -3(%rsi), %ecx
> - cmp %eax, %ecx
> - jne L(diffin2bytes)
> -L(1bytes):
> - movzbl -1(%rdi), %eax
> - movzbl -1(%rsi), %ecx
> - sub %ecx, %eax
> - ret
> -# endif
> -
> - .p2align 4
> -L(68bytes):
> - movdqu -68(%rdi), %xmm2
> - movdqu -68(%rsi), %xmm1
> - mov $-68, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(52bytes):
> - movdqu -52(%rdi), %xmm2
> - movdqu -52(%rsi), %xmm1
> - mov $-52, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(36bytes):
> - movdqu -36(%rdi), %xmm2
> - movdqu -36(%rsi), %xmm1
> - mov $-36, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(20bytes):
> - movdqu -20(%rdi), %xmm2
> - movdqu -20(%rsi), %xmm1
> - mov $-20, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> - mov -4(%rsi), %ecx
> -
> -# ifndef USE_AS_WMEMCMP
> - mov -4(%rdi), %eax
> - cmp %eax, %ecx
> -# else
> - cmp -4(%rdi), %ecx
> -# endif
> - jne L(diffin4bytes)
> - xor %eax, %eax
> - ret
> -
> -# ifndef USE_AS_WMEMCMP
> -/* unreal cases for wmemcmp */
> - .p2align 4
> -L(69bytes):
> - movdqu -69(%rsi), %xmm1
> - movdqu -69(%rdi), %xmm2
> - mov $-69, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(53bytes):
> - movdqu -53(%rsi), %xmm1
> - movdqu -53(%rdi), %xmm2
> - mov $-53, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(37bytes):
> - movdqu -37(%rsi), %xmm1
> - movdqu -37(%rdi), %xmm2
> - mov $-37, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(21bytes):
> - movdqu -21(%rsi), %xmm1
> - movdqu -21(%rdi), %xmm2
> - mov $-21, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> - mov -8(%rdi), %rax
> - mov -8(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(70bytes):
> - movdqu -70(%rsi), %xmm1
> - movdqu -70(%rdi), %xmm2
> - mov $-70, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(54bytes):
> - movdqu -54(%rsi), %xmm1
> - movdqu -54(%rdi), %xmm2
> - mov $-54, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(38bytes):
> - movdqu -38(%rsi), %xmm1
> - movdqu -38(%rdi), %xmm2
> - mov $-38, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(22bytes):
> - movdqu -22(%rsi), %xmm1
> - movdqu -22(%rdi), %xmm2
> - mov $-22, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> - mov -8(%rdi), %rax
> - mov -8(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(71bytes):
> - movdqu -71(%rsi), %xmm1
> - movdqu -71(%rdi), %xmm2
> - mov $-71, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(55bytes):
> - movdqu -55(%rdi), %xmm2
> - movdqu -55(%rsi), %xmm1
> - mov $-55, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(39bytes):
> - movdqu -39(%rdi), %xmm2
> - movdqu -39(%rsi), %xmm1
> - mov $-39, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(23bytes):
> - movdqu -23(%rdi), %xmm2
> - movdqu -23(%rsi), %xmm1
> - mov $-23, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> - mov -8(%rdi), %rax
> - mov -8(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -# endif
> -
> - .p2align 4
> -L(72bytes):
> - movdqu -72(%rsi), %xmm1
> - movdqu -72(%rdi), %xmm2
> - mov $-72, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(56bytes):
> - movdqu -56(%rdi), %xmm2
> - movdqu -56(%rsi), %xmm1
> - mov $-56, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(40bytes):
> - movdqu -40(%rdi), %xmm2
> - movdqu -40(%rsi), %xmm1
> - mov $-40, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(24bytes):
> - movdqu -24(%rdi), %xmm2
> - movdqu -24(%rsi), %xmm1
> - mov $-24, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -
> - mov -8(%rsi), %rcx
> - mov -8(%rdi), %rax
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -
> -# ifndef USE_AS_WMEMCMP
> -/* unreal cases for wmemcmp */
> - .p2align 4
> -L(73bytes):
> - movdqu -73(%rsi), %xmm1
> - movdqu -73(%rdi), %xmm2
> - mov $-73, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(57bytes):
> - movdqu -57(%rdi), %xmm2
> - movdqu -57(%rsi), %xmm1
> - mov $-57, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(41bytes):
> - movdqu -41(%rdi), %xmm2
> - movdqu -41(%rsi), %xmm1
> - mov $-41, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(25bytes):
> - movdqu -25(%rdi), %xmm2
> - movdqu -25(%rsi), %xmm1
> - mov $-25, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> - mov -9(%rdi), %rax
> - mov -9(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - movzbl -1(%rdi), %eax
> - movzbl -1(%rsi), %ecx
> - sub %ecx, %eax
> - ret
> -
> - .p2align 4
> -L(74bytes):
> - movdqu -74(%rsi), %xmm1
> - movdqu -74(%rdi), %xmm2
> - mov $-74, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(58bytes):
> - movdqu -58(%rdi), %xmm2
> - movdqu -58(%rsi), %xmm1
> - mov $-58, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(42bytes):
> - movdqu -42(%rdi), %xmm2
> - movdqu -42(%rsi), %xmm1
> - mov $-42, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(26bytes):
> - movdqu -26(%rdi), %xmm2
> - movdqu -26(%rsi), %xmm1
> - mov $-26, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> - mov -10(%rdi), %rax
> - mov -10(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - movzwl -2(%rdi), %eax
> - movzwl -2(%rsi), %ecx
> - jmp L(diffin2bytes)
> -
> - .p2align 4
> -L(75bytes):
> - movdqu -75(%rsi), %xmm1
> - movdqu -75(%rdi), %xmm2
> - mov $-75, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(59bytes):
> - movdqu -59(%rdi), %xmm2
> - movdqu -59(%rsi), %xmm1
> - mov $-59, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(43bytes):
> - movdqu -43(%rdi), %xmm2
> - movdqu -43(%rsi), %xmm1
> - mov $-43, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(27bytes):
> - movdqu -27(%rdi), %xmm2
> - movdqu -27(%rsi), %xmm1
> - mov $-27, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> - mov -11(%rdi), %rax
> - mov -11(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - mov -4(%rdi), %eax
> - mov -4(%rsi), %ecx
> - cmp %eax, %ecx
> - jne L(diffin4bytes)
> - xor %eax, %eax
> - ret
> -# endif
> - .p2align 4
> -L(76bytes):
> - movdqu -76(%rsi), %xmm1
> - movdqu -76(%rdi), %xmm2
> - mov $-76, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(60bytes):
> - movdqu -60(%rdi), %xmm2
> - movdqu -60(%rsi), %xmm1
> - mov $-60, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(44bytes):
> - movdqu -44(%rdi), %xmm2
> - movdqu -44(%rsi), %xmm1
> - mov $-44, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(28bytes):
> - movdqu -28(%rdi), %xmm2
> - movdqu -28(%rsi), %xmm1
> - mov $-28, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> - mov -12(%rdi), %rax
> - mov -12(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - mov -4(%rsi), %ecx
> -# ifndef USE_AS_WMEMCMP
> - mov -4(%rdi), %eax
> - cmp %eax, %ecx
> -# else
> - cmp -4(%rdi), %ecx
> -# endif
> - jne L(diffin4bytes)
> - xor %eax, %eax
> - ret
> -
> -# ifndef USE_AS_WMEMCMP
> -/* unreal cases for wmemcmp */
> - .p2align 4
> -L(77bytes):
> - movdqu -77(%rsi), %xmm1
> - movdqu -77(%rdi), %xmm2
> - mov $-77, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(61bytes):
> - movdqu -61(%rdi), %xmm2
> - movdqu -61(%rsi), %xmm1
> - mov $-61, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(45bytes):
> - movdqu -45(%rdi), %xmm2
> - movdqu -45(%rsi), %xmm1
> - mov $-45, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(29bytes):
> - movdqu -29(%rdi), %xmm2
> - movdqu -29(%rsi), %xmm1
> - mov $-29, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -
> - mov -13(%rdi), %rax
> - mov -13(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> -
> - mov -8(%rdi), %rax
> - mov -8(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(78bytes):
> - movdqu -78(%rsi), %xmm1
> - movdqu -78(%rdi), %xmm2
> - mov $-78, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(62bytes):
> - movdqu -62(%rdi), %xmm2
> - movdqu -62(%rsi), %xmm1
> - mov $-62, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(46bytes):
> - movdqu -46(%rdi), %xmm2
> - movdqu -46(%rsi), %xmm1
> - mov $-46, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(30bytes):
> - movdqu -30(%rdi), %xmm2
> - movdqu -30(%rsi), %xmm1
> - mov $-30, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> - mov -14(%rdi), %rax
> - mov -14(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - mov -8(%rdi), %rax
> - mov -8(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(79bytes):
> - movdqu -79(%rsi), %xmm1
> - movdqu -79(%rdi), %xmm2
> - mov $-79, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(63bytes):
> - movdqu -63(%rdi), %xmm2
> - movdqu -63(%rsi), %xmm1
> - mov $-63, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(47bytes):
> - movdqu -47(%rdi), %xmm2
> - movdqu -47(%rsi), %xmm1
> - mov $-47, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(31bytes):
> - movdqu -31(%rdi), %xmm2
> - movdqu -31(%rsi), %xmm1
> - mov $-31, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> - mov -15(%rdi), %rax
> - mov -15(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - mov -8(%rdi), %rax
> - mov -8(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -# endif
> - .p2align 4
> -L(64bytes):
> - movdqu -64(%rdi), %xmm2
> - movdqu -64(%rsi), %xmm1
> - mov $-64, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(48bytes):
> - movdqu -48(%rdi), %xmm2
> - movdqu -48(%rsi), %xmm1
> - mov $-48, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -L(32bytes):
> - movdqu -32(%rdi), %xmm2
> - movdqu -32(%rsi), %xmm1
> - mov $-32, %dl
> - pxor %xmm1, %xmm2
> - ptest %xmm2, %xmm0
> - jnc L(less16bytes)
> -
> - mov -16(%rdi), %rax
> - mov -16(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> -
> - mov -8(%rdi), %rax
> - mov -8(%rsi), %rcx
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - xor %eax, %eax
> - ret
> -
> -/*
> - * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
> - */
> - .p2align 3
> -L(less16bytes):
> - movsbq %dl, %rdx
> - mov (%rsi, %rdx), %rcx
> - mov (%rdi, %rdx), %rax
> - cmp %rax, %rcx
> - jne L(diffin8bytes)
> - mov 8(%rsi, %rdx), %rcx
> - mov 8(%rdi, %rdx), %rax
> -L(diffin8bytes):
> - cmp %eax, %ecx
> - jne L(diffin4bytes)
> - shr $32, %rcx
> - shr $32, %rax
> -
> -# ifdef USE_AS_WMEMCMP
> -/* for wmemcmp */
> - cmp %eax, %ecx
> - jne L(diffin4bytes)
> - xor %eax, %eax
> - ret
> -# endif
> -
> -L(diffin4bytes):
> -# ifndef USE_AS_WMEMCMP
> - cmp %cx, %ax
> - jne L(diffin2bytes)
> - shr $16, %ecx
> - shr $16, %eax
> -L(diffin2bytes):
> - cmp %cl, %al
> - jne L(end)
> - and $0xffff, %eax
> - and $0xffff, %ecx
> - sub %ecx, %eax
> - ret
> -
> - .p2align 4
> -L(end):
> - and $0xff, %eax
> - and $0xff, %ecx
> - sub %ecx, %eax
> - ret
> -# else
> -
> -/* for wmemcmp */
> - mov $1, %eax
> - jl L(nequal_bigger)
> - neg %eax
> - ret
> -
> - .p2align 4
> -L(nequal_bigger):
> - ret
> -
> -L(unreal_case):
> - xor %eax, %eax
> - ret
> -# endif
> -
> -END (MEMCMP)
> -
> - .section .rodata.sse4.1,"a",@progbits
> - .p2align 3
> -# ifndef USE_AS_WMEMCMP
> -L(table_64bytes):
> - .int JMPTBL (L(0bytes), L(table_64bytes))
> - .int JMPTBL (L(1bytes), L(table_64bytes))
> - .int JMPTBL (L(2bytes), L(table_64bytes))
> - .int JMPTBL (L(3bytes), L(table_64bytes))
> - .int JMPTBL (L(4bytes), L(table_64bytes))
> - .int JMPTBL (L(5bytes), L(table_64bytes))
> - .int JMPTBL (L(6bytes), L(table_64bytes))
> - .int JMPTBL (L(7bytes), L(table_64bytes))
> - .int JMPTBL (L(8bytes), L(table_64bytes))
> - .int JMPTBL (L(9bytes), L(table_64bytes))
> - .int JMPTBL (L(10bytes), L(table_64bytes))
> - .int JMPTBL (L(11bytes), L(table_64bytes))
> - .int JMPTBL (L(12bytes), L(table_64bytes))
> - .int JMPTBL (L(13bytes), L(table_64bytes))
> - .int JMPTBL (L(14bytes), L(table_64bytes))
> - .int JMPTBL (L(15bytes), L(table_64bytes))
> - .int JMPTBL (L(16bytes), L(table_64bytes))
> - .int JMPTBL (L(17bytes), L(table_64bytes))
> - .int JMPTBL (L(18bytes), L(table_64bytes))
> - .int JMPTBL (L(19bytes), L(table_64bytes))
> - .int JMPTBL (L(20bytes), L(table_64bytes))
> - .int JMPTBL (L(21bytes), L(table_64bytes))
> - .int JMPTBL (L(22bytes), L(table_64bytes))
> - .int JMPTBL (L(23bytes), L(table_64bytes))
> - .int JMPTBL (L(24bytes), L(table_64bytes))
> - .int JMPTBL (L(25bytes), L(table_64bytes))
> - .int JMPTBL (L(26bytes), L(table_64bytes))
> - .int JMPTBL (L(27bytes), L(table_64bytes))
> - .int JMPTBL (L(28bytes), L(table_64bytes))
> - .int JMPTBL (L(29bytes), L(table_64bytes))
> - .int JMPTBL (L(30bytes), L(table_64bytes))
> - .int JMPTBL (L(31bytes), L(table_64bytes))
> - .int JMPTBL (L(32bytes), L(table_64bytes))
> - .int JMPTBL (L(33bytes), L(table_64bytes))
> - .int JMPTBL (L(34bytes), L(table_64bytes))
> - .int JMPTBL (L(35bytes), L(table_64bytes))
> - .int JMPTBL (L(36bytes), L(table_64bytes))
> - .int JMPTBL (L(37bytes), L(table_64bytes))
> - .int JMPTBL (L(38bytes), L(table_64bytes))
> - .int JMPTBL (L(39bytes), L(table_64bytes))
> - .int JMPTBL (L(40bytes), L(table_64bytes))
> - .int JMPTBL (L(41bytes), L(table_64bytes))
> - .int JMPTBL (L(42bytes), L(table_64bytes))
> - .int JMPTBL (L(43bytes), L(table_64bytes))
> - .int JMPTBL (L(44bytes), L(table_64bytes))
> - .int JMPTBL (L(45bytes), L(table_64bytes))
> - .int JMPTBL (L(46bytes), L(table_64bytes))
> - .int JMPTBL (L(47bytes), L(table_64bytes))
> - .int JMPTBL (L(48bytes), L(table_64bytes))
> - .int JMPTBL (L(49bytes), L(table_64bytes))
> - .int JMPTBL (L(50bytes), L(table_64bytes))
> - .int JMPTBL (L(51bytes), L(table_64bytes))
> - .int JMPTBL (L(52bytes), L(table_64bytes))
> - .int JMPTBL (L(53bytes), L(table_64bytes))
> - .int JMPTBL (L(54bytes), L(table_64bytes))
> - .int JMPTBL (L(55bytes), L(table_64bytes))
> - .int JMPTBL (L(56bytes), L(table_64bytes))
> - .int JMPTBL (L(57bytes), L(table_64bytes))
> - .int JMPTBL (L(58bytes), L(table_64bytes))
> - .int JMPTBL (L(59bytes), L(table_64bytes))
> - .int JMPTBL (L(60bytes), L(table_64bytes))
> - .int JMPTBL (L(61bytes), L(table_64bytes))
> - .int JMPTBL (L(62bytes), L(table_64bytes))
> - .int JMPTBL (L(63bytes), L(table_64bytes))
> - .int JMPTBL (L(64bytes), L(table_64bytes))
> - .int JMPTBL (L(65bytes), L(table_64bytes))
> - .int JMPTBL (L(66bytes), L(table_64bytes))
> - .int JMPTBL (L(67bytes), L(table_64bytes))
> - .int JMPTBL (L(68bytes), L(table_64bytes))
> - .int JMPTBL (L(69bytes), L(table_64bytes))
> - .int JMPTBL (L(70bytes), L(table_64bytes))
> - .int JMPTBL (L(71bytes), L(table_64bytes))
> - .int JMPTBL (L(72bytes), L(table_64bytes))
> - .int JMPTBL (L(73bytes), L(table_64bytes))
> - .int JMPTBL (L(74bytes), L(table_64bytes))
> - .int JMPTBL (L(75bytes), L(table_64bytes))
> - .int JMPTBL (L(76bytes), L(table_64bytes))
> - .int JMPTBL (L(77bytes), L(table_64bytes))
> - .int JMPTBL (L(78bytes), L(table_64bytes))
> - .int JMPTBL (L(79bytes), L(table_64bytes))
> -# else
> -L(table_64bytes):
> - .int JMPTBL (L(0bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(4bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(8bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(12bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(16bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(20bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(24bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(28bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(32bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(36bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(40bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(44bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(48bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(52bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(56bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(60bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(64bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(68bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(72bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(76bytes), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> - .int JMPTBL (L(unreal_case), L(table_64bytes))
> -# endif
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
> index f8b4636..5d87a17 100644
> --- a/sysdeps/x86_64/multiarch/memcmp.S
> +++ b/sysdeps/x86_64/multiarch/memcmp.S
> @@ -29,33 +29,28 @@ ENTRY(memcmp)
> cmpl $0, KIND_OFFSET+__cpu_features(%rip)
> jne 1f
> call __init_cpu_features
> -
> -1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
> jnz 2f
> - leaq __memcmp_sse2(%rip), %rax
> - ret
> -
> -2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
> - jz 3f
> - leaq __memcmp_sse4_1(%rip), %rax
> +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> + jnz 3f
> +2: leaq __memcmp_sse2_unaligned(%rip), %rax
> ret
>
> 3: leaq __memcmp_ssse3(%rip), %rax
> ret
> -
> END(memcmp)
>
> # undef ENTRY
> # define ENTRY(name) \
> - .type __memcmp_sse2, @function; \
> + .type __memcmp_sse2_unaligned, @function; \
> .p2align 4; \
> - .globl __memcmp_sse2; \
> - .hidden __memcmp_sse2; \
> - __memcmp_sse2: cfi_startproc; \
> + .globl __memcmp_sse2_unaligned; \
> + .hidden __memcmp_sse2_unaligned; \
> + __memcmp_sse2_unaligned: cfi_startproc; \
> CALL_MCOUNT
> # undef END
> # define END(name) \
> - cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
> + cfi_endproc; .size __memcmp_sse2_unaligned, .-__memcmp_sse2_unaligned
>
> # ifdef SHARED
> # undef libc_hidden_builtin_def
> @@ -63,7 +58,7 @@ END(memcmp)
> they will be called without setting up EBX needed for PLT which is
> used by IFUNC. */
> # define libc_hidden_builtin_def(name) \
> - .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
> + .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2_unaligned
> # endif
> #endif
>
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> index 695a236..5dd8d44 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> @@ -201,6 +201,10 @@ L(prepare_loop):
> movdqu %xmm2, 96(%rdi)
> movdqu %xmm3, 112(%rdi)
>
> +#ifdef USE_AVX2
> + vpxor %xmm5, %xmm5, %xmm5
> +#endif
> +
> subq %rsi, %rdi
> add $64, %rsi
> andq $-64, %rsi
> @@ -348,10 +352,13 @@ L(cross_loop):
> sub $1, %rcx
> ja L(cross_loop)
>
> +#ifdef USE_AVX2
> + vpxor %xmm5, %xmm5, %xmm5
> +#else
> pxor %xmm5, %xmm5
> pxor %xmm6, %xmm6
> pxor %xmm7, %xmm7
> -
> +#endif
> lea -64(%rsi), %rdx
> andq $-64, %rdx
> addq %rdx, %rdi
> diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
> deleted file mode 100644
> index b07973a..0000000
> --- a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_WMEMCMP 1
> -#define MEMCMP __wmemcmp_sse4_1
> -
> -#include "memcmp-sse4.S"
> diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
> index 109e245..dabd3ed 100644
> --- a/sysdeps/x86_64/multiarch/wmemcmp.S
> +++ b/sysdeps/x86_64/multiarch/wmemcmp.S
> @@ -30,18 +30,16 @@ ENTRY(wmemcmp)
> jne 1f
> call __init_cpu_features
>
> -1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
> jnz 2f
> - leaq __wmemcmp_sse2(%rip), %rax
> - ret
> -
> -2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
> - jz 3f
> - leaq __wmemcmp_sse4_1(%rip), %rax
> +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> + jnz 3f
> +2: leaq __wmemcmp_sse2_unaligned(%rip), %rax
> ret
>
> 3: leaq __wmemcmp_ssse3(%rip), %rax
> ret
>
> +
> END(wmemcmp)
> #endif
> --
> 1.8.4.rc3
--
Write-only-memory subsystem too slow for this machine. Contact your local dealer.