This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
A patch got accidentally filtered, so here is the correct version. Hi, I looked at the rawmemchr implementation and it can be improved by using a similar header to the one used in strchr. The loop itself was well optimized, so we only gain around 20 cycles per call for sizes from 64 bytes. The results show that this is an improvement for unit tests, but I did not find a program that calls rawmemchr yet. http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile.html A benchmark is at http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile160813.tar.bz2 Passes tests, OK to commit? * sysdeps/x86_64/rawmemchr.S (rawmemchr): Optimize implementation. --- sysdeps/x86_64/rawmemchr.S | 252 +++++++++++++++------------------------------ 1 file changed, 85 insertions(+), 167 deletions(-) diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S index f4d5591..93b8f4f 100644 --- a/sysdeps/x86_64/rawmemchr.S +++ b/sysdeps/x86_64/rawmemchr.S @@ -22,185 +22,103 @@ .text ENTRY (rawmemchr) - movd %rsi, %xmm1 - mov %rdi, %rcx - - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - and $63, %rcx + movd %esi, %xmm1 + movq %rdi, %rax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpl $4031, %eax + punpcklwd %xmm1, %xmm1 pshufd $0, %xmm1, %xmm1 - - cmp $48, %rcx - ja L(crosscache) - + jg L(cross_page) movdqu (%rdi), %xmm0 pcmpeqb %xmm1, %xmm0 -/* Check if there is a match. */ - pmovmskb %xmm0, %eax - test %eax, %eax - - jnz L(matches) - add $16, %rdi - and $-16, %rdi - jmp L(loop_prolog) - - .p2align 4 -L(crosscache): - and $15, %rcx - and $-16, %rdi - movdqa (%rdi), %xmm0 - - pcmpeqb %xmm1, %xmm0 -/* Check if there is a match. */ - pmovmskb %xmm0, %eax -/* Remove the leading bytes. */ - sar %cl, %eax + pmovmskb %xmm0, %eax test %eax, %eax - je L(unaligned_no_match) -/* Check which byte is a match. 
*/ - bsf %eax, %eax - - add %rdi, %rax - add %rcx, %rax - ret - - .p2align 4 -L(unaligned_no_match): - add $16, %rdi - - .p2align 4 -L(loop_prolog): - movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 + jne L(finish) + movdqu 16(%rdi), %xmm3 + movdqu 32(%rdi), %xmm2 pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - add $64, %rdi - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - test $0x3f, %rdi - jz L(align64_loop) - - movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 + movdqu 48(%rdi), %xmm0 pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - - add $64, %rdi - test %eax, %eax - jnz L(matches0) - - and $-64, %rdi - - .p2align 4 -L(align64_loop): - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - + pmovmskb %xmm3, %edx pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm2, %eax + pmovmskb %xmm0, %ecx + salq $16, %rdx + salq $32, %rax + orq %rdx, %rax + movq %rcx, %rdx + salq $48, %rdx + orq %rdx, %rax + jne L(finish) +L(align): + andq $-64, %rdi + .p2align 4 +L(loop64): + movdqa 64(%rdi), %xmm5 + movdqa 80(%rdi), %xmm4 + pcmpeqb %xmm1, %xmm5 + movdqa 96(%rdi), %xmm3 pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm0, %xmm3 - pmaxub %xmm2, %xmm4 - pmaxub %xmm3, %xmm4 - pmovmskb %xmm4, %eax - - add $64, %rdi - - test %eax, %eax - jz L(align64_loop) - - sub $64, %rdi - - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - 
pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 + movdqa 112(%rdi), %xmm2 pcmpeqb %xmm1, %xmm3 - - pcmpeqb 48(%rdi), %xmm1 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - pmovmskb %xmm1, %eax - bsf %eax, %eax - lea 48(%rdi, %rax), %rax - ret - - .p2align 4 -L(matches0): - bsf %eax, %eax - lea -16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches): - bsf %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsf %eax, %eax - lea 16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches32): - bsf %eax, %eax - lea 32(%rax, %rdi), %rax + pmaxub %xmm4, %xmm5 + pcmpeqb %xmm1, %xmm2 + pmaxub %xmm3, %xmm5 + pmaxub %xmm2, %xmm5 + addq $64, %rdi + pmovmskb %xmm5, %eax + testl %eax, %eax + je L(loop64) + + movdqa (%rdi), %xmm5 + pcmpeqb %xmm1, %xmm5 + pmovmskb %xmm5, %ecx + pmovmskb %xmm4, %eax + pmovmskb %xmm3, %esi + pmovmskb %xmm2, %edx + salq $32, %rsi + salq $16, %rax + salq $48, %rdx + orq %rsi, %rax + orq %rcx, %rax + orq %rdx, %rax +L(finish): + bsfq %rax, %rax + addq %rdi, %rax ret + .p2align 4,,10 + .p2align 3 +L(cross_page): - .p2align 4 -L(return_null): - xor %rax, %rax + movq %rdi, %rax + andq $-64, %rax + movdqa (%rax), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %esi + movdqa 16(%rax), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + movdqa 32(%rax), %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $16, %rdx + pmovmskb %xmm0, %r8d + movdqa 48(%rax), %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $32, %r8 + orq %r8, %rdx + pmovmskb %xmm0, %ecx + orq %rsi, %rdx + salq $48, %rcx + orq %rcx, %rdx + movl %edi, %ecx + subl %eax, %ecx + shrq %cl, %rdx + testq %rdx, %rdx + je L(align) + bsfq %rdx, %rax + addq %rdi, %rax ret - END (rawmemchr) strong_alias (rawmemchr, __rawmemchr) -- 1.8.3.2
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |