diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 6d9951e..028c6d3 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -34,6 +34,9 @@ ENTRY (STRCAT)
 	mov	%rdx, %r8
 # endif
 
+/* Inline corresponding strlen file, temporary until new strcpy
+   implementation gets merged.  */
+
 	xor	%rax, %rax
 	mov	%edi, %ecx
 	and	$0x3f, %ecx
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
index 901e66f..8101b91 100644
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
@@ -33,6 +33,10 @@ ENTRY (STRCAT)
 	mov	%rdx, %r8
 # endif
+
+/* Inline corresponding strlen file, temporary until new strcpy
+   implementation gets merged.  */
+
 	xor	%eax, %eax
 	cmpb	$0, (%rdi)
 	jz	L(exit_tail0)
 
diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
index 287ffd2..8bea6fb 100644
--- a/sysdeps/x86_64/strcat.S
+++ b/sysdeps/x86_64/strcat.S
@@ -21,6 +21,7 @@
 
 #include <sysdep.h>
 #include "asm-syntax.h"
 
+/* Will be removed when new strcpy implementation gets merged.  */
 	.text
 ENTRY (strcat)
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index e82fe8d..6abb3f0 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
 /* SSE2 version of strlen.
-   Copyright (C) 2012, 2013 Free Software Foundation, Inc.
+   Copyright (C) 2012-2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -18,12 +18,11 @@
 
 #include <sysdep.h>
 
-/* Long lived register are
-	strlen(s), strnlen(s, n):
+/* Long lived register in strlen(s), strnlen(s, n) are:
 
 	%xmm11 - zero
 	%rdi   - s
-	%r10  (s+n) & (~(64-1))
+	%r10   (s+n) & (~(64-1))
 	%r11   s+n
 */
 
@@ -31,6 +30,7 @@
 	.text
 ENTRY(strlen)
 
+/* Test 64 bytes from %rax for zero.  Save result as bitmask in %rdx.  */
 #define FIND_ZERO	\
 	pcmpeqb	(%rax), %xmm8;	\
 	pcmpeqb	16(%rax), %xmm9;	\
@@ -73,9 +73,10 @@ L(n_nonzero):
 /* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
 	cmpq	$4047, %rcx
 /* We cannot unify this branching as it would be ~6 cycles slower.  */
-	ja	L(next)
+	ja	L(cross_page)
 
 #ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes.  */
 # define STRNLEN_PROLOG	\
 	mov	%r11, %rsi;	\
 	subq	%rax, %rsi;	\
@@ -86,6 +87,7 @@ L(n_nonzero):
 # define STRNLEN_PROLOG	andq $-64, %rax;
 #endif
 
+/* Ignore bits in mask that come before start of string.  */
#define PROLOG(lab)	\
 	movq	%rdi, %rcx;	\
 	xorq	%rax, %rcx;	\
@@ -100,34 +102,37 @@ L(n_nonzero):
 	andq	$-16, %rax
 	FIND_ZERO
 #else
+	/* Test first 16 bytes unaligned.  */
 	movdqu	(%rax), %xmm12
 	pcmpeqb	%xmm8, %xmm12
 	pmovmskb	%xmm12, %edx
 	test	%edx, %edx
 	je	L(next48_bytes)
-	bsfq	%rdx, %rax
+	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
 	ret
 
 L(next48_bytes):
 /* Same as FIND_ZERO except we do not check first 16 bytes.  */
 	andq	$-16, %rax
-	pcmpeqb	16(%rax), %xmm9;
-	pcmpeqb	32(%rax), %xmm10;
-	pcmpeqb	48(%rax), %xmm11;
-	pmovmskb	%xmm9, %edx;
-	pmovmskb	%xmm10, %r8d;
-	pmovmskb	%xmm11, %ecx;
-	salq	$16, %rdx;
-	salq	$16, %rcx;
-	orq	%r8, %rcx;
-	salq	$32, %rcx;
-	orq	%rcx, %rdx;
+	pcmpeqb	16(%rax), %xmm9
+	pcmpeqb	32(%rax), %xmm10
+	pcmpeqb	48(%rax), %xmm11
+	pmovmskb	%xmm9, %edx
+	pmovmskb	%xmm10, %r8d
+	pmovmskb	%xmm11, %ecx
+	salq	$16, %rdx
+	salq	$16, %rcx
+	orq	%r8, %rcx
+	salq	$32, %rcx
+	orq	%rcx, %rdx
 #endif
 
-	PROLOG(loop)
+	/* When no zero byte is found xmm9-11 are zero so we do not have to
+	   zero them.  */
+	PROLOG(loop)
 
 	.p2align 4
-L(next):
+L(cross_page):
 	andq	$-64, %rax
 	FIND_ZERO
 	PROLOG(loop_init)
@@ -168,7 +173,7 @@ L(loop):
 	.p2align 4
 L(exit_end):
 	cmp	%rax, %r11
-	je	L(first)
+	je	L(first) /* Do not read when end is at page boundary.  */
 	pxor	%xmm8, %xmm8
 	FIND_ZERO
 
@@ -190,6 +195,8 @@ L(exit):
 	ret
 
 #else
+
+	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
 	.p2align 4
 L(loop):
 
@@ -229,6 +236,4 @@ L(exit0):
 #endif
 
 END(strlen)
-#ifndef AS_STRLEN
 libc_hidden_builtin_def (strlen)
-#endif
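
Not part of the patch: a minimal C sketch of the technique that the new FIND_ZERO and
PROLOG comments describe, for readers who prefer intrinsics to assembly.  The helper
name first_zero_in_block, the SSE2 intrinsics, and the single-block assumption are all
illustrative; the real code keeps scanning further 64-byte blocks until a zero byte
appears.

/* Hedged sketch of the FIND_ZERO + PROLOG trick, assuming SSE2 and a
   GCC/Clang compiler for __builtin_ctzll.  Illustrative only.  */
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

static size_t
first_zero_in_block (const char *s)
{
  /* andq $-64, %rax: round s down to a 64-byte boundary.  An aligned
     64-byte block never crosses a page, so reading before s cannot fault.  */
  const char *block = (const char *) ((uintptr_t) s & ~(uintptr_t) 63);
  const __m128i zero = _mm_setzero_si128 ();

  /* FIND_ZERO: pcmpeqb marks zero bytes, pmovmskb compresses each 16-byte
     chunk to a 16-bit mask; merge the four masks into one 64-bit mask whose
     bit i corresponds to block[i].  */
  uint64_t m0 = (uint64_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) block), zero));
  uint64_t m1 = (uint64_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (block + 16)), zero));
  uint64_t m2 = (uint64_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (block + 32)), zero));
  uint64_t m3 = (uint64_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (block + 48)), zero));
  uint64_t mask = m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);

  /* PROLOG: ignore bits in mask that come before start of string by
     shifting them out.  After the shift, bit i corresponds to s[i].  */
  mask >>= (uintptr_t) s & 63;

  /* bsf: index of the lowest set bit is the length.  This sketch assumes
     the terminator lies in the first block (mask != 0); the real code
     branches back into the loop when it does not.  */
  return (size_t) __builtin_ctzll (mask);
}

Rounding down to a 64-byte boundary is also what makes the L(cross_page) path above
safe: the over-read stays inside the page containing s, so the block can be rescanned
after realignment without faulting.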