 sysdeps/x86_64/multiarch/memcpy-ssse3.S  | 48 +++++++++++++++---------------
 sysdeps/x86_64/multiarch/memmove-ssse3.S |  8 ++--
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 9a878d3..747d2ef 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -61,18 +61,15 @@ ENTRY (MEMCPY)
 #ifdef USE_AS_MEMPCPY
 	add	%rdx, %rax
 #endif
-
-#ifdef USE_AS_MEMMOVE
-	cmp	%rsi, %rdi
-	jb	L(copy_forward)
-	je	L(write_0bytes)
-	cmp	$79, %rdx
-	jbe	L(copy_forward)
-	jmp	L(copy_backward)
-L(copy_forward):
-#endif
-	cmp	$79, %rdx
-	lea	L(table_less_80bytes)(%rip), %r11
+	/*
+	 * The small cases we can do without checking for any
+	 * overlap at all, since we do them as all loads followed
+	 * by all stores.
+	 *
+	 * So just jump through the less-than-80bytes table.
+	 */
+	cmp	$79,%rdx
+	lea	L(table_less_80bytes)(%rip), %r11
 	ja	L(80bytesormore)
 	movslq	(%r11, %rdx, 4), %r9
 	add	%rdx, %rsi
@@ -81,13 +78,24 @@ L(copy_forward):
 	jmp	*%r9
 	ud2
 
-	ALIGN (4)
+	/*
+	 * For the 80+ byte cases we need to check overlap
+	 */
 L(80bytesormore):
-#ifndef USE_AS_MEMMOVE
+	lea	(%rsi,%rdx),%r9
+	lea	(%rdi,%rdx),%r11
+	cmp	%rdi,%r9		/* dest start >= source end */
+	jae	L(nonoverlap)		/* -> nonoverlapping */
+	cmp	%rsi,%r11		/* source start >= destination end */
+	jae	L(nonoverlap)		/* -> nonoverlapping */
+	cmp	%rsi, %rdi		/* overlap: */
+	jb	L(copy_forward)		/* source < dest: forward copy */
+	je	L(write_0bytes)		/* source == dest: no copy */
+	jmp	L(copy_backward)	/* source > dest: backward copy */
+L(nonoverlap):
 	cmp	%dil, %sil
 	jle	L(copy_backward)
-#endif
-
+L(copy_forward):
 	movdqu	(%rsi), %xmm0
 	mov	%rdi, %rcx
 	and	$-16, %rdi
@@ -2805,7 +2813,6 @@ L(large_page_fwd):
 	movntdq	%xmm1, (%rdi)
 	lea	16(%rdi), %rdi
 	lea	-0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
 	mov	%rsi, %r9
 	sub	%rdi, %r9
 	cmp	%rdx, %r9
@@ -2814,7 +2821,6 @@ L(large_page_fwd):
 	cmp	%rcx, %rdx
 	jb	L(ll_cache_copy_fwd_start)
 L(memmove_is_memcpy_fwd):
-#endif
 L(large_page_loop):
 	movdqu	(%rsi), %xmm0
 	movdqu	0x10(%rsi), %xmm1
@@ -2859,7 +2865,6 @@ L(large_page_less_64bytes):
 	sfence
 	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
 
-#ifdef USE_AS_MEMMOVE
 	ALIGN (4)
 L(ll_cache_copy_fwd_start):
 	prefetcht0 0x1c0(%rsi)
@@ -2906,7 +2911,6 @@ L(large_page_ll_less_fwd_64bytes):
 	add	%rdx, %rdi
 	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
 
-#endif
 	ALIGN (4)
 L(large_page_bwd):
 	movdqu	-0x10(%rsi), %xmm1
@@ -2915,7 +2919,6 @@ L(large_page_bwd):
 	movdqa	%xmm1, -0x10(%rdi)
 	lea	-16(%rdi), %rdi
 	lea	-0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
 	mov	%rdi, %r9
 	sub	%rsi, %r9
 	cmp	%rdx, %r9
@@ -2923,7 +2926,6 @@ L(large_page_bwd):
 	cmp	%rcx, %r9
 	jb	L(ll_cache_copy_bwd_start)
 L(memmove_is_memcpy_bwd):
-#endif
 L(large_page_bwd_loop):
 	movdqu	-0x10(%rsi), %xmm0
 	movdqu	-0x20(%rsi), %xmm1
@@ -2966,7 +2968,6 @@ L(large_page_less_bwd_64bytes):
 	sfence
 	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
 
-#ifdef USE_AS_MEMMOVE
 	ALIGN (4)
 L(ll_cache_copy_bwd_start):
 	prefetcht0 -0x1c0(%rsi)
@@ -3010,7 +3011,6 @@ L(ll_cache_copy_bwd_start):
 	sub	$0x40, %rdx
 L(large_page_ll_less_bwd_64bytes):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-#endif
 
 END (MEMCPY)
 
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 295430b..a38d445 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -1,4 +1,4 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_ssse3
-#define MEMCPY_CHK	__memmove_chk_ssse3
-#include "memcpy-ssse3.S"
+#include
+
+strong_alias(__memmove_ssse3, __memcpy_ssse3);
+strong_alias(__memmove_chk_ssse3, __memcpy_chk_ssse3);
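
To make the intent of the new L(80bytesormore) prologue easier to follow, here is a rough C sketch of the same decision. It is not part of the patch: the function name and the byte-at-a-time loops are purely illustrative stand-ins for the SSSE3 copy paths. Disjoint ranges, and the overlapping case where dst sits below src, take the forward path; only an overlapping dst above src forces a backward copy.

#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar model of the overlap check the patched
   L(80bytesormore) path performs before picking a copy direction.
   The real code copies with SSSE3 loads/stores, not byte loops,
   and this helper name is made up for the example.  */
static void *
copy_checking_overlap (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;
  uintptr_t dstart = (uintptr_t) d;
  uintptr_t sstart = (uintptr_t) s;

  if (dstart >= sstart + n	/* dst starts at or after end of src */
      || sstart >= dstart + n	/* src starts at or after end of dst */
      || dstart < sstart)	/* overlap, but dst is below src */
    {
      /* A forward copy never overwrites source bytes it still needs.  */
      for (size_t i = 0; i < n; i++)
	d[i] = s[i];
    }
  else if (dstart > sstart)
    {
      /* Overlap with dst above src: copy backward so the tail of
	 src is read before it is clobbered.  */
      for (size_t i = n; i > 0; i--)
	d[i - 1] = s[i - 1];
    }
  /* dstart == sstart: nothing to copy.  */
  return dst;
}

The sub-80-byte table entries skip this check entirely because they issue all of their loads before any store, so they are overlap-safe by construction, which is what the comment added in the first hunk spells out.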