diff --git a/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S index 84d92a8..abeea22 100644 --- a/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S +++ b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S @@ -54,7 +54,6 @@ ENTRY (__wcscpy_ssse3) PUSH (%edi) mov %edx, %edi - PUSH (%esi) lea 16(%ecx), %esi @@ -220,7 +219,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 28(%ecx), %xmm2 @@ -228,15 +226,14 @@ L(Shl4Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %eax, %eax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%edx) movaps 28(%ecx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%edx), %edx @@ -248,7 +245,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 28(%ecx), %xmm2 @@ -256,13 +252,11 @@ L(Shl4Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 test %eax, %eax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 28(%ecx), %ecx lea 16(%edx), %edx @@ -305,14 +299,13 @@ L(Shl4LoopStart): jmp L(Shl4LoopStart) L(Shl4LoopExit): - movaps (%edx), %xmm6 - psrldq $12, %xmm6 - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%edx) + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) + POP (%esi) add $12, %edx add $12, %ecx - - POP (%esi) test %al, %al jz L(ExitHigh) test $0x01, %al @@ -337,7 +330,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 24(%ecx), %xmm2 @@ -345,15 +337,14 @@ L(Shl8Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %eax, %eax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%edx) movaps 24(%ecx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%edx), %edx @@ -365,7 +356,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 24(%ecx), %xmm2 @@ -373,13 +363,11 @@ L(Shl8Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 test %eax, %eax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 24(%ecx), %ecx lea 16(%edx), %edx @@ -422,14 +410,11 @@ L(Shl8LoopStart): jmp L(Shl8LoopStart) L(Shl8LoopExit): - movaps (%edx), %xmm6 - psrldq $8, %xmm6 - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%edx) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + POP (%esi) add $8, %edx add $8, %ecx - - POP (%esi) test %al, %al jz L(ExitHigh) test $0x01, %al @@ -454,7 +439,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 20(%ecx), %xmm2 @@ -462,15 +446,14 @@ L(Shl12Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %eax, %eax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%edx) movaps 20(%ecx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%edx), %edx @@ -482,7 +465,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 20(%ecx), %xmm2 @@ -490,13 +472,11 @@ L(Shl12Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 test %eax, %eax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 20(%ecx), %ecx lea 16(%edx), %edx @@ -539,11 +519,9 @@ L(Shl12LoopStart): jmp L(Shl12LoopStart) L(Shl12LoopExit): - movaps (%edx), %xmm6 - psrldq $4, %xmm6 + movl (%ecx), %esi + movl %esi, (%edx) mov $4, %esi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%edx) .p2align 4 L(CopyFrom1To16Bytes): @@ -555,6 +533,7 @@ L(CopyFrom1To16Bytes): jz L(ExitHigh) test $0x01, %al jnz L(Exit4) +L(Exit8): movlpd (%ecx), %xmm0 movlpd %xmm0, (%edx) movl %edi, %eax @@ -564,6 +543,7 @@ L(CopyFrom1To16Bytes): L(ExitHigh): test $0x01, %ah jnz L(Exit12) +L(Exit16): movdqu (%ecx), %xmm0 movdqu %xmm0, (%edx) movl %edi, %eax