diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S index c4ec54c..b104765 100644 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -29,6 +29,7 @@ .section .text.ssse3,"ax",@progbits ENTRY (STRCPY) + mov %rsi, %rcx # ifdef USE_AS_STRNCPY mov %rdx, %r8 @@ -39,7 +40,7 @@ ENTRY (STRCPY) jz L(Exit0) cmp $8, %r8 jbe L(StrncpyExit8Bytes) -# endif +# endif cmpb $0, (%rcx) jz L(Exit1) cmpb $0, 1(%rcx) @@ -56,10 +57,10 @@ ENTRY (STRCPY) jz L(Exit7) cmpb $0, 7(%rcx) jz L(Exit8) -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY cmp $16, %r8 jb L(StrncpyExit15Bytes) -# endif +# endif cmpb $0, 8(%rcx) jz L(Exit9) cmpb $0, 9(%rcx) @@ -74,10 +75,10 @@ ENTRY (STRCPY) jz L(Exit14) cmpb $0, 14(%rcx) jz L(Exit15) -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY cmp $16, %r8 je L(Exit16) -# endif +# endif cmpb $0, 15(%rcx) jz L(Exit16) # endif @@ -87,25 +88,15 @@ ENTRY (STRCPY) sub $16, %r8 and $0xf, %rsi -/* add 16 bytes rcx_shift to r8 */ +/* add 16 bytes rcx_offset to r8 */ + add %rsi, %r8 # endif lea 16(%rcx), %rsi -/* Now: - rsi = alignment_16(rcx) + rcx_shift + 16; - rcx_shift = rcx - alignment_16(rcx) -*/ and $-16, %rsi -/* Now: - rsi = alignment_16(rcx) + 16 -*/ pxor %xmm0, %xmm0 mov (%rcx), %r9 mov %r9, (%rdx) -/* - look if there is zero symbol in next 16 bytes of string - from rsi to rsi + 15 and form mask in xmm0 -*/ pcmpeqb (%rsi), %xmm0 mov 8(%rcx), %r9 mov %r9, 8(%rdx) @@ -115,10 +106,6 @@ ENTRY (STRCPY) pmovmskb %xmm0, %rax sub %rcx, %rsi -/* rsi = 16 - rcx_shift */ - -/* rax = 0: there isn't end of string from position rsi to rsi+15 */ - # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(CopyFrom1To16BytesCase2OrCase3) @@ -128,17 +115,9 @@ ENTRY (STRCPY) mov %rdx, %rax lea 16(%rdx), %rdx -/* Now: - rdx = rdx + 16 = alignment_16(rdx) + rdx_shift + 16 -*/ and $-16, %rdx - -/* Now: rdx = alignment_16(rdx) + 16 */ - sub %rdx, %rax -/* Now: rax = rdx_shift - 16 */ - # ifdef USE_AS_STRNCPY add %rax, %rsi lea -1(%rsi), %rsi @@ -150,22 +129,11 @@ ENTRY (STRCPY) L(ContinueCopy): # endif sub %rax, %rcx -/* Now: - case rcx_shift >= rdx_shift: - rcx = alignment_16(rcx) + (rcx_shift - rdx_shift) + 16 - case rcx_shift < rdx_shift: - rcx = alignment_16(rcx) + (16 + rcx_shift - rdx_shift) -*/ mov %rcx, %rax and $0xf, %rax -/* Now: - case rcx_shift >= rdx_shift: rax = rcx_shift - rdx_shift - case rcx_shift < rdx_shift: rax = (16 + rcx_shift - rdx_shift) - rax can be 0, 1, ..., 15 -*/ mov $0, %rsi -/* case: rcx_shift == rdx_shift */ +/* case: rcx_offset == rdx_offset */ jz L(Align16Both) @@ -282,10 +250,11 @@ L(Align16Both): sub %rcx, %rax sub %rax, %rdx # ifdef USE_AS_STRNCPY - lea 48+64(%r8, %rax), %r8 + lea 112(%r8, %rax), %r8 # endif mov $-0x40, %rsi + .p2align 4 L(Aligned64Loop): movaps (%rcx), %xmm2 movaps %xmm2, %xmm4 @@ -366,7 +335,6 @@ L(Shl1Start): jnz L(Shl1LoopExit) palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 31(%rcx), %xmm2 @@ -374,7 +342,7 @@ L(Shl1Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit1Case2OrCase3) @@ -382,10 +350,9 @@ L(Shl1Start): test %rax, %rax jnz L(Shl1LoopExit) - palignr $1, %xmm1, %xmm2 + palignr $1, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 31(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -400,7 +367,6 @@ L(Shl1Start): jnz L(Shl1LoopExit) palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 31(%rcx), %xmm2 @@ -408,7 +374,6 @@ L(Shl1Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit1Case2OrCase3) @@ -416,8 +381,7 @@ L(Shl1Start): test %rax, %rax jnz L(Shl1LoopExit) - palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $1, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 31(%rcx), %rcx lea 16(%rdx), %rdx @@ -432,6 +396,8 @@ L(Shl1Start): # endif movaps -1(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl1LoopStart): movaps 15(%rcx), %xmm2 movaps 31(%rcx), %xmm3 @@ -465,11 +431,9 @@ L(Shl1LoopStart): jmp L(Shl1LoopStart) L(Shl1LoopExit): - movaps (%rdx), %xmm6 - psrldq $15, %xmm6 + movdqu -1(%rcx), %xmm1 mov $15, %rsi - palignr $1, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -1(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -488,7 +452,6 @@ L(Shl2Start): jnz L(Shl2LoopExit) palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 30(%rcx), %xmm2 @@ -496,7 +459,7 @@ L(Shl2Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit2Case2OrCase3) @@ -504,10 +467,9 @@ L(Shl2Start): test %rax, %rax jnz L(Shl2LoopExit) - palignr $2, %xmm1, %xmm2 + palignr $2, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 30(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -522,7 +484,6 @@ L(Shl2Start): jnz L(Shl2LoopExit) palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 30(%rcx), %xmm2 @@ -530,7 +491,6 @@ L(Shl2Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit2Case2OrCase3) @@ -538,8 +498,7 @@ L(Shl2Start): test %rax, %rax jnz L(Shl2LoopExit) - palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $2, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 30(%rcx), %rcx lea 16(%rdx), %rdx @@ -554,6 +513,8 @@ L(Shl2Start): # endif movaps -2(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl2LoopStart): movaps 14(%rcx), %xmm2 movaps 30(%rcx), %xmm3 @@ -587,11 +548,9 @@ L(Shl2LoopStart): jmp L(Shl2LoopStart) L(Shl2LoopExit): - movaps (%rdx), %xmm6 - psrldq $14, %xmm6 + movdqu -2(%rcx), %xmm1 mov $14, %rsi - palignr $2, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -2(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -610,7 +569,6 @@ L(Shl3Start): jnz L(Shl3LoopExit) palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 29(%rcx), %xmm2 @@ -618,7 +576,7 @@ L(Shl3Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit3Case2OrCase3) @@ -626,10 +584,9 @@ L(Shl3Start): test %rax, %rax jnz L(Shl3LoopExit) - palignr $3, %xmm1, %xmm2 + palignr $3, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 29(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -644,7 +601,6 @@ L(Shl3Start): jnz L(Shl3LoopExit) palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 29(%rcx), %xmm2 @@ -652,7 +608,6 @@ L(Shl3Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit3Case2OrCase3) @@ -660,8 +615,7 @@ L(Shl3Start): test %rax, %rax jnz L(Shl3LoopExit) - palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $3, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 29(%rcx), %rcx lea 16(%rdx), %rdx @@ -676,6 +630,8 @@ L(Shl3Start): # endif movaps -3(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl3LoopStart): movaps 13(%rcx), %xmm2 movaps 29(%rcx), %xmm3 @@ -709,11 +665,9 @@ L(Shl3LoopStart): jmp L(Shl3LoopStart) L(Shl3LoopExit): - movaps (%rdx), %xmm6 - psrldq $13, %xmm6 + movdqu -3(%rcx), %xmm1 mov $13, %rsi - palignr $3, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -3(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -732,7 +686,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 @@ -740,7 +693,7 @@ L(Shl4Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit4Case2OrCase3) @@ -748,10 +701,9 @@ L(Shl4Start): test %rax, %rax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -766,7 +718,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 @@ -774,7 +725,6 @@ L(Shl4Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit4Case2OrCase3) @@ -782,8 +732,7 @@ L(Shl4Start): test %rax, %rax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 28(%rcx), %rcx lea 16(%rdx), %rdx @@ -798,6 +747,8 @@ L(Shl4Start): # endif movaps -4(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl4LoopStart): movaps 12(%rcx), %xmm2 movaps 28(%rcx), %xmm3 @@ -831,11 +782,9 @@ L(Shl4LoopStart): jmp L(Shl4LoopStart) L(Shl4LoopExit): - movaps (%rdx), %xmm6 - psrldq $12, %xmm6 + movdqu -4(%rcx), %xmm1 mov $12, %rsi - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -4(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -854,7 +803,6 @@ L(Shl5Start): jnz L(Shl5LoopExit) palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 27(%rcx), %xmm2 @@ -862,7 +810,7 @@ L(Shl5Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit5Case2OrCase3) @@ -870,10 +818,9 @@ L(Shl5Start): test %rax, %rax jnz L(Shl5LoopExit) - palignr $5, %xmm1, %xmm2 + palignr $5, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 27(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -888,7 +835,6 @@ L(Shl5Start): jnz L(Shl5LoopExit) palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 27(%rcx), %xmm2 @@ -896,7 +842,6 @@ L(Shl5Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit5Case2OrCase3) @@ -904,8 +849,7 @@ L(Shl5Start): test %rax, %rax jnz L(Shl5LoopExit) - palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $5, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 27(%rcx), %rcx lea 16(%rdx), %rdx @@ -920,6 +864,8 @@ L(Shl5Start): # endif movaps -5(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl5LoopStart): movaps 11(%rcx), %xmm2 movaps 27(%rcx), %xmm3 @@ -953,11 +899,9 @@ L(Shl5LoopStart): jmp L(Shl5LoopStart) L(Shl5LoopExit): - movaps (%rdx), %xmm6 - psrldq $11, %xmm6 + movdqu -5(%rcx), %xmm1 mov $11, %rsi - palignr $5, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -5(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -976,7 +920,6 @@ L(Shl6Start): jnz L(Shl6LoopExit) palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 26(%rcx), %xmm2 @@ -984,7 +927,7 @@ L(Shl6Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit6Case2OrCase3) @@ -992,10 +935,9 @@ L(Shl6Start): test %rax, %rax jnz L(Shl6LoopExit) - palignr $6, %xmm1, %xmm2 + palignr $6, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 26(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1010,7 +952,6 @@ L(Shl6Start): jnz L(Shl6LoopExit) palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 26(%rcx), %xmm2 @@ -1018,7 +959,6 @@ L(Shl6Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit6Case2OrCase3) @@ -1026,8 +966,7 @@ L(Shl6Start): test %rax, %rax jnz L(Shl6LoopExit) - palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $6, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 26(%rcx), %rcx lea 16(%rdx), %rdx @@ -1042,6 +981,8 @@ L(Shl6Start): # endif movaps -6(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl6LoopStart): movaps 10(%rcx), %xmm2 movaps 26(%rcx), %xmm3 @@ -1075,11 +1016,11 @@ L(Shl6LoopStart): jmp L(Shl6LoopStart) L(Shl6LoopExit): - movaps (%rdx), %xmm6 - psrldq $10, %xmm6 + mov (%rcx), %r9 + mov 6(%rcx), %esi + mov %r9, (%rdx) + mov %esi, 6(%rdx) mov $10, %rsi - palignr $6, %xmm1, %xmm6 - movaps %xmm6, (%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1098,7 +1039,6 @@ L(Shl7Start): jnz L(Shl7LoopExit) palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 25(%rcx), %xmm2 @@ -1106,7 +1046,7 @@ L(Shl7Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit7Case2OrCase3) @@ -1114,10 +1054,9 @@ L(Shl7Start): test %rax, %rax jnz L(Shl7LoopExit) - palignr $7, %xmm1, %xmm2 + palignr $7, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 25(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1132,7 +1071,6 @@ L(Shl7Start): jnz L(Shl7LoopExit) palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 25(%rcx), %xmm2 @@ -1140,7 +1078,6 @@ L(Shl7Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit7Case2OrCase3) @@ -1148,8 +1085,7 @@ L(Shl7Start): test %rax, %rax jnz L(Shl7LoopExit) - palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $7, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 25(%rcx), %rcx lea 16(%rdx), %rdx @@ -1164,6 +1100,8 @@ L(Shl7Start): # endif movaps -7(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl7LoopStart): movaps 9(%rcx), %xmm2 movaps 25(%rcx), %xmm3 @@ -1197,11 +1135,11 @@ L(Shl7LoopStart): jmp L(Shl7LoopStart) L(Shl7LoopExit): - movaps (%rdx), %xmm6 - psrldq $9, %xmm6 + mov (%rcx), %r9 + mov 5(%rcx), %esi + mov %r9, (%rdx) + mov %esi, 5(%rdx) mov $9, %rsi - palignr $7, %xmm1, %xmm6 - movaps %xmm6, (%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1220,7 +1158,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 @@ -1228,7 +1165,7 @@ L(Shl8Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit8Case2OrCase3) @@ -1236,10 +1173,9 @@ L(Shl8Start): test %rax, %rax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1254,7 +1190,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 @@ -1262,7 +1197,6 @@ L(Shl8Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit8Case2OrCase3) @@ -1270,8 +1204,7 @@ L(Shl8Start): test %rax, %rax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 24(%rcx), %rcx lea 16(%rdx), %rdx @@ -1286,6 +1219,8 @@ L(Shl8Start): # endif movaps -8(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl8LoopStart): movaps 8(%rcx), %xmm2 movaps 24(%rcx), %xmm3 @@ -1319,11 +1254,9 @@ L(Shl8LoopStart): jmp L(Shl8LoopStart) L(Shl8LoopExit): - movaps (%rdx), %xmm6 - psrldq $8, %xmm6 + mov (%rcx), %r9 mov $8, %rsi - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, (%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1342,7 +1275,6 @@ L(Shl9Start): jnz L(Shl9LoopExit) palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 23(%rcx), %xmm2 @@ -1350,7 +1282,7 @@ L(Shl9Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit9Case2OrCase3) @@ -1358,10 +1290,9 @@ L(Shl9Start): test %rax, %rax jnz L(Shl9LoopExit) - palignr $9, %xmm1, %xmm2 + palignr $9, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 23(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1376,7 +1307,6 @@ L(Shl9Start): jnz L(Shl9LoopExit) palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 23(%rcx), %xmm2 @@ -1384,7 +1314,6 @@ L(Shl9Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit9Case2OrCase3) @@ -1392,8 +1321,7 @@ L(Shl9Start): test %rax, %rax jnz L(Shl9LoopExit) - palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $9, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 23(%rcx), %rcx lea 16(%rdx), %rdx @@ -1408,6 +1336,8 @@ L(Shl9Start): # endif movaps -9(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl9LoopStart): movaps 7(%rcx), %xmm2 movaps 23(%rcx), %xmm3 @@ -1441,11 +1371,9 @@ L(Shl9LoopStart): jmp L(Shl9LoopStart) L(Shl9LoopExit): - movaps (%rdx), %xmm6 - psrldq $7, %xmm6 + mov -1(%rcx), %r9 mov $7, %rsi - palignr $9, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -1(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1464,7 +1392,6 @@ L(Shl10Start): jnz L(Shl10LoopExit) palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 22(%rcx), %xmm2 @@ -1472,7 +1399,7 @@ L(Shl10Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit10Case2OrCase3) @@ -1480,10 +1407,9 @@ L(Shl10Start): test %rax, %rax jnz L(Shl10LoopExit) - palignr $10, %xmm1, %xmm2 + palignr $10, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 22(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1498,7 +1424,6 @@ L(Shl10Start): jnz L(Shl10LoopExit) palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 22(%rcx), %xmm2 @@ -1506,7 +1431,6 @@ L(Shl10Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit10Case2OrCase3) @@ -1514,8 +1438,7 @@ L(Shl10Start): test %rax, %rax jnz L(Shl10LoopExit) - palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $10, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 22(%rcx), %rcx lea 16(%rdx), %rdx @@ -1530,6 +1453,8 @@ L(Shl10Start): # endif movaps -10(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl10LoopStart): movaps 6(%rcx), %xmm2 movaps 22(%rcx), %xmm3 @@ -1563,11 +1488,9 @@ L(Shl10LoopStart): jmp L(Shl10LoopStart) L(Shl10LoopExit): - movaps (%rdx), %xmm6 - psrldq $6, %xmm6 + mov -2(%rcx), %r9 mov $6, %rsi - palignr $10, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -2(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1586,7 +1509,6 @@ L(Shl11Start): jnz L(Shl11LoopExit) palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 21(%rcx), %xmm2 @@ -1594,7 +1516,7 @@ L(Shl11Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit11Case2OrCase3) @@ -1602,10 +1524,9 @@ L(Shl11Start): test %rax, %rax jnz L(Shl11LoopExit) - palignr $11, %xmm1, %xmm2 + palignr $11, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 21(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1620,7 +1541,6 @@ L(Shl11Start): jnz L(Shl11LoopExit) palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 21(%rcx), %xmm2 @@ -1628,7 +1548,6 @@ L(Shl11Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit11Case2OrCase3) @@ -1636,8 +1555,7 @@ L(Shl11Start): test %rax, %rax jnz L(Shl11LoopExit) - palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $11, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 21(%rcx), %rcx lea 16(%rdx), %rdx @@ -1652,6 +1570,8 @@ L(Shl11Start): # endif movaps -11(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl11LoopStart): movaps 5(%rcx), %xmm2 movaps 21(%rcx), %xmm3 @@ -1685,11 +1605,9 @@ L(Shl11LoopStart): jmp L(Shl11LoopStart) L(Shl11LoopExit): - movaps (%rdx), %xmm6 - psrldq $5, %xmm6 + mov -3(%rcx), %r9 mov $5, %rsi - palignr $11, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -3(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1708,7 +1626,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 @@ -1716,7 +1633,7 @@ L(Shl12Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit12Case2OrCase3) @@ -1724,10 +1641,9 @@ L(Shl12Start): test %rax, %rax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1742,7 +1658,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 @@ -1750,7 +1665,6 @@ L(Shl12Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit12Case2OrCase3) @@ -1758,8 +1672,7 @@ L(Shl12Start): test %rax, %rax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 20(%rcx), %rcx lea 16(%rdx), %rdx @@ -1774,6 +1687,8 @@ L(Shl12Start): # endif movaps -12(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl12LoopStart): movaps 4(%rcx), %xmm2 movaps 20(%rcx), %xmm3 @@ -1807,11 +1722,9 @@ L(Shl12LoopStart): jmp L(Shl12LoopStart) L(Shl12LoopExit): - movaps (%rdx), %xmm6 - psrldq $4, %xmm6 + mov (%rcx), %r9d mov $4, %rsi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, (%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1830,7 +1743,6 @@ L(Shl13Start): jnz L(Shl13LoopExit) palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 19(%rcx), %xmm2 @@ -1838,7 +1750,7 @@ L(Shl13Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit13Case2OrCase3) @@ -1846,10 +1758,9 @@ L(Shl13Start): test %rax, %rax jnz L(Shl13LoopExit) - palignr $13, %xmm1, %xmm2 + palignr $13, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 19(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1864,7 +1775,6 @@ L(Shl13Start): jnz L(Shl13LoopExit) palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 19(%rcx), %xmm2 @@ -1872,7 +1782,6 @@ L(Shl13Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit13Case2OrCase3) @@ -1880,8 +1789,7 @@ L(Shl13Start): test %rax, %rax jnz L(Shl13LoopExit) - palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $13, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 19(%rcx), %rcx lea 16(%rdx), %rdx @@ -1896,6 +1804,8 @@ L(Shl13Start): # endif movaps -13(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl13LoopStart): movaps 3(%rcx), %xmm2 movaps 19(%rcx), %xmm3 @@ -1929,11 +1839,9 @@ L(Shl13LoopStart): jmp L(Shl13LoopStart) L(Shl13LoopExit): - movaps (%rdx), %xmm6 - psrldq $3, %xmm6 + mov -1(%rcx), %r9d mov $3, %rsi - palignr $13, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -1(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1952,7 +1860,6 @@ L(Shl14Start): jnz L(Shl14LoopExit) palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 18(%rcx), %xmm2 @@ -1960,7 +1867,7 @@ L(Shl14Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit14Case2OrCase3) @@ -1968,10 +1875,9 @@ L(Shl14Start): test %rax, %rax jnz L(Shl14LoopExit) - palignr $14, %xmm1, %xmm2 + palignr $14, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 18(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1986,7 +1892,6 @@ L(Shl14Start): jnz L(Shl14LoopExit) palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 18(%rcx), %xmm2 @@ -1994,7 +1899,6 @@ L(Shl14Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit14Case2OrCase3) @@ -2002,8 +1906,7 @@ L(Shl14Start): test %rax, %rax jnz L(Shl14LoopExit) - palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $14, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 18(%rcx), %rcx lea 16(%rdx), %rdx @@ -2018,6 +1921,8 @@ L(Shl14Start): # endif movaps -14(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl14LoopStart): movaps 2(%rcx), %xmm2 movaps 18(%rcx), %xmm3 @@ -2051,11 +1956,9 @@ L(Shl14LoopStart): jmp L(Shl14LoopStart) L(Shl14LoopExit): - movaps (%rdx), %xmm6 - psrldq $2, %xmm6 + mov -2(%rcx), %r9d mov $2, %rsi - palignr $14, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -2(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -2074,7 +1977,6 @@ L(Shl15Start): jnz L(Shl15LoopExit) palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 17(%rcx), %xmm2 @@ -2082,7 +1984,7 @@ L(Shl15Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit15Case2OrCase3) @@ -2090,10 +1992,9 @@ L(Shl15Start): test %rax, %rax jnz L(Shl15LoopExit) - palignr $15, %xmm1, %xmm2 + palignr $15, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 17(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -2108,7 +2009,6 @@ L(Shl15Start): jnz L(Shl15LoopExit) palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 17(%rcx), %xmm2 @@ -2116,7 +2016,6 @@ L(Shl15Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit15Case2OrCase3) @@ -2124,8 +2023,7 @@ L(Shl15Start): test %rax, %rax jnz L(Shl15LoopExit) - palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $15, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 17(%rcx), %rcx lea 16(%rdx), %rdx @@ -2140,6 +2038,8 @@ L(Shl15Start): # endif movaps -15(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl15LoopStart): movaps 1(%rcx), %xmm2 movaps 17(%rcx), %xmm3 @@ -2173,16 +2073,15 @@ L(Shl15LoopStart): jmp L(Shl15LoopStart) L(Shl15LoopExit): - movaps (%rdx), %xmm6 - psrldq $1, %xmm6 + mov -3(%rcx), %r9d mov $1, %rsi - palignr $15, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -3(%rdx) # ifdef USE_AS_STRCAT jmp L(CopyFrom1To16Bytes) # endif # ifndef USE_AS_STRCAT + .p2align 4 L(CopyFrom1To16Bytes): # ifdef USE_AS_STRNCPY @@ -2463,7 +2362,7 @@ L(Exit4): # ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif +# endif # endif ret @@ -2485,7 +2384,7 @@ L(Exit5): # ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif +# endif # endif ret @@ -2507,7 +2406,7 @@ L(Exit6): # ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif +# endif # endif ret @@ -2617,7 +2516,7 @@ L(Exit12): # ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif +# endif # endif ret @@ -2955,11 +2854,10 @@ L(StrncpyExit8Bytes): ret # endif - # endif # ifdef USE_AS_STRNCPY - + .p2align 4 L(StrncpyLeaveCase2OrCase3): test %rax, %rax jnz L(Aligned64LeaveCase2) @@ -3014,710 +2912,639 @@ L(Aligned64LeaveCase2): lea -16(%r8), %r8 jmp L(CopyFrom1To16BytesCase2) /*--------------------------------------------------*/ + .p2align 4 L(StrncpyExit1Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $15, %xmm6 + movdqu -1(%rcx), %xmm0 + movdqu %xmm0, -1(%rdx) mov $15, %rsi - palignr $1, %xmm1, %xmm6 - movaps %xmm6, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit2Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $14, %xmm6 + movdqu -2(%rcx), %xmm0 + movdqu %xmm0, -2(%rdx) mov $14, %rsi - palignr $2, %xmm1, %xmm6 - movaps %xmm6, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit3Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $13, %xmm6 + movdqu -3(%rcx), %xmm0 + movdqu %xmm0, -3(%rdx) mov $13, %rsi - palignr $3, %xmm1, %xmm6 - movaps %xmm6, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit4Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $12, %xmm6 + movdqu -4(%rcx), %xmm0 + movdqu %xmm0, -4(%rdx) mov $12, %rsi - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit5Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $11, %xmm6 + movdqu -5(%rcx), %xmm0 + movdqu %xmm0, -5(%rdx) mov $11, %rsi - palignr $5, %xmm1, %xmm6 - movaps %xmm6, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit6Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $10, %xmm6 - mov $10, %rsi - palignr $6, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov (%rcx), %rsi + mov 6(%rcx), %r9d + mov %r9d, 6(%rdx) + mov %rsi, (%rdx) test %rax, %rax + mov $10, %rsi jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit7Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $9, %xmm6 - mov $9, %rsi - palignr $7, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov (%rcx), %rsi + mov 5(%rcx), %r9d + mov %r9d, 5(%rdx) + mov %rsi, (%rdx) test %rax, %rax + mov $9, %rsi jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit8Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $8, %xmm6 + mov (%rcx), %r9 mov $8, %rsi - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit9Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $7, %xmm6 + mov -1(%rcx), %r9 mov $7, %rsi - palignr $9, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -1(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit10Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $6, %xmm6 + mov -2(%rcx), %r9 mov $6, %rsi - palignr $10, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -2(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit11Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $5, %xmm6 + mov -3(%rcx), %r9 mov $5, %rsi - palignr $11, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -3(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit12Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $4, %xmm6 + mov (%rcx), %r9d mov $4, %rsi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit13Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $3, %xmm6 + mov -1(%rcx), %r9d mov $3, %rsi - palignr $13, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -1(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit14Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $2, %xmm6 + mov -2(%rcx), %r9d mov $2, %rsi - palignr $14, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -2(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit15Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $1, %xmm6 + mov -3(%rcx), %r9d mov $1, %rsi - palignr $15, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -3(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave1): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit1) palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 31(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit1) - palignr $1, %xmm1, %xmm2 + palignr $1, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 31+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit1) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit1) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit1): - movaps (%rdx, %rsi), %xmm6 - psrldq $15, %xmm6 - palignr $1, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 15(%rsi), %rsi + lea 15(%rdx, %rsi), %rdx + lea 15(%rcx, %rsi), %rcx + mov -15(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -15(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave2): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit2) palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 30(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit2) - palignr $2, %xmm1, %xmm2 + palignr $2, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 30+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit2) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit2) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit2): - movaps (%rdx, %rsi), %xmm6 - psrldq $14, %xmm6 - palignr $2, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 14(%rsi), %rsi + lea 14(%rdx, %rsi), %rdx + lea 14(%rcx, %rsi), %rcx + mov -14(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -14(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave3): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit3) palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 29(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit3) - palignr $3, %xmm1, %xmm2 + palignr $3, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 29+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit3) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit3) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit3): - movaps (%rdx, %rsi), %xmm6 - psrldq $13, %xmm6 - palignr $3, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 13(%rsi), %rsi + lea 13(%rdx, %rsi), %rdx + lea 13(%rcx, %rsi), %rcx + mov -13(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -13(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave4): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit4) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit4) - palignr $4, %xmm1, %xmm2 + palignr $4, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 28+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit4) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit4) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit4): - movaps (%rdx, %rsi), %xmm6 - psrldq $12, %xmm6 - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 12(%rsi), %rsi + lea 12(%rdx, %rsi), %rdx + lea 12(%rcx, %rsi), %rcx + mov -12(%rcx), %rsi + mov -4(%rcx), %eax + mov %rsi, -12(%rdx) + mov %eax, -4(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave5): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit5) palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 27(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit5) - palignr $5, %xmm1, %xmm2 + palignr $5, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 27+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit5) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit5) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit5): - movaps (%rdx, %rsi), %xmm6 - psrldq $11, %xmm6 - palignr $5, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 11(%rsi), %rsi + lea 11(%rdx, %rsi), %rdx + lea 11(%rcx, %rsi), %rcx + mov -11(%rcx), %rsi + mov -4(%rcx), %eax + mov %rsi, -11(%rdx) + mov %eax, -4(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave6): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit6) palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 26(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit6) - palignr $6, %xmm1, %xmm2 + palignr $6, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 26+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit6) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit6) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit6): - movaps (%rdx, %rsi), %xmm6 - psrldq $10, %xmm6 - palignr $6, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 10(%rsi), %rsi + lea 10(%rdx, %rsi), %rdx + lea 10(%rcx, %rsi), %rcx + mov -10(%rcx), %rsi + movw -2(%rcx), %ax + mov %rsi, -10(%rdx) + movw %ax, -2(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave7): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit7) palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 25(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit7) - palignr $7, %xmm1, %xmm2 + palignr $7, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 25+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit7) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit7) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit7): - movaps (%rdx, %rsi), %xmm6 - psrldq $9, %xmm6 - palignr $7, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 9(%rsi), %rsi + lea 9(%rdx, %rsi), %rdx + lea 9(%rcx, %rsi), %rcx + mov -9(%rcx), %rsi + movb -1(%rcx), %ah + mov %rsi, -9(%rdx) + movb %ah, -1(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave8): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit8) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit8) - palignr $8, %xmm1, %xmm2 + palignr $8, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 24+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit8) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit8) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit8): - movaps (%rdx, %rsi), %xmm6 - psrldq $8, %xmm6 - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 8(%rsi), %rsi + lea 8(%rdx, %rsi), %rdx + lea 8(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave9): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit9) palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 23(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit9) - palignr $9, %xmm1, %xmm2 + palignr $9, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 23+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit9) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit9) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit9): - movaps (%rdx, %rsi), %xmm6 - psrldq $7, %xmm6 - palignr $9, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 7(%rsi), %rsi + lea 7(%rdx, %rsi), %rdx + lea 7(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave10): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit10) palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 22(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit10) - palignr $10, %xmm1, %xmm2 + palignr $10, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 22+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit10) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit10) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit10): - movaps (%rdx, %rsi), %xmm6 - psrldq $6, %xmm6 - palignr $10, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 6(%rsi), %rsi + lea 6(%rdx, %rsi), %rdx + lea 6(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave11): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit11) palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 21(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit11) - palignr $11, %xmm1, %xmm2 + palignr $11, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 21+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit11) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit11) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit11): - movaps (%rdx, %rsi), %xmm6 - psrldq $5, %xmm6 - palignr $11, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 5(%rsi), %rsi + lea 5(%rdx, %rsi), %rdx + lea 5(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave12): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit12) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit12) - palignr $12, %xmm1, %xmm2 + palignr $12, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 20+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit12) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit12) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit12): - movaps (%rdx, %rsi), %xmm6 - psrldq $4, %xmm6 - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 4(%rsi), %rsi + lea 4(%rdx, %rsi), %rdx + lea 4(%rcx, %rsi), %rcx + mov -4(%rcx), %eax + xor %rsi, %rsi + mov %eax, -4(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave13): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit13) palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 19(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit13) - palignr $13, %xmm1, %xmm2 + palignr $13, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 19+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit13) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit13) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit13): - movaps (%rdx, %rsi), %xmm6 - psrldq $3, %xmm6 - palignr $13, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 3(%rsi), %rsi + lea 3(%rdx, %rsi), %rdx + lea 3(%rcx, %rsi), %rcx + mov -4(%rcx), %eax + xor %rsi, %rsi + mov %eax, -4(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave14): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit14) palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 18(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit14) - palignr $14, %xmm1, %xmm2 + palignr $14, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 18+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit14) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit14) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit14): - movaps (%rdx, %rsi), %xmm6 - psrldq $2, %xmm6 - palignr $14, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 2(%rsi), %rsi + lea 2(%rdx, %rsi), %rdx + lea 2(%rcx, %rsi), %rcx + movw -2(%rcx), %ax + xor %rsi, %rsi + movw %ax, -2(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave15): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit15) palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 17(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit15) - palignr $15, %xmm1, %xmm2 + palignr $15, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 17+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit15) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit15) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit15): - movaps (%rdx, %rsi), %xmm6 - psrldq $1, %xmm6 - palignr $15, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 1(%rsi), %rsi + lea 1(%rdx, %rsi), %rdx + lea 1(%rcx, %rsi), %rcx + movb -1(%rcx), %ah + xor %rsi, %rsi + movb %ah, -1(%rdx) jmp L(CopyFrom1To16BytesCase3) + # endif # ifndef USE_AS_STRCAT END (STRCPY) diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S index 4e292f3..477b2cb 100644 --- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S @@ -21,8 +21,9 @@ #ifndef NOT_IN_libc # include -.text + .section .text.ssse3,"ax",@progbits ENTRY (__wcscpy_ssse3) + mov %rsi, %rcx mov %rdi, %rdx @@ -136,6 +137,7 @@ L(Align16Both): mov $-0x40, %rsi + .p2align 4 L(Aligned64Loop): movaps (%rcx), %xmm2 movaps %xmm2, %xmm4 @@ -205,7 +207,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 @@ -213,15 +214,14 @@ L(Shl4Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %rax, %rax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -233,7 +233,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 @@ -245,8 +244,7 @@ L(Shl4Start): test %rax, %rax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 28(%rcx), %rcx lea 16(%rdx), %rdx @@ -259,6 +257,7 @@ L(Shl4Start): movaps -4(%rcx), %xmm1 + .p2align 4 L(Shl4LoopStart): movaps 12(%rcx), %xmm2 movaps 28(%rcx), %xmm3 @@ -289,11 +288,9 @@ L(Shl4LoopStart): jmp L(Shl4LoopStart) L(Shl4LoopExit): - movaps (%rdx), %xmm6 - psrldq $12, %xmm6 + movdqu -4(%rcx), %xmm1 mov $12, %rsi - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -4(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -309,7 +306,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 @@ -317,15 +313,14 @@ L(Shl8Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %rax, %rax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -337,7 +332,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 @@ -345,13 +339,11 @@ L(Shl8Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 test %rax, %rax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 24(%rcx), %rcx lea 16(%rdx), %rdx @@ -364,6 +356,7 @@ L(Shl8Start): movaps -8(%rcx), %xmm1 + .p2align 4 L(Shl8LoopStart): movaps 8(%rcx), %xmm2 movaps 24(%rcx), %xmm3 @@ -394,11 +387,9 @@ L(Shl8LoopStart): jmp L(Shl8LoopStart) L(Shl8LoopExit): - movaps (%rdx), %xmm6 - psrldq $8, %xmm6 + mov (%rcx), %r9 mov $8, %rsi - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, (%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -414,7 +405,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 @@ -422,15 +412,14 @@ L(Shl12Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %rax, %rax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -442,7 +431,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 @@ -450,13 +438,11 @@ L(Shl12Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 test %rax, %rax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 20(%rcx), %rcx lea 16(%rdx), %rdx @@ -469,6 +455,7 @@ L(Shl12Start): movaps -12(%rcx), %xmm1 + .p2align 4 L(Shl12LoopStart): movaps 4(%rcx), %xmm2 movaps 20(%rcx), %xmm3 @@ -498,11 +485,10 @@ L(Shl12LoopStart): jmp L(Shl12LoopStart) L(Shl12LoopExit): - movaps (%rdx), %xmm6 - psrldq $4, %xmm6 + mov (%rcx), %r9d mov $4, %rsi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, (%rdx) + jmp L(CopyFrom1To16Bytes) .p2align 4 L(CopyFrom1To16Bytes): @@ -556,8 +542,10 @@ L(Exit12): .p2align 4 L(Exit16): - movdqu (%rcx), %xmm0 - movdqu %xmm0, (%rdx) + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) mov %rdi, %rax ret