+2013-10-08 Ondřej Bílka <neleai@seznam.cz>
+
+ * sysdeps/x86_64/memset.S (ALIGN): Macro removed.
+ Use .p2align directive instead, throughout.
+ * sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
+ * sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
+ * sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Likewise.
+ * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Likewise.
+ * sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
+ * sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Likewise.
+ * sysdeps/x86_64/strchr.S: Likewise.
+ * sysdeps/x86_64/strrchr.S: Likewise.
+
2013-10-08 Siddhesh Poyarekar <siddhesh@redhat.com>
* sysdeps/ieee754/dbl-64/e_pow.c: Fix code formatting.
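The removed ALIGN wrapper was defined in each of these files as a one-line
macro expanding to the GAS directive (see the deleted #define blocks below),
so the substitution is purely textual.  A minimal standalone sketch of the
two equivalent spellings, assuming the file is assembled as a preprocessed
.S source; old_form and new_form are placeholder labels used only for
illustration:

#define ALIGN(n) .p2align n	/* the wrapper being removed */

	.text
	ALIGN (4)		/* old spelling: the preprocessor expands this to .p2align 4 */
old_form:
	ret

	.p2align 4		/* new spelling: aligns the next label to 2^4 = 16 bytes */
new_form:
	ret

Both labels land on a 16-byte boundary; likewise, the .p2align 3 uses below
request 8-byte (2^3) alignment, matching the old ALIGN (3).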
#include <sysdep.h>
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
.text
#if !defined NOT_IN_libc
ENTRY(__bzero)
L(return):
rep
ret
- ALIGN (4)
+ .p2align 4
L(between_32_64_bytes):
movdqu %xmm8, 16(%rdi)
movdqu %xmm8, -32(%rdi,%rdx)
ret
- ALIGN (4)
+ .p2align 4
L(loop_start):
leaq 64(%rdi), %rcx
movdqu %xmm8, (%rdi)
andq $-64, %rdx
cmpq %rdx, %rcx
je L(return)
- ALIGN (4)
+ .p2align 4
L(loop):
movdqa %xmm8, (%rcx)
movdqa %xmm8, 16(%rcx)
# define MEMCMP __memcmp_sse4_1
# endif
-# ifndef ALIGN
-# define ALIGN(n) .p2align n
-# endif
-
# define JMPTBL(I, B) (I - B)
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(firstbyte):
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
ret
# endif
- ALIGN (4)
+ .p2align 4
L(79bytesormore):
movdqu (%rsi), %xmm1
movdqu (%rdi), %xmm2
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(512bytesormore):
# ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %R8_LP
cmp %r8, %rdx
ja L(L2_L3_cache_unaglined)
sub $64, %rdx
- ALIGN (4)
+ .p2align 4
L(64bytesormore_loop):
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
L(L2_L3_cache_unaglined):
sub $64, %rdx
- ALIGN (4)
+ .p2align 4
L(L2_L3_unaligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
prefetchnta 0x1c0(%rsi)
/*
* This case is for machines which are sensitive to unaligned instructions.
*/
- ALIGN (4)
+ .p2align 4
L(2aligned):
cmp $128, %rdx
ja L(128bytesormorein2aligned)
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(128bytesormorein2aligned):
cmp $512, %rdx
ja L(512bytesormorein2aligned)
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(256bytesormorein2aligned):
sub $256, %rdx
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(512bytesormorein2aligned):
# ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %R8_LP
ja L(L2_L3_cache_aglined)
sub $64, %rdx
- ALIGN (4)
+ .p2align 4
L(64bytesormore_loopin2aligned):
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
L(L2_L3_cache_aglined):
sub $64, %rdx
- ALIGN (4)
+ .p2align 4
L(L2_L3_aligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
prefetchnta 0x1c0(%rsi)
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(64bytesormore_loop_end):
add $16, %rdi
add $16, %rsi
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(12bytes):
mov -12(%rdi), %rax
mov -12(%rsi), %rcx
# ifndef USE_AS_WMEMCMP
/* unreal case for wmemcmp */
- ALIGN (4)
+ .p2align 4
L(65bytes):
movdqu -65(%rdi), %xmm1
movdqu -65(%rsi), %xmm2
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(13bytes):
mov -13(%rdi), %rax
mov -13(%rsi), %rcx
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(5bytes):
mov -5(%rdi), %eax
mov -5(%rsi), %ecx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(66bytes):
movdqu -66(%rdi), %xmm1
movdqu -66(%rsi), %xmm2
sub %ecx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(14bytes):
mov -14(%rdi), %rax
mov -14(%rsi), %rcx
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(6bytes):
mov -6(%rdi), %eax
mov -6(%rsi), %ecx
sub %ecx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(67bytes):
movdqu -67(%rdi), %xmm2
movdqu -67(%rsi), %xmm1
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(15bytes):
mov -15(%rdi), %rax
mov -15(%rsi), %rcx
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(7bytes):
mov -7(%rdi), %eax
mov -7(%rsi), %ecx
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(3bytes):
movzwl -3(%rdi), %eax
movzwl -3(%rsi), %ecx
ret
# endif
- ALIGN (4)
+ .p2align 4
L(68bytes):
movdqu -68(%rdi), %xmm2
movdqu -68(%rsi), %xmm1
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
- ALIGN (4)
+ .p2align 4
L(69bytes):
movdqu -69(%rsi), %xmm1
movdqu -69(%rdi), %xmm2
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(70bytes):
movdqu -70(%rsi), %xmm1
movdqu -70(%rdi), %xmm2
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(71bytes):
movdqu -71(%rsi), %xmm1
movdqu -71(%rdi), %xmm2
ret
# endif
- ALIGN (4)
+ .p2align 4
L(72bytes):
movdqu -72(%rsi), %xmm1
movdqu -72(%rdi), %xmm2
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
- ALIGN (4)
+ .p2align 4
L(73bytes):
movdqu -73(%rsi), %xmm1
movdqu -73(%rdi), %xmm2
sub %ecx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(74bytes):
movdqu -74(%rsi), %xmm1
movdqu -74(%rdi), %xmm2
movzwl -2(%rsi), %ecx
jmp L(diffin2bytes)
- ALIGN (4)
+ .p2align 4
L(75bytes):
movdqu -75(%rsi), %xmm1
movdqu -75(%rdi), %xmm2
xor %eax, %eax
ret
# endif
- ALIGN (4)
+ .p2align 4
L(76bytes):
movdqu -76(%rsi), %xmm1
movdqu -76(%rdi), %xmm2
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
- ALIGN (4)
+ .p2align 4
L(77bytes):
movdqu -77(%rsi), %xmm1
movdqu -77(%rdi), %xmm2
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(78bytes):
movdqu -78(%rsi), %xmm1
movdqu -78(%rdi), %xmm2
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(79bytes):
movdqu -79(%rsi), %xmm1
movdqu -79(%rdi), %xmm2
xor %eax, %eax
ret
# endif
- ALIGN (4)
+ .p2align 4
L(64bytes):
movdqu -64(%rdi), %xmm2
movdqu -64(%rsi), %xmm1
/*
* Align to 8 bytes to avoid two taken branches in one 16-byte aligned code block.
*/
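	/* .p2align 3 pads to an 8-byte (2^3) boundary, a weaker request than
	   the 16-byte .p2align 4 used for the other branch targets in this
	   file.  */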
- ALIGN (3)
+ .p2align 3
L(less16bytes):
movsbq %dl, %rdx
mov (%rsi, %rdx), %rcx
sub %ecx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(end):
and $0xff, %eax
and $0xff, %ecx
neg %eax
ret
- ALIGN (4)
+ .p2align 4
L(nequal_bigger):
ret
END (MEMCMP)
.section .rodata.sse4.1,"a",@progbits
- ALIGN (3)
+ .p2align 3
# ifndef USE_AS_WMEMCMP
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
# define MEMCMP __memcmp_ssse3
# endif
-# ifndef ALIGN
-# define ALIGN(n) .p2align n
-# endif
-
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elements.
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
/* ECX >= 32. */
L(48bytesormore):
movdqu (%rdi), %xmm3
je L(shr_6)
jmp L(shr_7)
- ALIGN (2)
+ .p2align 2
L(next_unaligned_table):
cmp $8, %edx
je L(shr_8)
jmp L(shr_12)
# endif
- ALIGN (4)
+ .p2align 4
L(shr_0):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_0_gobble):
movdqa (%rsi), %xmm0
xor %eax, %eax
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(shr_1):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_1_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_2):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_2_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_3):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_3_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
# endif
- ALIGN (4)
+ .p2align 4
L(shr_4):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_4_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(shr_5):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_5_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_6):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_6_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_7):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_7_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
# endif
- ALIGN (4)
+ .p2align 4
L(shr_8):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_8_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(shr_9):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_9_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_10):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_10_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_11):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_11_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
# endif
- ALIGN (4)
+ .p2align 4
L(shr_12):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_12_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(shr_13):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_13_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_14):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_14_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_15):
cmp $80, %rcx
lea -48(%rcx), %rcx
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_15_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
add %rcx, %rdi
jmp L(less48bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(exit):
pmovmskb %xmm1, %r8d
sub $0xffff, %r8d
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte16):
movzbl -16(%rdi), %eax
movzbl -16(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte17):
movzbl -15(%rdi), %eax
movzbl -15(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte18):
movzbl -14(%rdi), %eax
movzbl -14(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte19):
movzbl -13(%rdi), %eax
movzbl -13(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte20):
movzbl -12(%rdi), %eax
movzbl -12(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte21):
movzbl -11(%rdi), %eax
movzbl -11(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte22):
movzbl -10(%rdi), %eax
movzbl -10(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(next_24_bytes):
lea 8(%rdi), %rdi
lea 8(%rsi), %rsi
jne L(find_diff)
ret
- ALIGN (4)
+ .p2align 4
L(second_double_word):
mov -12(%rdi), %eax
cmp -12(%rsi), %eax
jne L(find_diff)
ret
- ALIGN (4)
+ .p2align 4
L(next_two_double_words):
and $15, %dh
jz L(fourth_double_word)
jne L(find_diff)
ret
- ALIGN (4)
+ .p2align 4
L(fourth_double_word):
mov -4(%rdi), %eax
cmp -4(%rsi), %eax
ret
# endif
- ALIGN (4)
+ .p2align 4
L(less48bytes):
cmp $8, %ecx
jae L(more8bytes)
jmp L(4bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(more8bytes):
cmp $16, %ecx
jae L(more16bytes)
jmp L(12bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(more16bytes):
cmp $24, %ecx
jae L(more24bytes)
jmp L(20bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(more24bytes):
cmp $32, %ecx
jae L(more32bytes)
jmp L(28bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(more32bytes):
cmp $40, %ecx
jae L(more40bytes)
jmp L(36bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(more40bytes):
cmp $40, %ecx
je L(40bytes)
je L(46bytes)
jmp L(47bytes)
- ALIGN (4)
+ .p2align 4
L(44bytes):
movl -44(%rdi), %eax
movl -44(%rsi), %ecx
xor %eax, %eax
ret
# else
- ALIGN (4)
+ .p2align 4
L(44bytes):
movl -44(%rdi), %eax
cmp -44(%rsi), %eax
# endif
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(45bytes):
movl -45(%rdi), %eax
movl -45(%rsi), %ecx
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(46bytes):
movl -46(%rdi), %eax
movl -46(%rsi), %ecx
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(47bytes):
movl -47(%rdi), %eax
movl -47(%rsi), %ecx
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(find_diff):
cmpb %cl, %al
jne L(set)
# else
/* for wmemcmp */
- ALIGN (4)
+ .p2align 4
L(find_diff):
mov $1, %eax
jg L(find_diff_bigger)
neg %eax
ret
- ALIGN (4)
+ .p2align 4
L(find_diff_bigger):
ret
# endif
- ALIGN (4)
+ .p2align 4
L(equal):
xor %eax, %eax
ret
#include "asm-syntax.h"
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
ENTRY(__memcpy_sse2_unaligned)
movq %rsi, %rax
movq %rdi, %rax
ret
.p2align 4,,10
- ALIGN(4)
+ .p2align 4
.L31:
movdqu 16(%rsi), %xmm8
cmpq $64, %rdx
leaq 32(%r10), %r8
leaq 48(%r10), %rax
.p2align 4,,10
- ALIGN(4)
+ .p2align 4
L(loop):
movdqu (%rcx,%r10), %xmm8
movdqa %xmm8, (%rcx)
.L3:
leaq -1(%rdx), %rax
.p2align 4,,10
- ALIGN(4)
+ .p2align 4
.L11:
movzbl (%rsi,%rax), %edx
movb %dl, (%rdi,%rax)
# define MEMCPY_CHK __memcpy_chk_ssse3_back
#endif
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
#define JMPTBL(I, B) I - B
/* Branch to an entry in a jump table. TABLE is a jump table with
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
#endif
- ALIGN (4)
+ .p2align 4
L(144bytesormore):
#ifndef USE_AS_MEMMOVE
jmp *%r9
ud2
- ALIGN (4)
+ .p2align 4
L(copy_backward):
#ifdef DATA_CACHE_SIZE
mov $DATA_CACHE_SIZE, %RCX_LP
jmp *%r9
ud2
- ALIGN (4)
+ .p2align 4
L(shl_0):
mov %rdx, %r9
#endif
jae L(gobble_mem_fwd)
sub $0x80, %rdx
- ALIGN (4)
+ .p2align 4
L(shl_0_loop):
movdqa (%rsi), %xmm1
movdqa %xmm1, (%rdi)
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_bwd):
sub $0x80, %rdx
L(copy_backward_loop):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_1):
sub $0x80, %rdx
movaps -0x01(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_1_bwd):
movaps -0x01(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_2):
sub $0x80, %rdx
movaps -0x02(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_2_bwd):
movaps -0x02(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_3):
sub $0x80, %rdx
movaps -0x03(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_3_bwd):
movaps -0x03(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_4):
sub $0x80, %rdx
movaps -0x04(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_4_bwd):
movaps -0x04(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_5):
sub $0x80, %rdx
movaps -0x05(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_5_bwd):
movaps -0x05(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_6):
sub $0x80, %rdx
movaps -0x06(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_6_bwd):
movaps -0x06(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_7):
sub $0x80, %rdx
movaps -0x07(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_7_bwd):
movaps -0x07(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_8):
sub $0x80, %rdx
movaps -0x08(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_8_bwd):
movaps -0x08(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_9):
sub $0x80, %rdx
movaps -0x09(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_9_bwd):
movaps -0x09(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_10):
sub $0x80, %rdx
movaps -0x0a(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_10_bwd):
movaps -0x0a(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_11):
sub $0x80, %rdx
movaps -0x0b(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_11_bwd):
movaps -0x0b(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_12):
sub $0x80, %rdx
movdqa -0x0c(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_12_bwd):
movaps -0x0c(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_13):
sub $0x80, %rdx
movaps -0x0d(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_13_bwd):
movaps -0x0d(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_14):
sub $0x80, %rdx
movaps -0x0e(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_14_bwd):
movaps -0x0e(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_15):
sub $0x80, %rdx
movaps -0x0f(%rsi), %xmm1
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_15_bwd):
movaps -0x0f(%rsi), %xmm1
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(gobble_mem_fwd):
movdqu (%rsi), %xmm1
movdqu %xmm0, (%r8)
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(gobble_mem_bwd):
add %rdx, %rsi
add %rdx, %rdi
END (MEMCPY)
.section .rodata.ssse3,"a",@progbits
- ALIGN (3)
+ .p2align 3
L(table_144_bytes_bwd):
.int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
.int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
.int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
.int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
- ALIGN (3)
+ .p2align 3
L(table_144_bytes_fwd):
.int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
.int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
.int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
.int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
- ALIGN (3)
+ .p2align 3
L(shl_table_fwd):
.int JMPTBL (L(shl_0), L(shl_table_fwd))
.int JMPTBL (L(shl_1), L(shl_table_fwd))
.int JMPTBL (L(shl_14), L(shl_table_fwd))
.int JMPTBL (L(shl_15), L(shl_table_fwd))
- ALIGN (3)
+ .p2align 3
L(shl_table_bwd):
.int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
.int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
# define MEMCPY_CHK __memcpy_chk_ssse3
#endif
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
#define JMPTBL(I, B) I - B
/* Branch to an entry in a jump table. TABLE is a jump table with
jmp *%r9
ud2
- ALIGN (4)
+ .p2align 4
L(80bytesormore):
#ifndef USE_AS_MEMMOVE
cmp %dil, %sil
#endif
BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
- ALIGN (4)
+ .p2align 4
L(copy_backward):
movdqu -16(%rsi, %rdx), %xmm0
add %rdx, %rsi
#endif
BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0):
sub $16, %rdx
movdqa (%rsi), %xmm1
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_gobble):
#ifdef DATA_CACHE_SIZE_HALF
cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_gobble_mem_loop):
prefetcht0 0x1c0(%rsi)
prefetcht0 0x280(%rsi)
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_bwd):
sub $16, %rdx
movdqa -0x10(%rsi), %xmm1
L(shl_0_less_64bytes_bwd):
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_gobble_bwd):
#ifdef DATA_CACHE_SIZE_HALF
cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
L(shl_0_gobble_bwd_less_64bytes):
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_gobble_mem_bwd_loop):
prefetcht0 -0x1c0(%rsi)
prefetcht0 -0x280(%rsi)
L(shl_0_mem_bwd_less_32bytes):
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_1):
lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_1_bwd):
lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_2):
lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_2_bwd):
lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_3):
lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_3_bwd):
lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_4):
lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_4_bwd):
lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_5):
lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_5_bwd):
lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_6):
lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_6_bwd):
lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_7):
lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_7_bwd):
lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_8):
lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
cmp %rcx, %rdx
movaps %xmm5, -0x10(%rdi)
jmp *%r9
ud2
- ALIGN (4)
+ .p2align 4
L(shl_8_end):
lea 64(%rdx), %rdx
movaps %xmm4, -0x20(%rdi)
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_8_bwd):
lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_9):
lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_9_bwd):
lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_10):
lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_10_bwd):
lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_11):
lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_11_bwd):
lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_12):
lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_12_bwd):
lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_13):
lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_13_bwd):
lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_14):
lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_14_bwd):
lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_15):
lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
cmp %rcx, %rdx
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_15_bwd):
lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
cmp %rcx, %rdx
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(write_72bytes):
movdqu -72(%rsi), %xmm0
movdqu -56(%rsi), %xmm1
mov %rcx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_64bytes):
movdqu -64(%rsi), %xmm0
mov -48(%rsi), %rcx
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_56bytes):
movdqu -56(%rsi), %xmm0
mov -40(%rsi), %r8
mov %rcx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_48bytes):
mov -48(%rsi), %rcx
mov -40(%rsi), %r8
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_40bytes):
mov -40(%rsi), %r8
mov -32(%rsi), %r9
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_32bytes):
mov -32(%rsi), %r9
mov -24(%rsi), %r10
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_24bytes):
mov -24(%rsi), %r10
mov -16(%rsi), %r11
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_16bytes):
mov -16(%rsi), %r11
mov -8(%rsi), %rdx
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_8bytes):
mov -8(%rsi), %rdx
mov %rdx, -8(%rdi)
L(write_0bytes):
ret
- ALIGN (4)
+ .p2align 4
L(write_73bytes):
movdqu -73(%rsi), %xmm0
movdqu -57(%rsi), %xmm1
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_65bytes):
movdqu -65(%rsi), %xmm0
movdqu -49(%rsi), %xmm1
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_57bytes):
movdqu -57(%rsi), %xmm0
mov -41(%rsi), %r8
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_49bytes):
movdqu -49(%rsi), %xmm0
mov -33(%rsi), %r9
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_41bytes):
mov -41(%rsi), %r8
mov -33(%rsi), %r9
mov %dl, -1(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_33bytes):
mov -33(%rsi), %r9
mov -25(%rsi), %r10
mov %dl, -1(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_25bytes):
mov -25(%rsi), %r10
mov -17(%rsi), %r11
mov %dl, -1(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_17bytes):
mov -17(%rsi), %r11
mov -9(%rsi), %rcx
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_9bytes):
mov -9(%rsi), %rcx
mov -4(%rsi), %edx
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_1bytes):
mov -1(%rsi), %dl
mov %dl, -1(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_74bytes):
movdqu -74(%rsi), %xmm0
movdqu -58(%rsi), %xmm1
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_66bytes):
movdqu -66(%rsi), %xmm0
movdqu -50(%rsi), %xmm1
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_58bytes):
movdqu -58(%rsi), %xmm1
mov -42(%rsi), %r8
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_50bytes):
movdqu -50(%rsi), %xmm0
mov -34(%rsi), %r9
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_42bytes):
mov -42(%rsi), %r8
mov -34(%rsi), %r9
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_34bytes):
mov -34(%rsi), %r9
mov -26(%rsi), %r10
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_26bytes):
mov -26(%rsi), %r10
mov -18(%rsi), %r11
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_18bytes):
mov -18(%rsi), %r11
mov -10(%rsi), %rcx
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_10bytes):
mov -10(%rsi), %rcx
mov -4(%rsi), %edx
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_2bytes):
mov -2(%rsi), %dx
mov %dx, -2(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_75bytes):
movdqu -75(%rsi), %xmm0
movdqu -59(%rsi), %xmm1
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_67bytes):
movdqu -67(%rsi), %xmm0
movdqu -59(%rsi), %xmm1
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_59bytes):
movdqu -59(%rsi), %xmm0
mov -43(%rsi), %r8
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_51bytes):
movdqu -51(%rsi), %xmm0
mov -35(%rsi), %r9
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_43bytes):
mov -43(%rsi), %r8
mov -35(%rsi), %r9
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_35bytes):
mov -35(%rsi), %r9
mov -27(%rsi), %r10
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_27bytes):
mov -27(%rsi), %r10
mov -19(%rsi), %r11
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_19bytes):
mov -19(%rsi), %r11
mov -11(%rsi), %rcx
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_11bytes):
mov -11(%rsi), %rcx
mov -4(%rsi), %edx
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_3bytes):
mov -3(%rsi), %dx
mov -2(%rsi), %cx
mov %cx, -2(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_76bytes):
movdqu -76(%rsi), %xmm0
movdqu -60(%rsi), %xmm1
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_68bytes):
movdqu -68(%rsi), %xmm0
movdqu -52(%rsi), %xmm1
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_60bytes):
movdqu -60(%rsi), %xmm0
mov -44(%rsi), %r8
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_52bytes):
movdqu -52(%rsi), %xmm0
mov -36(%rsi), %r9
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_44bytes):
mov -44(%rsi), %r8
mov -36(%rsi), %r9
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_36bytes):
mov -36(%rsi), %r9
mov -28(%rsi), %r10
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_28bytes):
mov -28(%rsi), %r10
mov -20(%rsi), %r11
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_20bytes):
mov -20(%rsi), %r11
mov -12(%rsi), %rcx
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_12bytes):
mov -12(%rsi), %rcx
mov -4(%rsi), %edx
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_4bytes):
mov -4(%rsi), %edx
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_77bytes):
movdqu -77(%rsi), %xmm0
movdqu -61(%rsi), %xmm1
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_69bytes):
movdqu -69(%rsi), %xmm0
movdqu -53(%rsi), %xmm1
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_61bytes):
movdqu -61(%rsi), %xmm0
mov -45(%rsi), %r8
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_53bytes):
movdqu -53(%rsi), %xmm0
mov -45(%rsi), %r8
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_45bytes):
mov -45(%rsi), %r8
mov -37(%rsi), %r9
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_37bytes):
mov -37(%rsi), %r9
mov -29(%rsi), %r10
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_29bytes):
mov -29(%rsi), %r10
mov -21(%rsi), %r11
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_21bytes):
mov -21(%rsi), %r11
mov -13(%rsi), %rcx
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_13bytes):
mov -13(%rsi), %rcx
mov -8(%rsi), %rdx
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_5bytes):
mov -5(%rsi), %edx
mov -4(%rsi), %ecx
mov %ecx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_78bytes):
movdqu -78(%rsi), %xmm0
movdqu -62(%rsi), %xmm1
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_70bytes):
movdqu -70(%rsi), %xmm0
movdqu -54(%rsi), %xmm1
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_62bytes):
movdqu -62(%rsi), %xmm0
mov -46(%rsi), %r8
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_54bytes):
movdqu -54(%rsi), %xmm0
mov -38(%rsi), %r9
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_46bytes):
mov -46(%rsi), %r8
mov -38(%rsi), %r9
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_38bytes):
mov -38(%rsi), %r9
mov -30(%rsi), %r10
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_30bytes):
mov -30(%rsi), %r10
mov -22(%rsi), %r11
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_22bytes):
mov -22(%rsi), %r11
mov -14(%rsi), %rcx
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_14bytes):
mov -14(%rsi), %rcx
mov -8(%rsi), %rdx
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_6bytes):
mov -6(%rsi), %edx
mov -4(%rsi), %ecx
mov %ecx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_79bytes):
movdqu -79(%rsi), %xmm0
movdqu -63(%rsi), %xmm1
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_71bytes):
movdqu -71(%rsi), %xmm0
movdqu -55(%rsi), %xmm1
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_63bytes):
movdqu -63(%rsi), %xmm0
mov -47(%rsi), %r8
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_55bytes):
movdqu -55(%rsi), %xmm0
mov -39(%rsi), %r9
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_47bytes):
mov -47(%rsi), %r8
mov -39(%rsi), %r9
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_39bytes):
mov -39(%rsi), %r9
mov -31(%rsi), %r10
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_31bytes):
mov -31(%rsi), %r10
mov -23(%rsi), %r11
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_23bytes):
mov -23(%rsi), %r11
mov -15(%rsi), %rcx
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_15bytes):
mov -15(%rsi), %rcx
mov -8(%rsi), %rdx
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_7bytes):
mov -7(%rsi), %edx
mov -4(%rsi), %ecx
mov %ecx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(large_page_fwd):
movdqu (%rsi), %xmm1
lea 16(%rsi), %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
#ifdef USE_AS_MEMMOVE
- ALIGN (4)
+ .p2align 4
L(ll_cache_copy_fwd_start):
prefetcht0 0x1c0(%rsi)
prefetcht0 0x200(%rsi)
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
#endif
- ALIGN (4)
+ .p2align 4
L(large_page_bwd):
movdqu -0x10(%rsi), %xmm1
lea -16(%rsi), %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
#ifdef USE_AS_MEMMOVE
- ALIGN (4)
+ .p2align 4
L(ll_cache_copy_bwd_start):
prefetcht0 -0x1c0(%rsi)
prefetcht0 -0x200(%rsi)
END (MEMCPY)
.section .rodata.ssse3,"a",@progbits
- ALIGN (3)
+ .p2align 3
L(table_less_80bytes):
.int JMPTBL (L(write_0bytes), L(table_less_80bytes))
.int JMPTBL (L(write_1bytes), L(table_less_80bytes))
.int JMPTBL (L(write_78bytes), L(table_less_80bytes))
.int JMPTBL (L(write_79bytes), L(table_less_80bytes))
- ALIGN (3)
+ .p2align 3
L(shl_table):
.int JMPTBL (L(shl_0), L(shl_table))
.int JMPTBL (L(shl_1), L(shl_table))
.int JMPTBL (L(shl_14), L(shl_table))
.int JMPTBL (L(shl_15), L(shl_table))
- ALIGN (3)
+ .p2align 3
L(shl_table_bwd):
.int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
.int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
<http://www.gnu.org/licenses/>. */
#include "sysdep.h"
-#define ALIGN(x) .p2align x
ENTRY ( __strcmp_sse2_unaligned)
movl %edi, %eax
subl %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(next_48_bytes):
movdqu 16(%rdi), %xmm6
movdqu 16(%rsi), %xmm3
movq %rcx, %rsi
jmp L(loop_start)
- ALIGN (4)
+ .p2align 4
L(loop):
addq $64, %rax
addq $64, %rdx
subl %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(loop_cross_page):
xor %r10, %r10
movq %rdx, %r9
subl %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(cross_page_loop):
cmpb %cl, %al
jne L(different)
#include <sysdep.h>
-# ifndef ALIGN
-# define ALIGN(n) .p2align n
-# endif
-
-
.text
ENTRY (strchr)
movd %esi, %xmm1
#endif
ret
- ALIGN(3)
+ .p2align 3
L(next_48_bytes):
movdqu 16(%rdi), %xmm0
movdqa %xmm0, %xmm4
L(loop_start):
/* We use this alignment to force the loop to be aligned to 8 but not
   16 bytes.  This gives better scheduling on AMD processors.  */
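/* A sketch of why this works, assuming the usual encodings: the pxor
   and the andq with an 8-bit immediate below occupy 4 bytes each, so
   after the .p2align 4 the following .p2align 3 adds no padding and
   L(loop64) lands at offset 8 modulo 16.  */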
- ALIGN(4)
+ .p2align 4
pxor %xmm6, %xmm6
andq $-64, %rdi
- ALIGN(3)
+ .p2align 3
L(loop64):
addq $64, %rdi
movdqa (%rdi), %xmm5
orq %rcx, %rax
salq $48, %rdx
orq %rdx, %rax
- ALIGN(3)
+ .p2align 3
L(return):
bsfq %rax, %rax
#ifdef AS_STRCHRNUL
cmovne %rdx, %rax
#endif
ret
- ALIGN(4)
+ .p2align 4
L(cross_page):
movq %rdi, %rdx
#include <sysdep.h>
-# ifndef ALIGN
-# define ALIGN(n) .p2align n
-# endif
-
-
.text
ENTRY (strrchr)
movd %esi, %xmm1
addq %rdi, %rax
ret
- ALIGN(4)
+ .p2align 4
L(next_48_bytes):
movdqu 16(%rdi), %xmm4
movdqa %xmm4, %xmm5
leaq (%rdi,%rsi), %rax
ret
- ALIGN(4)
+ .p2align 4
L(loop_header2):
testq %rsi, %rsi
movq %rdi, %rcx
andq $-64, %rdi
jmp L(loop_entry)
- ALIGN(4)
+ .p2align 4
L(loop64):
testq %rdx, %rdx
cmovne %rdx, %rsi
leaq (%rcx,%rsi), %rax
ret
- ALIGN(4)
+ .p2align 4
L(no_c_found):
movl $1, %esi
xorl %ecx, %ecx
jmp L(loop_header)
- ALIGN(4)
+ .p2align 4
L(exit):
xorl %eax, %eax
ret
- ALIGN(4)
+ .p2align 4
L(cross_page):
movq %rdi, %rax
pxor %xmm0, %xmm0