1 /* memcmp with SSE4.2, wmemcmp with SSE4.2
2 Copyright (C) 2010-2012 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
26 # define MEMCMP __memcmp_sse4_2
/* Unwind annotation that accompanies a 4-byte push of REG.  It invokes
   cfi_adjust_cfa_offset (4) and cfi_rel_offset (REG, 0); those helpers are
   defined outside this view and presumably wrap the like-named .cfi_*
   assembler directives (CFA moved by 4, REG saved at offset 0).  */
29 # define CFI_PUSH(REG) \
30 cfi_adjust_cfa_offset (4); \
31 cfi_rel_offset (REG, 0)
/* Unwind annotation that accompanies a 4-byte pop of REG: the CFA offset
   is adjusted by -4.
   NOTE(review): the definition continues after the trailing backslash,
   but the continuation line is not visible here -- confirm against the
   full file.  */
33 # define CFI_POP(REG) \
34 cfi_adjust_cfa_offset (-4); \
/* PUSH and POP pair each pushl/popl of REG with the matching CFI_PUSH /
   CFI_POP annotation, so the unwind description is updated at the same
   point as the stack pointer.  */
37 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
38 # define POP(REG) popl REG; CFI_POP (REG)
/* BLK2 is BLK1 plus 4.  BLK1 is defined outside this view.  */
42 # define BLK2 BLK1 + 4
/* Pop the saved %ebx and return.  The CFI_PUSH placed after the ret is
   never reached by this return path; presumably it re-establishes the
   "%ebx is pushed" unwind state for the code that is assembled after the
   macro use.  */
44 # define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
/* Relative-offset form of a jump table entry: the distance of label I
   from the table base B.  */
48 # define JMPTBL(I, B) I - B
50 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
51 jump table with relative offsets. INDEX is a register that contains the
52 index into the jump table. SCALE is the scale of INDEX. */
54 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
55 /* We first load PC into EBX. */ \
57 /* Get the address of the jump table. */ \
58 addl $(TABLE - .), %ebx; \
59 /* Get the entry and convert the relative offset to the \
60 absolute address. */ \
61 addl (%ebx,INDEX,SCALE), %ebx; \
62 /* EBX now holds the absolute address of the target entry. Go. */ \
/* Absolute-address form of a jump table entry: the entry is label I
   itself; the table base B is unused.  */
65 # define JMPTBL(I, B) I
67 /* Branch through an entry in a jump table. TABLE is a jump table of
68 absolute addresses. INDEX is a register that contains the
69 index into the jump table. SCALE is the scale of INDEX. */
70 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
71 jmp *TABLE(,INDEX,SCALE)
76 wmemcmp has to use SIGNED comparison for elements.
77 memcmp has to use UNSIGNED comparison for elements.
80 .section .text.sse4.2,"ax",@progbits
86 # ifdef USE_AS_WMEMCMP
100 # ifndef USE_AS_WMEMCMP
110 BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
112 # ifndef USE_AS_WMEMCMP
174 # ifdef USE_AS_WMEMCMP
176 /* for wmemcmp, case N == 1 */
184 jg L(find_diff_bigger)
198 # ifndef USE_AS_WMEMCMP
218 L(64bytesormore_loop):
225 movdqu 16(%eax), %xmm1
226 movdqu 16(%edx), %xmm2
231 movdqu 32(%eax), %xmm1
232 movdqu 32(%edx), %xmm2
237 movdqu 48(%eax), %xmm1
238 movdqu 48(%edx), %xmm2
245 jae L(64bytesormore_loop)
249 BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
251 # ifdef USE_AS_WMEMCMP
253 /* This label is needed only to fill table_64bytes. */
269 # ifndef USE_AS_WMEMCMP
315 # ifndef USE_AS_WMEMCMP
318 movdqu -49(%eax), %xmm1
319 movdqu -49(%edx), %xmm2
325 movdqu -33(%eax), %xmm1
326 movdqu -33(%edx), %xmm2
351 movzbl -1(%eax), %ecx
360 movdqu -50(%eax), %xmm1
361 movdqu -50(%edx), %xmm2
367 movdqu -34(%eax), %xmm1
368 movdqu -34(%edx), %xmm2
393 movzwl -2(%eax), %ecx
394 movzwl -2(%edx), %ebx
405 movdqu -51(%eax), %xmm1
406 movdqu -51(%edx), %xmm2
412 movdqu -35(%eax), %xmm1
413 movdqu -35(%edx), %xmm2
438 movzwl -3(%eax), %ecx
439 movzwl -3(%edx), %ebx
445 movzbl -1(%eax), %eax
453 movdqu -52(%eax), %xmm1
454 movdqu -52(%edx), %xmm2
460 movdqu -36(%eax), %xmm1
461 movdqu -36(%edx), %xmm2
467 movdqu -20(%eax), %xmm1
468 movdqu -20(%edx), %xmm2
474 # ifndef USE_AS_WMEMCMP
484 # ifndef USE_AS_WMEMCMP
487 movdqu -53(%eax), %xmm1
488 movdqu -53(%edx), %xmm2
495 movdqu -37(%eax), %xmm1
496 movdqu -37(%edx), %xmm2
502 movdqu -21(%eax), %xmm1
503 movdqu -21(%edx), %xmm2
511 movzbl -1(%eax), %ecx
519 movdqu -54(%eax), %xmm1
520 movdqu -54(%edx), %xmm2
527 movdqu -38(%eax), %xmm1
528 movdqu -38(%edx), %xmm2
534 movdqu -22(%eax), %xmm1
535 movdqu -22(%edx), %xmm2
544 movzwl -2(%eax), %ecx
545 movzwl -2(%edx), %ebx
555 movdqu -55(%eax), %xmm1
556 movdqu -55(%edx), %xmm2
563 movdqu -39(%eax), %xmm1
564 movdqu -39(%edx), %xmm2
570 movdqu -23(%eax), %xmm1
571 movdqu -23(%edx), %xmm2
579 movzwl -3(%eax), %ecx
580 movzwl -3(%edx), %ebx
585 movzbl -1(%eax), %eax
593 movdqu -56(%eax), %xmm1
594 movdqu -56(%edx), %xmm2
601 movdqu -40(%eax), %xmm1
602 movdqu -40(%edx), %xmm2
608 movdqu -24(%eax), %xmm1
609 movdqu -24(%edx), %xmm2
615 # ifndef USE_AS_WMEMCMP
624 # ifndef USE_AS_WMEMCMP
634 # ifndef USE_AS_WMEMCMP
637 movdqu -57(%eax), %xmm1
638 movdqu -57(%edx), %xmm2
645 movdqu -41(%eax), %xmm1
646 movdqu -41(%edx), %xmm2
652 movdqu -25(%eax), %xmm1
653 movdqu -25(%edx), %xmm2
665 movzbl -1(%eax), %ecx
673 movdqu -58(%eax), %xmm1
674 movdqu -58(%edx), %xmm2
681 movdqu -42(%eax), %xmm1
682 movdqu -42(%edx), %xmm2
688 movdqu -26(%eax), %xmm1
689 movdqu -26(%edx), %xmm2
704 movzwl -2(%eax), %ecx
705 movzwl -2(%edx), %ebx
715 movdqu -59(%eax), %xmm1
716 movdqu -59(%edx), %xmm2
723 movdqu -43(%eax), %xmm1
724 movdqu -43(%edx), %xmm2
730 movdqu -27(%eax), %xmm1
731 movdqu -27(%edx), %xmm2
743 movzwl -3(%eax), %ecx
744 movzwl -3(%edx), %ebx
749 movzbl -1(%eax), %eax
757 movdqu -60(%eax), %xmm1
758 movdqu -60(%edx), %xmm2
765 movdqu -44(%eax), %xmm1
766 movdqu -44(%edx), %xmm2
772 movdqu -28(%eax), %xmm1
773 movdqu -28(%edx), %xmm2
779 # ifndef USE_AS_WMEMCMP
788 # ifndef USE_AS_WMEMCMP
797 # ifndef USE_AS_WMEMCMP
807 # ifndef USE_AS_WMEMCMP
810 movdqu -61(%eax), %xmm1
811 movdqu -61(%edx), %xmm2
818 movdqu -45(%eax), %xmm1
819 movdqu -45(%edx), %xmm2
825 movdqu -29(%eax), %xmm1
826 movdqu -29(%edx), %xmm2
845 movzbl -1(%eax), %ecx
853 movdqu -62(%eax), %xmm1
854 movdqu -62(%edx), %xmm2
861 movdqu -46(%eax), %xmm1
862 movdqu -46(%edx), %xmm2
868 movdqu -30(%eax), %xmm1
869 movdqu -30(%edx), %xmm2
885 movzwl -2(%eax), %ecx
886 movzwl -2(%edx), %ebx
896 movdqu -63(%eax), %xmm1
897 movdqu -63(%edx), %xmm2
904 movdqu -47(%eax), %xmm1
905 movdqu -47(%edx), %xmm2
911 movdqu -31(%eax), %xmm1
912 movdqu -31(%edx), %xmm2
929 movzwl -3(%eax), %ecx
930 movzwl -3(%edx), %ebx
935 movzbl -1(%eax), %eax
944 movdqu -64(%eax), %xmm1
945 movdqu -64(%edx), %xmm2
951 movdqu -48(%eax), %xmm1
952 movdqu -48(%edx), %xmm2
958 movdqu -32(%eax), %xmm1
959 movdqu -32(%edx), %xmm2
966 # ifndef USE_AS_WMEMCMP
975 # ifndef USE_AS_WMEMCMP
984 # ifndef USE_AS_WMEMCMP
993 # ifndef USE_AS_WMEMCMP
1003 # ifndef USE_AS_WMEMCMP
1058 # ifndef USE_AS_WMEMCMP
/* Jump table used by BRANCH_TO_JMPTBL_ENTRY (L(table_64bytes), %ecx, 4).
   It has 65 entries; entry N is built with JMPTBL and names the handler
   label for index N.  Two alternative copies of the table follow.  */
1088 .section .rodata.sse4.2,"a",@progbits
1090 .type L(table_64bytes), @object
1091 # ifndef USE_AS_WMEMCMP
/* Copy used when USE_AS_WMEMCMP is not defined: every index 0..64 has
   its own L(Nbytes) handler.  */
1093 .int JMPTBL (L(0bytes), L(table_64bytes))
1094 .int JMPTBL (L(1bytes), L(table_64bytes))
1095 .int JMPTBL (L(2bytes), L(table_64bytes))
1096 .int JMPTBL (L(3bytes), L(table_64bytes))
1097 .int JMPTBL (L(4bytes), L(table_64bytes))
1098 .int JMPTBL (L(5bytes), L(table_64bytes))
1099 .int JMPTBL (L(6bytes), L(table_64bytes))
1100 .int JMPTBL (L(7bytes), L(table_64bytes))
1101 .int JMPTBL (L(8bytes), L(table_64bytes))
1102 .int JMPTBL (L(9bytes), L(table_64bytes))
1103 .int JMPTBL (L(10bytes), L(table_64bytes))
1104 .int JMPTBL (L(11bytes), L(table_64bytes))
1105 .int JMPTBL (L(12bytes), L(table_64bytes))
1106 .int JMPTBL (L(13bytes), L(table_64bytes))
1107 .int JMPTBL (L(14bytes), L(table_64bytes))
1108 .int JMPTBL (L(15bytes), L(table_64bytes))
1109 .int JMPTBL (L(16bytes), L(table_64bytes))
1110 .int JMPTBL (L(17bytes), L(table_64bytes))
1111 .int JMPTBL (L(18bytes), L(table_64bytes))
1112 .int JMPTBL (L(19bytes), L(table_64bytes))
1113 .int JMPTBL (L(20bytes), L(table_64bytes))
1114 .int JMPTBL (L(21bytes), L(table_64bytes))
1115 .int JMPTBL (L(22bytes), L(table_64bytes))
1116 .int JMPTBL (L(23bytes), L(table_64bytes))
1117 .int JMPTBL (L(24bytes), L(table_64bytes))
1118 .int JMPTBL (L(25bytes), L(table_64bytes))
1119 .int JMPTBL (L(26bytes), L(table_64bytes))
1120 .int JMPTBL (L(27bytes), L(table_64bytes))
1121 .int JMPTBL (L(28bytes), L(table_64bytes))
1122 .int JMPTBL (L(29bytes), L(table_64bytes))
1123 .int JMPTBL (L(30bytes), L(table_64bytes))
1124 .int JMPTBL (L(31bytes), L(table_64bytes))
1125 .int JMPTBL (L(32bytes), L(table_64bytes))
1126 .int JMPTBL (L(33bytes), L(table_64bytes))
1127 .int JMPTBL (L(34bytes), L(table_64bytes))
1128 .int JMPTBL (L(35bytes), L(table_64bytes))
1129 .int JMPTBL (L(36bytes), L(table_64bytes))
1130 .int JMPTBL (L(37bytes), L(table_64bytes))
1131 .int JMPTBL (L(38bytes), L(table_64bytes))
1132 .int JMPTBL (L(39bytes), L(table_64bytes))
1133 .int JMPTBL (L(40bytes), L(table_64bytes))
1134 .int JMPTBL (L(41bytes), L(table_64bytes))
1135 .int JMPTBL (L(42bytes), L(table_64bytes))
1136 .int JMPTBL (L(43bytes), L(table_64bytes))
1137 .int JMPTBL (L(44bytes), L(table_64bytes))
1138 .int JMPTBL (L(45bytes), L(table_64bytes))
1139 .int JMPTBL (L(46bytes), L(table_64bytes))
1140 .int JMPTBL (L(47bytes), L(table_64bytes))
1141 .int JMPTBL (L(48bytes), L(table_64bytes))
1142 .int JMPTBL (L(49bytes), L(table_64bytes))
1143 .int JMPTBL (L(50bytes), L(table_64bytes))
1144 .int JMPTBL (L(51bytes), L(table_64bytes))
1145 .int JMPTBL (L(52bytes), L(table_64bytes))
1146 .int JMPTBL (L(53bytes), L(table_64bytes))
1147 .int JMPTBL (L(54bytes), L(table_64bytes))
1148 .int JMPTBL (L(55bytes), L(table_64bytes))
1149 .int JMPTBL (L(56bytes), L(table_64bytes))
1150 .int JMPTBL (L(57bytes), L(table_64bytes))
1151 .int JMPTBL (L(58bytes), L(table_64bytes))
1152 .int JMPTBL (L(59bytes), L(table_64bytes))
1153 .int JMPTBL (L(60bytes), L(table_64bytes))
1154 .int JMPTBL (L(61bytes), L(table_64bytes))
1155 .int JMPTBL (L(62bytes), L(table_64bytes))
1156 .int JMPTBL (L(63bytes), L(table_64bytes))
1157 .int JMPTBL (L(64bytes), L(table_64bytes))
/* Second copy of the table: only indices that are multiples of 4 name a
   real L(Nbytes) handler; every other index is filled with
   L(unreal_case).
   NOTE(review): presumably this is the USE_AS_WMEMCMP alternative -- the
   "# else" line separating the two copies is not visible here.  */
1160 .int JMPTBL (L(0bytes), L(table_64bytes))
1161 .int JMPTBL (L(unreal_case), L(table_64bytes))
1162 .int JMPTBL (L(unreal_case), L(table_64bytes))
1163 .int JMPTBL (L(unreal_case), L(table_64bytes))
1164 .int JMPTBL (L(4bytes), L(table_64bytes))
1165 .int JMPTBL (L(unreal_case), L(table_64bytes))
1166 .int JMPTBL (L(unreal_case), L(table_64bytes))
1167 .int JMPTBL (L(unreal_case), L(table_64bytes))
1168 .int JMPTBL (L(8bytes), L(table_64bytes))
1169 .int JMPTBL (L(unreal_case), L(table_64bytes))
1170 .int JMPTBL (L(unreal_case), L(table_64bytes))
1171 .int JMPTBL (L(unreal_case), L(table_64bytes))
1172 .int JMPTBL (L(12bytes), L(table_64bytes))
1173 .int JMPTBL (L(unreal_case), L(table_64bytes))
1174 .int JMPTBL (L(unreal_case), L(table_64bytes))
1175 .int JMPTBL (L(unreal_case), L(table_64bytes))
1176 .int JMPTBL (L(16bytes), L(table_64bytes))
1177 .int JMPTBL (L(unreal_case), L(table_64bytes))
1178 .int JMPTBL (L(unreal_case), L(table_64bytes))
1179 .int JMPTBL (L(unreal_case), L(table_64bytes))
1180 .int JMPTBL (L(20bytes), L(table_64bytes))
1181 .int JMPTBL (L(unreal_case), L(table_64bytes))
1182 .int JMPTBL (L(unreal_case), L(table_64bytes))
1183 .int JMPTBL (L(unreal_case), L(table_64bytes))
1184 .int JMPTBL (L(24bytes), L(table_64bytes))
1185 .int JMPTBL (L(unreal_case), L(table_64bytes))
1186 .int JMPTBL (L(unreal_case), L(table_64bytes))
1187 .int JMPTBL (L(unreal_case), L(table_64bytes))
1188 .int JMPTBL (L(28bytes), L(table_64bytes))
1189 .int JMPTBL (L(unreal_case), L(table_64bytes))
1190 .int JMPTBL (L(unreal_case), L(table_64bytes))
1191 .int JMPTBL (L(unreal_case), L(table_64bytes))
1192 .int JMPTBL (L(32bytes), L(table_64bytes))
1193 .int JMPTBL (L(unreal_case), L(table_64bytes))
1194 .int JMPTBL (L(unreal_case), L(table_64bytes))
1195 .int JMPTBL (L(unreal_case), L(table_64bytes))
1196 .int JMPTBL (L(36bytes), L(table_64bytes))
1197 .int JMPTBL (L(unreal_case), L(table_64bytes))
1198 .int JMPTBL (L(unreal_case), L(table_64bytes))
1199 .int JMPTBL (L(unreal_case), L(table_64bytes))
1200 .int JMPTBL (L(40bytes), L(table_64bytes))
1201 .int JMPTBL (L(unreal_case), L(table_64bytes))
1202 .int JMPTBL (L(unreal_case), L(table_64bytes))
1203 .int JMPTBL (L(unreal_case), L(table_64bytes))
1204 .int JMPTBL (L(44bytes), L(table_64bytes))
1205 .int JMPTBL (L(unreal_case), L(table_64bytes))
1206 .int JMPTBL (L(unreal_case), L(table_64bytes))
1207 .int JMPTBL (L(unreal_case), L(table_64bytes))
1208 .int JMPTBL (L(48bytes), L(table_64bytes))
1209 .int JMPTBL (L(unreal_case), L(table_64bytes))
1210 .int JMPTBL (L(unreal_case), L(table_64bytes))
1211 .int JMPTBL (L(unreal_case), L(table_64bytes))
1212 .int JMPTBL (L(52bytes), L(table_64bytes))
1213 .int JMPTBL (L(unreal_case), L(table_64bytes))
1214 .int JMPTBL (L(unreal_case), L(table_64bytes))
1215 .int JMPTBL (L(unreal_case), L(table_64bytes))
1216 .int JMPTBL (L(56bytes), L(table_64bytes))
1217 .int JMPTBL (L(unreal_case), L(table_64bytes))
1218 .int JMPTBL (L(unreal_case), L(table_64bytes))
1219 .int JMPTBL (L(unreal_case), L(table_64bytes))
1220 .int JMPTBL (L(60bytes), L(table_64bytes))
1221 .int JMPTBL (L(unreal_case), L(table_64bytes))
1222 .int JMPTBL (L(unreal_case), L(table_64bytes))
1223 .int JMPTBL (L(unreal_case), L(table_64bytes))
1224 .int JMPTBL (L(64bytes), L(table_64bytes))