PATCH: Optimize memcmp for ia32
H. J. Lu
hjl@lucon.org
Thu Feb 5 00:11:00 GMT 2004
This patch optimizes memcmp for ia32. I got an average speedup of
around 400%.
H.J.
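
The old code compares byte by byte with repe cmpsb. The new code
compares a dword at a time: blocks of 32 bytes are checked with eight
4-byte compares, and the remaining 0-31 bytes fall into an unrolled
tail reached through a jump table indexed by the remainder. A rough C
sketch of the strategy (illustrative only; the function name is made
up, and this is not the patch itself):

#include <string.h>

/* Sketch: compare one 32-bit word per step, then finish the tail
   byte-wise.  The patch unrolls the tail behind a jump table and
   computes the sign of the result without branches.  */
int memcmp_sketch (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1, *p2 = s2;

  while (n >= 4)
    {
      unsigned int a, b;
      memcpy (&a, p1, 4);	/* unaligned-safe dword loads */
      memcpy (&b, p2, 4);
      if (a != b)
	break;			/* locate the byte in the tail loop */
      p1 += 4, p2 += 4, n -= 4;
    }

  /* Bytes must be ranked in memory order: x86 is little-endian, so a
     plain dword compare would rank them backwards.  */
  for (; n > 0; p1++, p2++, n--)
    if (*p1 != *p2)
      return *p1 < *p2 ? -1 : 1;
  return 0;
}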
-----
2004-01-15 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/i386/memcmp.S: Optimized for IA-32.
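
The unrolled tail is reached through a jump table that stores offsets
relative to the table itself, so the code needs no run-time
relocations: __i686.get_pc_thunk.bx loads the PC, an assembly-time
constant turns that into the table's address, and adding the fetched
entry back in yields the absolute target. A C analogue of that
dispatch, using GCC's labels-as-values extension (a sketch with
made-up names, not code from the patch):

/* Dispatch on a 0-3 byte tail through a computed goto, mirroring the
   patch's jump into the unrolled compares.  Requires GCC's
   labels-as-values extension.  */
static int tail_cmp (const unsigned char *p1, const unsigned char *p2,
		     unsigned int rem)	/* rem must be 0..3 */
{
  static const void *const table[4] = { &&bytes0, &&bytes1,
					&&bytes2, &&bytes3 };

  /* Advance past the tail, as the patch does, so every label can use
     a fixed negative offset and fall through to the next.  */
  p1 += rem, p2 += rem;
  goto *table[rem];

 bytes3: if (p1[-3] != p2[-3]) return p1[-3] < p2[-3] ? -1 : 1;
 bytes2: if (p1[-2] != p2[-2]) return p1[-2] < p2[-2] ? -1 : 1;
 bytes1: if (p1[-1] != p2[-1]) return p1[-1] < p2[-1] ? -1 : 1;
 bytes0: return 0;
}

The same trick scales to the 32-entry table in the patch: jumping to
the label for "rem bytes left" compares exactly the last rem bytes.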
--- sysdeps/i386/memcmp.S.i386 2001-07-05 21:55:52.000000000 -0700
+++ sysdeps/i386/memcmp.S 2003-08-21 07:05:58.000000000 -0700
@@ -1,5 +1,5 @@
/* Compare two memory blocks for differences in the first COUNT bytes.
- Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
+ Copyright (C) 1995, 1996, 1997, 2000, 2003 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -22,53 +22,370 @@
#include "bp-sym.h"
#include "bp-asm.h"
-#define PARMS LINKAGE+4 /* space for 1 saved reg */
-#define BLK1 PARMS
-#define BLK2 BLK1+PTR_SIZE
-#define LEN BLK2+PTR_SIZE
+#define PARMS LINKAGE+4 /* Preserve EBX. */
+#define BLK1 PARMS
+#define BLK2 BLK1+PTR_SIZE
+#define LEN BLK2+PTR_SIZE
+#define ENTRANCE pushl %ebx; ENTER
+#define RETURN popl %ebx; LEAVE; ret
+
+/* Load an entry in a jump table into EBX. TABLE is a jump table
+ with relative offsets. INDEX is a register containing the index
+ into the jump table. */
+#define LOAD_JUMP_TABLE_ENTRY(TABLE, INDEX) \
+ /* We first load PC into EBX. */ \
+ call __i686.get_pc_thunk.bx; \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,4), %ebx
+
+#ifdef HAVE_HIDDEN
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+#else
+ .text
+#endif
+ ALIGN (4)
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
- .text
+ .text
+ ALIGN (4)
ENTRY (BP_SYM (memcmp))
- ENTER
+ ENTRANCE
- pushl %esi /* Save callee-safe registers. */
- movl %edi, %edx /* Note that %edx is not used and can
- so be used to save %edi. It's faster. */
-
- movl BLK1(%esp), %esi
- movl BLK2(%esp), %edi
- movl LEN(%esp), %ecx
- CHECK_BOUNDS_LOW (%esi, BLK1(%esp))
- CHECK_BOUNDS_LOW (%edi, BLK2(%esp))
-
- cld /* Set direction of comparison. */
-
- xorl %eax, %eax /* Default result. */
-
- repe /* Compare at most %ecx bytes. */
- cmpsb
- jz L(1) /* If even last byte was equal we return 0. */
-
- /* The memory blocks are not equal. So result of the last
- subtraction is present in the carry flag. It is set when
- the byte in block #2 is bigger. In this case we have to
- return -1 (=0xffffffff), else 1. */
- sbbl %eax, %eax /* This is tricky. %eax == 0 and carry is set
- or not depending on last subtraction. */
-
- /* At this point %eax == 0, if the byte of block #1 was bigger, and
- 0xffffffff if the last byte of block #2 was bigger. The latter
- case is already correct but the former needs a little adjustment.
- Note that the following operation does not change 0xffffffff. */
- orb $1, %al /* Change 0 to 1. */
-
-L(1): CHECK_BOUNDS_HIGH (%esi, BLK1(%esp), jbe)
- CHECK_BOUNDS_HIGH (%edi, BLK2(%esp), jbe)
- popl %esi /* Restore registers. */
- movl %edx, %edi
+ movl BLK1(%esp), %eax
+ movl BLK2(%esp), %edx
+ movl LEN(%esp), %ecx
+
+ cmpl $1, %ecx
+ jne L(not_1)
+ movzbl (%eax), %ecx /* LEN == 1 */
+ cmpb (%edx), %cl
+ jne L(neq)
+L(bye):
+ xorl %eax, %eax
+ RETURN
+
+L(neq):
+ sbbl %eax, %eax
+ sbbl $-1, %eax
+ RETURN
+
+L(not_1):
+ jl L(bye) /* LEN == 0 */
+
+ pushl %esi
+ movl %eax, %esi
+ cmpl $32, %ecx;
+ jge L(32bytesormore) /* LEN >= 32 */
+
+ LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx)
+ addl %ecx, %edx
+ addl %ecx, %esi
+ jmp *%ebx
+
+ ALIGN (4)
+L(28bytes):
+ movl -28(%esi), %eax
+ movl -28(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(24bytes):
+ movl -24(%esi), %eax
+ movl -24(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(20bytes):
+ movl -20(%esi), %eax
+ movl -20(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(16bytes):
+ movl -16(%esi), %eax
+ movl -16(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(12bytes):
+ movl -12(%esi), %eax
+ movl -12(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(8bytes):
+ movl -8(%esi), %eax
+ movl -8(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(4bytes):
+ movl -4(%esi), %eax
+ movl -4(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(0bytes):
+ popl %esi
+ xorl %eax, %eax
+ RETURN
+
+L(29bytes):
+ movl -29(%esi), %eax
+ movl -29(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(25bytes):
+ movl -25(%esi), %eax
+ movl -25(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(21bytes):
+ movl -21(%esi), %eax
+ movl -21(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(17bytes):
+ movl -17(%esi), %eax
+ movl -17(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(13bytes):
+ movl -13(%esi), %eax
+ movl -13(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(9bytes):
+ movl -9(%esi), %eax
+ movl -9(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(5bytes):
+ movl -5(%esi), %eax
+ movl -5(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(1bytes):
+ movzbl -1(%esi), %eax
+ cmpb -1(%edx), %al
+ jne L(set)
+ popl %esi
+ xorl %eax, %eax
+ RETURN
+
+L(30bytes):
+ movl -30(%esi), %eax
+ movl -30(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(26bytes):
+ movl -26(%esi), %eax
+ movl -26(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(22bytes):
+ movl -22(%esi), %eax
+ movl -22(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(18bytes):
+ movl -18(%esi), %eax
+ movl -18(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(14bytes):
+ movl -14(%esi), %eax
+ movl -14(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(10bytes):
+ movl -10(%esi), %eax
+ movl -10(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(6bytes):
+ movl -6(%esi), %eax
+ movl -6(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(2bytes):
+ movzwl -2(%esi), %eax
+ movzwl -2(%edx), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmpl %ecx, %eax
+ jne L(set)
+ popl %esi
+ xorl %eax, %eax
+ RETURN
+
+L(31bytes):
+ movl -31(%esi), %eax
+ movl -31(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(27bytes):
+ movl -27(%esi), %eax
+ movl -27(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(23bytes):
+ movl -23(%esi), %eax
+ movl -23(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(19bytes):
+ movl -19(%esi), %eax
+ movl -19(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%esi), %eax
+ movl -15(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%esi), %eax
+ movl -11(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%esi), %eax
+ movl -7(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(3bytes):
+ movzwl -3(%esi), %eax
+ movzwl -3(%edx), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmpl %ecx, %eax
+ jne L(set)
+ movzbl -1(%esi), %eax
+ cmpb -1(%edx), %al
+ jne L(set)
+ popl %esi
+ xorl %eax, %eax
+ RETURN
+
+ ALIGN (4)
+/* ECX >= 32. */
+L(32bytesormore):
+ subl $32, %ecx
+
+ movl (%esi), %eax
+ cmpl (%edx), %eax
+ jne L(load_ecx)
+
+ movl 4(%esi), %eax
+ cmpl 4(%edx), %eax
+ jne L(load_ecx_4)
+
+ movl 8(%esi), %eax
+ cmpl 8(%edx), %eax
+ jne L(load_ecx_8)
+
+ movl 12(%esi), %eax
+ cmpl 12(%edx), %eax
+ jne L(load_ecx_12)
+
+ movl 16(%esi), %eax
+ cmpl 16(%edx), %eax
+ jne L(load_ecx_16)
+
+ movl 20(%esi), %eax
+ cmpl 20(%edx), %eax
+ jne L(load_ecx_20)
+
+ movl 24(%esi), %eax
+ cmpl 24(%edx), %eax
+ jne L(load_ecx_24)
+
+ movl 28(%esi), %eax
+ cmpl 28(%edx), %eax
+ jne L(load_ecx_28)
+
+ addl $32, %esi
+ addl $32, %edx
+ cmpl $32, %ecx
+ jge L(32bytesormore)
+
+ LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx)
+ addl %ecx, %edx
+ addl %ecx, %esi
+ jmp *%ebx
+
+L(load_ecx_28):
+ addl $0x4, %edx
+L(load_ecx_24):
+ addl $0x4, %edx
+L(load_ecx_20):
+ addl $0x4, %edx
+L(load_ecx_16):
+ addl $0x4, %edx
+L(load_ecx_12):
+ addl $0x4, %edx
+L(load_ecx_8):
+ addl $0x4, %edx
+L(load_ecx_4):
+ addl $0x4, %edx
+L(load_ecx):
+ movl (%edx), %ecx
+
+L(find_diff):
+ cmpb %cl, %al
+ jne L(set)
+ cmpw %cx, %ax
+ jne L(set)
+ shrl $16,%eax
+ shrl $16,%ecx
+ cmpb %cl, %al
+ jne L(set)
+ /* We get there only if we already know there is a
+ difference. */
+ cmpl %ecx, %eax
+L(set):
+ sbbl %eax, %eax
+ sbbl $-1, %eax
+ popl %esi
+ RETURN
+
+ ALIGN (2)
+L(table_32bytes):
+ .long L(0bytes) - . + 0x0
+ .long L(1bytes) - . + 0x4
+ .long L(2bytes) - . + 0x8
+ .long L(3bytes) - . + 0xc
+ .long L(4bytes) - . + 0x10
+ .long L(5bytes) - . + 0x14
+ .long L(6bytes) - . + 0x18
+ .long L(7bytes) - . + 0x1c
+ .long L(8bytes) - . + 0x20
+ .long L(9bytes) - . + 0x24
+ .long L(10bytes) - . + 0x28
+ .long L(11bytes) - . + 0x2c
+ .long L(12bytes) - . + 0x30
+ .long L(13bytes) - . + 0x34
+ .long L(14bytes) - . + 0x38
+ .long L(15bytes) - . + 0x3c
+ .long L(16bytes) - . + 0x40
+ .long L(17bytes) - . + 0x44
+ .long L(18bytes) - . + 0x48
+ .long L(19bytes) - . + 0x4c
+ .long L(20bytes) - . + 0x50
+ .long L(21bytes) - . + 0x54
+ .long L(22bytes) - . + 0x58
+ .long L(23bytes) - . + 0x5c
+ .long L(24bytes) - . + 0x60
+ .long L(25bytes) - . + 0x64
+ .long L(26bytes) - . + 0x68
+ .long L(27bytes) - . + 0x6c
+ .long L(28bytes) - . + 0x70
+ .long L(29bytes) - . + 0x74
+ .long L(30bytes) - . + 0x78
+ .long L(31bytes) - . + 0x7c
- LEAVE
- ret
END (BP_SYM (memcmp))
#undef bcmp
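
For reference, the two sbbl instructions at L(set) turn the borrow
from the last byte compare into the return value without a branch:
sbbl %eax, %eax yields -1 when the byte from block 2 was bigger and 0
otherwise, and sbbl $-1, %eax then maps 0 to 1 while leaving -1 alone
(the old code spelled the same mapping orb $1, %al). A C equivalent of
that mapping (a sketch, not from the patch):

/* Branchless -1/1 from an unequal byte pair, mirroring
   sbbl %eax, %eax; sbbl $-1, %eax at L(set).  */
static int diff_sign (unsigned char a, unsigned char b)
{
  int r = -(a < b);	/* -1 on borrow (a < b), else 0 */
  return r | 1;		/* 0 -> 1, -1 stays -1 */
}

The L(find_diff) sequence feeds it correctly ordered flags: because
x86 is little-endian, it checks the low byte, then the low word, then
the third byte, so the carry reaching L(set) always reflects the first
byte that differs in memory order.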