This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] stdlib-bsearch: middle element calculation may overflow
On (03/30/17 09:18), Zack Weinberg wrote:
> On Thu, Mar 30, 2017 at 12:59 AM, Sergey Senozhatsky
> <sergey.senozhatsky.work@gmail.com> wrote:
> >
> > BTW, FreeBSD's kernel bsearch implementation is quite interesting
> > and unique. take a look:
> >
> > https://github.com/freebsd/freebsd/blob/master/sys/libkern/bsearch.c
>
> Oh, that's clever. I _thought_ there had to be a way to avoid the
> addition. This should indeed be immune to the overflow problem,
> although we still need to have a test for it.
>
> libstdc++-v3 appears to use the same trick: see
> https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=libstdc%2B%2B-v3/include/bits/stl_algo.h;h=2cd5303a100b998f80369c0baf2cd39aedf14b52;hb=af44a97c14af3a0079590662d49a8ced98954376#l2037
> (sorry for length of URL). So we can crib from that and not have to
> worry about copyrights.
interesting.
so, a simple-minded direct re-implementation of libstdc++
approach may look like this
void *__bsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
__compar_fn_t __compar)
{
const char *base = __base, *pivot;
size_t len = __nmemb;
size_t half;
int comparison;
while (len > 0) {
half = len >> 1;
pivot = base + half * __size;
comparison = (*__compar) (__key, pivot);
if (comparison == 0)
return (void *)pivot;
if (comparison < 0) {
len = half;
} else {
base = pivot + __size;
len = len - half - 1;
}
}
return NULL;
}
and it generates a bit worse code, than BSD version, due to this part
len = len - half - 1;
which is
sub $0x1,%rbx
lea (%r15,%rbp,1),%r12
sub %r14,%rbx
objdump
400730: 41 57 push %r15
400732: 41 56 push %r14
400734: 41 55 push %r13
400736: 41 54 push %r12
400738: 55 push %rbp
400739: 53 push %rbx
40073a: 48 83 ec 18 sub $0x18,%rsp
40073e: 48 85 d2 test %rdx,%rdx
400741: 48 89 7c 24 08 mov %rdi,0x8(%rsp)
400746: 74 51 je 400799 <_bsearch+0x69>
400748: 49 89 f4 mov %rsi,%r12
40074b: 48 89 d3 mov %rdx,%rbx
40074e: 48 89 cd mov %rcx,%rbp
400751: 4d 89 c5 mov %r8,%r13
400754: eb 1a jmp 400770 <_bsearch+0x40>
400756: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
40075d: 00 00 00
400760: 48 83 eb 01 sub $0x1,%rbx
400764: 4d 8d 24 2f lea (%r15,%rbp,1),%r12
400768: 4c 29 f3 sub %r14,%rbx
40076b: 48 85 db test %rbx,%rbx
40076e: 74 29 je 400799 <_bsearch+0x69>
400770: 49 89 de mov %rbx,%r14
400773: 48 8b 7c 24 08 mov 0x8(%rsp),%rdi
400778: 49 d1 ee shr %r14
40077b: 4d 89 f7 mov %r14,%r15
40077e: 4c 0f af fd imul %rbp,%r15
400782: 4d 01 e7 add %r12,%r15
400785: 4c 89 fe mov %r15,%rsi
400788: 41 ff d5 callq *%r13
40078b: 85 c0 test %eax,%eax
40078d: 74 0d je 40079c <_bsearch+0x6c>
40078f: 79 cf jns 400760 <_bsearch+0x30>
400791: 4c 89 f3 mov %r14,%rbx
400794: 48 85 db test %rbx,%rbx
400797: 75 d7 jne 400770 <_bsearch+0x40>
400799: 45 31 ff xor %r15d,%r15d
40079c: 48 83 c4 18 add $0x18,%rsp
4007a0: 4c 89 f8 mov %r15,%rax
4007a3: 5b pop %rbx
4007a4: 5d pop %rbp
4007a5: 41 5c pop %r12
4007a7: 41 5d pop %r13
4007a9: 41 5e pop %r14
4007ab: 41 5f pop %r15
4007ad: c3 retq
4007ae: 66 90 xchg %ax,%ax
while something like this (basically, a BDS version)
void *bsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
__compar_fn_t __compar)
{
const char *base = __base, *pivot;
size_t len = __nmemb;
int comparison;
while (len > 0) {
pivot = base + (len >> 1) * __size;
comparison = (*__compar) (__key, pivot);
if (comparison == 0)
return (void *)pivot;
if (comparison > 0) {
base = pivot + __size;
len--;
}
len >>= 1;
}
return NULL;
}
uses
shr %rbx
objdump
4006b0: 41 57 push %r15
4006b2: 41 56 push %r14
4006b4: 41 55 push %r13
4006b6: 41 54 push %r12
4006b8: 55 push %rbp
4006b9: 53 push %rbx
4006ba: 48 83 ec 18 sub $0x18,%rsp
4006be: 48 85 d2 test %rdx,%rdx
4006c1: 48 89 7c 24 08 mov %rdi,0x8(%rsp)
4006c6: 74 53 je 40071b <__bsearch+0x6b>
4006c8: 49 89 f4 mov %rsi,%r12
4006cb: 48 89 d3 mov %rdx,%rbx
4006ce: 48 89 cd mov %rcx,%rbp
4006d1: 4d 89 c5 mov %r8,%r13
4006d4: eb 1a jmp 4006f0 <__bsearch+0x40>
4006d6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
4006dd: 00 00 00
4006e0: 48 83 eb 01 sub $0x1,%rbx
4006e4: 4d 8d 24 2e lea (%r14,%rbp,1),%r12
4006e8: 48 d1 eb shr %rbx
4006eb: 48 85 db test %rbx,%rbx
4006ee: 74 2b je 40071b <__bsearch+0x6b>
4006f0: 49 89 df mov %rbx,%r15
4006f3: 48 8b 7c 24 08 mov 0x8(%rsp),%rdi
4006f8: 49 d1 ef shr %r15
4006fb: 4c 89 fa mov %r15,%rdx
4006fe: 48 0f af d5 imul %rbp,%rdx
400702: 4d 8d 34 14 lea (%r12,%rdx,1),%r14
400706: 4c 89 f6 mov %r14,%rsi
400709: 41 ff d5 callq *%r13
40070c: 83 f8 00 cmp $0x0,%eax
40070f: 74 0d je 40071e <__bsearch+0x6e>
400711: 7f cd jg 4006e0 <__bsearch+0x30>
400713: 4c 89 fb mov %r15,%rbx
400716: 48 85 db test %rbx,%rbx
400719: 75 d5 jne 4006f0 <__bsearch+0x40>
40071b: 45 31 f6 xor %r14d,%r14d
40071e: 48 83 c4 18 add $0x18,%rsp
400722: 4c 89 f0 mov %r14,%rax
400725: 5b pop %rbx
400726: 5d pop %rbp
400727: 41 5c pop %r12
400729: 41 5d pop %r13
40072b: 41 5e pop %r14
40072d: 41 5f pop %r15
40072f: c3 retq
on practice, this seem to matter a lot (gcc7, x86_64, -O2)
LIBSTDC++ BSD
Performance counter stats for './a.out': Performance counter stats for './a.out':
529.563747 task-clock:u (msec) # 0.999 CPUs utilized 431.084495 task-clock:u (msec) # 0.999 CPUs utilized
0 context-switches:u # 0.000 K/sec 0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec
48 page-faults:u # 0.091 K/sec 49 page-faults:u # 0.114 K/sec
1,660,238,680 cycles:u # 3.135 GHz (83.01%) 1,359,410,967 cycles:u # 3.153 GHz (83.30%)
18,865,782 stalled-cycles-frontend:u # 1.14% frontend cycles idle (83.49%) 112,645,472 stalled-cycles-frontend:u # 8.29% frontend cycles idle (83.30%)
28,578,724 stalled-cycles-backend:u # 1.72% backend cycles idle (67.15%) 64,645,660 stalled-cycles-backend:u # 4.76% backend cycles idle (66.60%)
3,809,381,770 instructions:u # 2.29 insn per cycle 3,602,947,356 instructions:u # 2.65 insn per cycle
# 0.01 stalled cycles per insn (83.57%) # 0.03 stalled cycles per insn (83.30%)
949,275,159 branches:u # 1792.561 M/sec (83.57%) 730,987,355 branches:u # 1695.694 M/sec (83.30%)
4,026 branch-misses:u # 0.00% of all branches (83.16%) 10,441,749 branch-misses:u # 1.43% of all branches (83.78%)
0.530168158 seconds time elapsed 0.431656059 seconds time elapsed
Performance counter stats for './a.out': Performance counter stats for './a.out':
531.296478 task-clock:u (msec) # 0.999 CPUs utilized 436.154599 task-clock:u (msec) # 0.998 CPUs utilized
0 context-switches:u # 0.000 K/sec 0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec
50 page-faults:u # 0.094 K/sec 50 page-faults:u # 0.115 K/sec
1,659,440,099 cycles:u # 3.123 GHz (83.06%) 1,348,636,891 cycles:u # 3.092 GHz (83.46%)
19,236,667 stalled-cycles-frontend:u # 1.16% frontend cycles idle (83.06%) 119,038,311 stalled-cycles-frontend:u # 8.83% frontend cycles idle (83.47%)
28,226,341 stalled-cycles-backend:u # 1.70% backend cycles idle (67.06%) 82,372,803 stalled-cycles-backend:u # 6.11% backend cycles idle (67.00%)
3,810,692,002 instructions:u # 2.30 insn per cycle 3,593,517,874 instructions:u # 2.66 insn per cycle
# 0.01 stalled cycles per insn (83.61%) # 0.03 stalled cycles per insn (83.49%)
950,132,896 branches:u # 1788.329 M/sec (83.64%) 732,065,055 branches:u # 1678.453 M/sec (83.50%)
4,383 branch-misses:u # 0.00% of all branches (83.55%) 9,774,701 branch-misses:u # 1.34% of all branches (82.86%)
0.531825605 seconds time elapsed 0.436922711 seconds time elapsed
Performance counter stats for './a.out': Performance counter stats for './a.out':
529.942094 task-clock:u (msec) # 0.998 CPUs utilized 435.002430 task-clock:u (msec) # 0.999 CPUs utilized
0 context-switches:u # 0.000 K/sec 0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec
48 page-faults:u # 0.091 K/sec 49 page-faults:u # 0.113 K/sec
1,661,178,675 cycles:u # 3.135 GHz (83.02%) 1,351,752,937 cycles:u # 3.107 GHz (83.40%)
21,119,773 stalled-cycles-frontend:u # 1.27% frontend cycles idle (83.04%) 115,705,283 stalled-cycles-frontend:u # 8.56% frontend cycles idle (83.46%)
28,723,902 stalled-cycles-backend:u # 1.73% backend cycles idle (67.16%) 73,129,341 stalled-cycles-backend:u # 5.41% backend cycles idle (66.90%)
3,806,932,705 instructions:u # 2.29 insn per cycle 3,607,666,833 instructions:u # 2.67 insn per cycle
# 0.01 stalled cycles per insn (83.58%) # 0.03 stalled cycles per insn (83.45%)
949,754,552 branches:u # 1792.186 M/sec (83.58%) 728,361,887 branches:u # 1674.386 M/sec (83.45%)
4,278 branch-misses:u # 0.00% of all branches (83.25%) 10,571,686 branch-misses:u # 1.45% of all branches (83.00%)
0.530759498 seconds time elapsed 0.435504277 seconds time elapsed
Performance counter stats for './a.out': Performance counter stats for './a.out':
535.067404 task-clock:u (msec) # 0.999 CPUs utilized 433.108622 task-clock:u (msec) # 0.998 CPUs utilized
0 context-switches:u # 0.000 K/sec 0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec
49 page-faults:u # 0.092 K/sec 48 page-faults:u # 0.111 K/sec
1,662,263,437 cycles:u # 3.107 GHz (83.21%) 1,357,994,817 cycles:u # 3.135 GHz (83.38%)
23,904,662 stalled-cycles-frontend:u # 1.44% frontend cycles idle (83.18%) 117,585,762 stalled-cycles-frontend:u # 8.66% frontend cycles idle (82.73%)
30,990,320 stalled-cycles-backend:u # 1.86% backend cycles idle (66.75%) 71,549,183 stalled-cycles-backend:u # 5.27% backend cycles idle (66.75%)
3,788,168,547 instructions:u # 2.28 insn per cycle 3,608,615,459 instructions:u # 2.66 insn per cycle
# 0.01 stalled cycles per insn (83.66%) # 0.03 stalled cycles per insn (83.38%)
949,809,536 branches:u # 1775.121 M/sec (83.74%) 730,772,715 branches:u # 1687.274 M/sec (83.38%)
5,849 branch-misses:u # 0.00% of all branches (83.47%) 10,279,444 branch-misses:u # 1.41% of all branches (83.90%)
0.535721929 seconds time elapsed 0.433903691 seconds time elapsed
for comparison, unpatched glibc bsearch()
Performance counter stats for './a.out':
447.937515 task-clock:u (msec) # 0.999 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
48 page-faults:u # 0.107 K/sec
1,409,322,732 cycles:u # 3.146 GHz (83.26%)
137,696,335 stalled-cycles-frontend:u # 9.77% frontend cycles idle (83.26%)
107,106,201 stalled-cycles-backend:u # 7.60% backend cycles idle (66.51%)
3,390,292,254 instructions:u # 2.41 insn per cycle
# 0.04 stalled cycles per insn (83.26%)
968,222,443 branches:u # 2161.512 M/sec (83.86%)
13,048,699 branch-misses:u # 1.35% of all branches (83.44%)
0.448465765 seconds time elapsed
Performance counter stats for './a.out':
451.069138 task-clock:u (msec) # 0.999 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
49 page-faults:u # 0.109 K/sec
1,406,560,677 cycles:u # 3.118 GHz (83.38%)
132,787,307 stalled-cycles-frontend:u # 9.44% frontend cycles idle (83.37%)
106,997,916 stalled-cycles-backend:u # 7.61% backend cycles idle (66.75%)
3,395,472,906 instructions:u # 2.41 insn per cycle
# 0.04 stalled cycles per insn (83.38%)
966,504,113 branches:u # 2142.696 M/sec (83.37%)
11,790,044 branch-misses:u # 1.22% of all branches (83.39%)
0.451607453 seconds time elapsed
Performance counter stats for './a.out':
449.328189 task-clock:u (msec) # 0.999 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
49 page-faults:u # 0.109 K/sec
1,408,416,612 cycles:u # 3.134 GHz (83.31%)
129,766,221 stalled-cycles-frontend:u # 9.21% frontend cycles idle (83.31%)
116,797,583 stalled-cycles-backend:u # 8.29% backend cycles idle (66.62%)
3,397,205,219 instructions:u # 2.41 insn per cycle
# 0.04 stalled cycles per insn (83.31%)
971,010,420 branches:u # 2161.027 M/sec (83.31%)
12,745,400 branch-misses:u # 1.31% of all branches (83.81%)
0.449839176 seconds time elapsed
Performance counter stats for './a.out':
446.382825 task-clock:u (msec) # 0.998 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
48 page-faults:u # 0.108 K/sec
1,401,235,764 cycles:u # 3.139 GHz (83.20%)
135,184,574 stalled-cycles-frontend:u # 9.65% frontend cycles idle (83.20%)
112,230,343 stalled-cycles-backend:u # 8.01% backend cycles idle (66.40%)
3,371,533,704 instructions:u # 2.41 insn per cycle
# 0.04 stalled cycles per insn (83.20%)
972,968,920 branches:u # 2179.674 M/sec (83.86%)
12,100,921 branch-misses:u # 1.24% of all branches (83.39%)
0.447111384 seconds time elapsed
> As long as we're microoptimizing, I found an article
> <https://realm.io/news/how-we-beat-cpp-stl-binary-search/> which, in
> essence, says that because the branches after the comparison are going
> to be unpredictable, you really want to be using conditional moves
> instead. It might be worth trying a little to convince GCC and LLVM
> to do that. (It's too bad that there's no __builtin_unpredictable()
> to go with __builtin_expect().)
thanks for the link.
-ss