This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] stdlib-bsearch: middle element calculation may overflow


On (03/30/17 09:18), Zack Weinberg wrote:
> On Thu, Mar 30, 2017 at 12:59 AM, Sergey Senozhatsky
> <sergey.senozhatsky.work@gmail.com> wrote:
> >
> > BTW, FreeBSD's kernel bsearch implementation is quite interesting
> > and unique. take a look:
> >
> > https://github.com/freebsd/freebsd/blob/master/sys/libkern/bsearch.c
> 
> Oh, that's clever.  I _thought_ there had to be a way to avoid the
> addition.  This should indeed be immune to the overflow problem,
> although we still need to have a test for it.
> 
> libstdc++-v3 appears to use the same trick: see
> https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=libstdc%2B%2B-v3/include/bits/stl_algo.h;h=2cd5303a100b998f80369c0baf2cd39aedf14b52;hb=af44a97c14af3a0079590662d49a8ced98954376#l2037
> (sorry for length of URL).  So we can crib from that and not have to
> worry about copyrights.

interesting.

so, a simple-minded direct re-implementation of libstdc++
approach may look like this

void *__bsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
		__compar_fn_t __compar)
{
	const char *base = __base, *pivot;
	size_t len = __nmemb;
	size_t half;
	int comparison;

	while (len > 0) {
		half = len >> 1;
		pivot = base + half * __size;
		comparison = (*__compar) (__key, pivot);

		if (comparison == 0)
			return (void *)pivot;

		if (comparison < 0) {
			len = half;
		} else {
			base = pivot + __size;
			len = len - half - 1;
		}
	}

	return NULL;
}


and it generates a bit worse code, than BSD version, due to this part

	len = len - half - 1;

which is 
  	sub    $0x1,%rbx
  	lea    (%r15,%rbp,1),%r12
  	sub    %r14,%rbx

objdump

  400730:	41 57                	push   %r15
  400732:	41 56                	push   %r14
  400734:	41 55                	push   %r13
  400736:	41 54                	push   %r12
  400738:	55                   	push   %rbp
  400739:	53                   	push   %rbx
  40073a:	48 83 ec 18          	sub    $0x18,%rsp
  40073e:	48 85 d2             	test   %rdx,%rdx
  400741:	48 89 7c 24 08       	mov    %rdi,0x8(%rsp)
  400746:	74 51                	je     400799 <_bsearch+0x69>
  400748:	49 89 f4             	mov    %rsi,%r12
  40074b:	48 89 d3             	mov    %rdx,%rbx
  40074e:	48 89 cd             	mov    %rcx,%rbp
  400751:	4d 89 c5             	mov    %r8,%r13
  400754:	eb 1a                	jmp    400770 <_bsearch+0x40>
  400756:	66 2e 0f 1f 84 00 00 	nopw   %cs:0x0(%rax,%rax,1)
  40075d:	00 00 00 
  400760:	48 83 eb 01          	sub    $0x1,%rbx
  400764:	4d 8d 24 2f          	lea    (%r15,%rbp,1),%r12
  400768:	4c 29 f3             	sub    %r14,%rbx
  40076b:	48 85 db             	test   %rbx,%rbx
  40076e:	74 29                	je     400799 <_bsearch+0x69>
  400770:	49 89 de             	mov    %rbx,%r14
  400773:	48 8b 7c 24 08       	mov    0x8(%rsp),%rdi
  400778:	49 d1 ee             	shr    %r14
  40077b:	4d 89 f7             	mov    %r14,%r15
  40077e:	4c 0f af fd          	imul   %rbp,%r15
  400782:	4d 01 e7             	add    %r12,%r15
  400785:	4c 89 fe             	mov    %r15,%rsi
  400788:	41 ff d5             	callq  *%r13
  40078b:	85 c0                	test   %eax,%eax
  40078d:	74 0d                	je     40079c <_bsearch+0x6c>
  40078f:	79 cf                	jns    400760 <_bsearch+0x30>
  400791:	4c 89 f3             	mov    %r14,%rbx
  400794:	48 85 db             	test   %rbx,%rbx
  400797:	75 d7                	jne    400770 <_bsearch+0x40>
  400799:	45 31 ff             	xor    %r15d,%r15d
  40079c:	48 83 c4 18          	add    $0x18,%rsp
  4007a0:	4c 89 f8             	mov    %r15,%rax
  4007a3:	5b                   	pop    %rbx
  4007a4:	5d                   	pop    %rbp
  4007a5:	41 5c                	pop    %r12
  4007a7:	41 5d                	pop    %r13
  4007a9:	41 5e                	pop    %r14
  4007ab:	41 5f                	pop    %r15
  4007ad:	c3                   	retq   
  4007ae:	66 90                	xchg   %ax,%ax


while something like this (basically, a BDS version)

void *bsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
		__compar_fn_t __compar)
{
	const char *base = __base, *pivot;
	size_t len = __nmemb;
	int comparison;

	while (len > 0) {
		pivot = base + (len >> 1) * __size;
		comparison = (*__compar) (__key, pivot);

		if (comparison == 0)
			return (void *)pivot;

		if (comparison > 0) {
			base = pivot + __size;
			len--;
		}
		len >>= 1;
	}

	return NULL;
}

uses 
		shr    %rbx

objdump

  4006b0:	41 57                	push   %r15
  4006b2:	41 56                	push   %r14
  4006b4:	41 55                	push   %r13
  4006b6:	41 54                	push   %r12
  4006b8:	55                   	push   %rbp
  4006b9:	53                   	push   %rbx
  4006ba:	48 83 ec 18          	sub    $0x18,%rsp
  4006be:	48 85 d2             	test   %rdx,%rdx
  4006c1:	48 89 7c 24 08       	mov    %rdi,0x8(%rsp)
  4006c6:	74 53                	je     40071b <__bsearch+0x6b>
  4006c8:	49 89 f4             	mov    %rsi,%r12
  4006cb:	48 89 d3             	mov    %rdx,%rbx
  4006ce:	48 89 cd             	mov    %rcx,%rbp
  4006d1:	4d 89 c5             	mov    %r8,%r13
  4006d4:	eb 1a                	jmp    4006f0 <__bsearch+0x40>
  4006d6:	66 2e 0f 1f 84 00 00 	nopw   %cs:0x0(%rax,%rax,1)
  4006dd:	00 00 00 
  4006e0:	48 83 eb 01          	sub    $0x1,%rbx
  4006e4:	4d 8d 24 2e          	lea    (%r14,%rbp,1),%r12
  4006e8:	48 d1 eb             	shr    %rbx
  4006eb:	48 85 db             	test   %rbx,%rbx
  4006ee:	74 2b                	je     40071b <__bsearch+0x6b>
  4006f0:	49 89 df             	mov    %rbx,%r15
  4006f3:	48 8b 7c 24 08       	mov    0x8(%rsp),%rdi
  4006f8:	49 d1 ef             	shr    %r15
  4006fb:	4c 89 fa             	mov    %r15,%rdx
  4006fe:	48 0f af d5          	imul   %rbp,%rdx
  400702:	4d 8d 34 14          	lea    (%r12,%rdx,1),%r14
  400706:	4c 89 f6             	mov    %r14,%rsi
  400709:	41 ff d5             	callq  *%r13
  40070c:	83 f8 00             	cmp    $0x0,%eax
  40070f:	74 0d                	je     40071e <__bsearch+0x6e>
  400711:	7f cd                	jg     4006e0 <__bsearch+0x30>
  400713:	4c 89 fb             	mov    %r15,%rbx
  400716:	48 85 db             	test   %rbx,%rbx
  400719:	75 d5                	jne    4006f0 <__bsearch+0x40>
  40071b:	45 31 f6             	xor    %r14d,%r14d
  40071e:	48 83 c4 18          	add    $0x18,%rsp
  400722:	4c 89 f0             	mov    %r14,%rax
  400725:	5b                   	pop    %rbx
  400726:	5d                   	pop    %rbp
  400727:	41 5c                	pop    %r12
  400729:	41 5d                	pop    %r13
  40072b:	41 5e                	pop    %r14
  40072d:	41 5f                	pop    %r15
  40072f:	c3                   	retq   



on practice, this seem to matter a lot (gcc7, x86_64, -O2)


		LIBSTDC++											BSD
 Performance counter stats for './a.out':	 						Performance counter stats for './a.out':
	
        529.563747      task-clock:u (msec)       #    0.999 CPUs utilized          		        431.084495      task-clock:u (msec)       #    0.999 CPUs utilized          
                 0      context-switches:u        #    0.000 K/sec                  		                 0      context-switches:u        #    0.000 K/sec                  
                 0      cpu-migrations:u          #    0.000 K/sec                  		                 0      cpu-migrations:u          #    0.000 K/sec                  
                48      page-faults:u             #    0.091 K/sec                  		                49      page-faults:u             #    0.114 K/sec                  
     1,660,238,680      cycles:u                  #    3.135 GHz                      (83.01%)	     1,359,410,967      cycles:u                  #    3.153 GHz                      (83.30%)
        18,865,782      stalled-cycles-frontend:u #    1.14% frontend cycles idle     (83.49%)	       112,645,472      stalled-cycles-frontend:u #    8.29% frontend cycles idle     (83.30%)
        28,578,724      stalled-cycles-backend:u  #    1.72% backend cycles idle      (67.15%)	        64,645,660      stalled-cycles-backend:u  #    4.76% backend cycles idle      (66.60%)
     3,809,381,770      instructions:u            #    2.29  insn per cycle         		     3,602,947,356      instructions:u            #    2.65  insn per cycle         
                                                  #    0.01  stalled cycles per insn  (83.57%)	                                                  #    0.03  stalled cycles per insn  (83.30%)
       949,275,159      branches:u                # 1792.561 M/sec                    (83.57%)	       730,987,355      branches:u                # 1695.694 M/sec                    (83.30%)
             4,026      branch-misses:u           #    0.00% of all branches          (83.16%)	        10,441,749      branch-misses:u           #    1.43% of all branches          (83.78%)
	
       0.530168158 seconds time elapsed	       							0.431656059 seconds time elapsed
	
	
 Performance counter stats for './a.out':	 						Performance counter stats for './a.out':
	
        531.296478      task-clock:u (msec)       #    0.999 CPUs utilized          		        436.154599      task-clock:u (msec)       #    0.998 CPUs utilized          
                 0      context-switches:u        #    0.000 K/sec                  		                 0      context-switches:u        #    0.000 K/sec                  
                 0      cpu-migrations:u          #    0.000 K/sec                  		                 0      cpu-migrations:u          #    0.000 K/sec                  
                50      page-faults:u             #    0.094 K/sec                  		                50      page-faults:u             #    0.115 K/sec                  
     1,659,440,099      cycles:u                  #    3.123 GHz                      (83.06%)	     1,348,636,891      cycles:u                  #    3.092 GHz                      (83.46%)
        19,236,667      stalled-cycles-frontend:u #    1.16% frontend cycles idle     (83.06%)	       119,038,311      stalled-cycles-frontend:u #    8.83% frontend cycles idle     (83.47%)
        28,226,341      stalled-cycles-backend:u  #    1.70% backend cycles idle      (67.06%)	        82,372,803      stalled-cycles-backend:u  #    6.11% backend cycles idle      (67.00%)
     3,810,692,002      instructions:u            #    2.30  insn per cycle         		     3,593,517,874      instructions:u            #    2.66  insn per cycle         
                                                  #    0.01  stalled cycles per insn  (83.61%)	                                                  #    0.03  stalled cycles per insn  (83.49%)
       950,132,896      branches:u                # 1788.329 M/sec                    (83.64%)	       732,065,055      branches:u                # 1678.453 M/sec                    (83.50%)
             4,383      branch-misses:u           #    0.00% of all branches          (83.55%)	         9,774,701      branch-misses:u           #    1.34% of all branches          (82.86%)
	
       0.531825605 seconds time elapsed	       							0.436922711 seconds time elapsed
	
	
 Performance counter stats for './a.out':	 						Performance counter stats for './a.out':
	
        529.942094      task-clock:u (msec)       #    0.998 CPUs utilized          		        435.002430      task-clock:u (msec)       #    0.999 CPUs utilized          
                 0      context-switches:u        #    0.000 K/sec                  		                 0      context-switches:u        #    0.000 K/sec                  
                 0      cpu-migrations:u          #    0.000 K/sec                  		                 0      cpu-migrations:u          #    0.000 K/sec                  
                48      page-faults:u             #    0.091 K/sec                  		                49      page-faults:u             #    0.113 K/sec                  
     1,661,178,675      cycles:u                  #    3.135 GHz                      (83.02%)	     1,351,752,937      cycles:u                  #    3.107 GHz                      (83.40%)
        21,119,773      stalled-cycles-frontend:u #    1.27% frontend cycles idle     (83.04%)	       115,705,283      stalled-cycles-frontend:u #    8.56% frontend cycles idle     (83.46%)
        28,723,902      stalled-cycles-backend:u  #    1.73% backend cycles idle      (67.16%)	        73,129,341      stalled-cycles-backend:u  #    5.41% backend cycles idle      (66.90%)
     3,806,932,705      instructions:u            #    2.29  insn per cycle         		     3,607,666,833      instructions:u            #    2.67  insn per cycle         
                                                  #    0.01  stalled cycles per insn  (83.58%)	                                                  #    0.03  stalled cycles per insn  (83.45%)
       949,754,552      branches:u                # 1792.186 M/sec                    (83.58%)	       728,361,887      branches:u                # 1674.386 M/sec                    (83.45%)
             4,278      branch-misses:u           #    0.00% of all branches          (83.25%)	        10,571,686      branch-misses:u           #    1.45% of all branches          (83.00%)
	
       0.530759498 seconds time elapsed	       							0.435504277 seconds time elapsed
	
	
 Performance counter stats for './a.out':	 						Performance counter stats for './a.out':
	
        535.067404      task-clock:u (msec)       #    0.999 CPUs utilized          		        433.108622      task-clock:u (msec)       #    0.998 CPUs utilized          
                 0      context-switches:u        #    0.000 K/sec                  		                 0      context-switches:u        #    0.000 K/sec                  
                 0      cpu-migrations:u          #    0.000 K/sec                  		                 0      cpu-migrations:u          #    0.000 K/sec                  
                49      page-faults:u             #    0.092 K/sec                  		                48      page-faults:u             #    0.111 K/sec                  
     1,662,263,437      cycles:u                  #    3.107 GHz                      (83.21%)	     1,357,994,817      cycles:u                  #    3.135 GHz                      (83.38%)
        23,904,662      stalled-cycles-frontend:u #    1.44% frontend cycles idle     (83.18%)	       117,585,762      stalled-cycles-frontend:u #    8.66% frontend cycles idle     (82.73%)
        30,990,320      stalled-cycles-backend:u  #    1.86% backend cycles idle      (66.75%)	        71,549,183      stalled-cycles-backend:u  #    5.27% backend cycles idle      (66.75%)
     3,788,168,547      instructions:u            #    2.28  insn per cycle         		     3,608,615,459      instructions:u            #    2.66  insn per cycle         
                                                  #    0.01  stalled cycles per insn  (83.66%)	                                                  #    0.03  stalled cycles per insn  (83.38%)
       949,809,536      branches:u                # 1775.121 M/sec                    (83.74%)	       730,772,715      branches:u                # 1687.274 M/sec                    (83.38%)
             5,849      branch-misses:u           #    0.00% of all branches          (83.47%)	        10,279,444      branch-misses:u           #    1.41% of all branches          (83.90%)
	
       0.535721929 seconds time elapsed	       							0.433903691 seconds time elapsed





for comparison, unpatched glibc bsearch()

 Performance counter stats for './a.out':

        447.937515      task-clock:u (msec)       #    0.999 CPUs utilized          
                 0      context-switches:u        #    0.000 K/sec                  
                 0      cpu-migrations:u          #    0.000 K/sec                  
                48      page-faults:u             #    0.107 K/sec                  
     1,409,322,732      cycles:u                  #    3.146 GHz                      (83.26%)
       137,696,335      stalled-cycles-frontend:u #    9.77% frontend cycles idle     (83.26%)
       107,106,201      stalled-cycles-backend:u  #    7.60% backend cycles idle      (66.51%)
     3,390,292,254      instructions:u            #    2.41  insn per cycle         
                                                  #    0.04  stalled cycles per insn  (83.26%)
       968,222,443      branches:u                # 2161.512 M/sec                    (83.86%)
        13,048,699      branch-misses:u           #    1.35% of all branches          (83.44%)

       0.448465765 seconds time elapsed


 Performance counter stats for './a.out':

        451.069138      task-clock:u (msec)       #    0.999 CPUs utilized          
                 0      context-switches:u        #    0.000 K/sec                  
                 0      cpu-migrations:u          #    0.000 K/sec                  
                49      page-faults:u             #    0.109 K/sec                  
     1,406,560,677      cycles:u                  #    3.118 GHz                      (83.38%)
       132,787,307      stalled-cycles-frontend:u #    9.44% frontend cycles idle     (83.37%)
       106,997,916      stalled-cycles-backend:u  #    7.61% backend cycles idle      (66.75%)
     3,395,472,906      instructions:u            #    2.41  insn per cycle         
                                                  #    0.04  stalled cycles per insn  (83.38%)
       966,504,113      branches:u                # 2142.696 M/sec                    (83.37%)
        11,790,044      branch-misses:u           #    1.22% of all branches          (83.39%)

       0.451607453 seconds time elapsed


 Performance counter stats for './a.out':

        449.328189      task-clock:u (msec)       #    0.999 CPUs utilized          
                 0      context-switches:u        #    0.000 K/sec                  
                 0      cpu-migrations:u          #    0.000 K/sec                  
                49      page-faults:u             #    0.109 K/sec                  
     1,408,416,612      cycles:u                  #    3.134 GHz                      (83.31%)
       129,766,221      stalled-cycles-frontend:u #    9.21% frontend cycles idle     (83.31%)
       116,797,583      stalled-cycles-backend:u  #    8.29% backend cycles idle      (66.62%)
     3,397,205,219      instructions:u            #    2.41  insn per cycle         
                                                  #    0.04  stalled cycles per insn  (83.31%)
       971,010,420      branches:u                # 2161.027 M/sec                    (83.31%)
        12,745,400      branch-misses:u           #    1.31% of all branches          (83.81%)

       0.449839176 seconds time elapsed


 Performance counter stats for './a.out':

        446.382825      task-clock:u (msec)       #    0.998 CPUs utilized          
                 0      context-switches:u        #    0.000 K/sec                  
                 0      cpu-migrations:u          #    0.000 K/sec                  
                48      page-faults:u             #    0.108 K/sec                  
     1,401,235,764      cycles:u                  #    3.139 GHz                      (83.20%)
       135,184,574      stalled-cycles-frontend:u #    9.65% frontend cycles idle     (83.20%)
       112,230,343      stalled-cycles-backend:u  #    8.01% backend cycles idle      (66.40%)
     3,371,533,704      instructions:u            #    2.41  insn per cycle         
                                                  #    0.04  stalled cycles per insn  (83.20%)
       972,968,920      branches:u                # 2179.674 M/sec                    (83.86%)
        12,100,921      branch-misses:u           #    1.24% of all branches          (83.39%)

       0.447111384 seconds time elapsed




> As long as we're microoptimizing, I found an article
> <https://realm.io/news/how-we-beat-cpp-stl-binary-search/> which, in
> essence, says that because the branches after the comparison are going
> to be unpredictable, you really want to be using conditional moves
> instead.  It might be worth trying a little to convince GCC and LLVM
> to do that.  (It's too bad that there's no __builtin_unpredictable()
> to go with __builtin_expect().)

thanks for the link.

	-ss


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]