This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] stdlib-bsearch: middle element calculation may overflow


On Wed, Mar 29, 2017 at 7:02 PM, Sergey Senozhatsky
<sergey.senozhatsky.work@gmail.com> wrote:
> On (03/28/17 11:30), Zack Weinberg wrote:
>> On Thu, Mar 16, 2017 at 11:32 PM, Sergey Senozhatsky
>> <sergey.senozhatsky.work@gmail.com> wrote:
>> > so I modified the test I posted earlier. moved array to .bss, made array
>> > size reasonable (so there is no overflow). each ./a.out executes bsearch()
>> > 10000000 times.
>>
>> Thanks for this.  It might also be useful to show the codegen
>> difference, which (on x86-64) is
>>
>> -bsearch_stock:
>> +bsearch_patched:
>>  .LFB10:
>>      .cfi_startproc
>>      pushq    %r15
>> @@ -43,9 +43,11 @@
>>      cmpq    %r13, %r14
>>      jnb    .L6
>>  .L9:
>> -    leaq    (%r14,%r13), %rbx
>> +    movq    %r13, %rbx
>>      movq    (%rsp), %rdi
>> +    subq    %r14, %rbx
>>      shrq    %rbx
>> +    addq    %r14, %rbx
>>      movq    %rbx, %r15
>>      imulq    %r12, %r15
>>      addq    8(%rsp), %r15
>>
>> This probably hurts by extending the critical dependency chain, as
>> well as by issuing more instructions.  I would like to know what the
>> codegen difference looks like on more RISCy architectures (maybe MIPS
>> and AArch64?) -- I have to wonder whether a CPU that didn't have
>> complex LEA would wind up generating essentially the same code either
>> way.
>
> sorry fot the delay.
>
> ARM
>
> arm-none-eabi-gcc (Arch Repository) 6.3.0
>
> -O2
>
>         UNPATCHED                                                                               PATCHED
>
>     8368:       e92d4ff8        push    {r3, r4, r5, r6, r7, r8, r9, sl, fp, lr}            83e4:       e92d4ff8        push    {r3, r4, r5, r6, r7, r8, r9, sl, fp, lr}
>     836c:       e2526000        subs    r6, r2, #0                                          83e8:       e2526000        subs    r6, r2, #0
>     8370:       e59da028        ldr     sl, [sp, #40]   ; 0x28                              83ec:       e59da028        ldr     sl, [sp, #40]   ; 0x28
>     8374:       11a07000        movne   r7, r0                                              83f0:       11a07000        movne   r7, r0
>     8378:       11a08001        movne   r8, r1                                              83f4:       11a08001        movne   r8, r1
>     837c:       11a09003        movne   r9, r3                                              83f8:       11a09003        movne   r9, r3
>     8380:       13a05000        movne   r5, #0                                              83fc:       13a05000        movne   r5, #0
>     8384:       1a000004        bne     839c <____bsearch+0x34>                             8400:       1a000004        bne     8418 <__bsearch+0x34>
>     8388:       ea00000f        b       83cc <____bsearch+0x64>                             8404:       ea00000f        b       8448 <__bsearch+0x64>
>     838c:       0a000011        beq     83d8 <____bsearch+0x70>                             8408:       0a000011        beq     8454 <__bsearch+0x70>
>     8390:       e2845001        add     r5, r4, #1                                          840c:       e2845001        add     r5, r4, #1
>     8394:       e1550006        cmp     r5, r6                                              8410:       e1550006        cmp     r5, r6
>     8398:       2a00000b        bcs     83cc <____bsearch+0x64>                             8414:       2a00000b        bcs     8448 <__bsearch+0x64>
>     839c:       e0854006        add     r4, r5, r6                                          8418:       e0464005        sub     r4, r6, r5
>     83a0:       e1a040a4        lsr     r4, r4, #1                                          841c:       e08540a4        add     r4, r5, r4, lsr #1

This is just the magic here.  ARM is special in that they have shifts
with their adds.  AARCH64 is similar but on some microarch, doing the
shift seperate from the add is better.  So this shows that it is worse
on those targets for latency reasons :).

Thanks,
Andrew

>     83a4:       e02b8499        mla     fp, r9, r4, r8                                      8420:       e02b8499        mla     fp, r9, r4, r8
>     83a8:       e1a00007        mov     r0, r7                                              8424:       e1a00007        mov     r0, r7
>     83ac:       e1a0100b        mov     r1, fp                                              8428:       e1a0100b        mov     r1, fp
>     83b0:       e1a0e00f        mov     lr, pc                                              842c:       e1a0e00f        mov     lr, pc
>     83b4:       e12fff1a        bx      sl                                                  8430:       e12fff1a        bx      sl
>     83b8:       e3500000        cmp     r0, #0                                              8434:       e3500000        cmp     r0, #0
>     83bc:       aafffff2        bge     838c <____bsearch+0x24>                             8438:       aafffff2        bge     8408 <__bsearch+0x24>
>     83c0:       e1a06004        mov     r6, r4                                              843c:       e1a06004        mov     r6, r4
>     83c4:       e1550006        cmp     r5, r6                                              8440:       e1550006        cmp     r5, r6
>     83c8:       3afffff3        bcc     839c <____bsearch+0x34>                             8444:       3afffff3        bcc     8418 <__bsearch+0x34>
>     83cc:       e3a00000        mov     r0, #0                                              8448:       e3a00000        mov     r0, #0
>     83d0:       e8bd4ff8        pop     {r3, r4, r5, r6, r7, r8, r9, sl, fp, lr}            844c:       e8bd4ff8        pop     {r3, r4, r5, r6, r7, r8, r9, sl, fp, lr}
>     83d4:       e12fff1e        bx      lr                                                  8450:       e12fff1e        bx      lr
>     83d8:       e1a0000b        mov     r0, fp                                              8454:       e1a0000b        mov     r0, fp
>     83dc:       e8bd4ff8        pop     {r3, r4, r5, r6, r7, r8, r9, sl, fp, lr}            8458:       e8bd4ff8        pop     {r3, r4, r5, r6, r7, r8, r9, sl, fp, lr}
>     83e0:       e12fff1e        bx      lr                                                  845c:       e12fff1e        bx      lr
>
>         -ss


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]