This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: Gcc builtin review: isinf, insnan ...
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Cc: Andrew Pinski <pinskia at gmail dot com>
- Date: Mon, 25 May 2015 22:53:46 +0200
- Subject: Re: Gcc builtin review: isinf, insnan ...
- Authentication-results: sourceware.org; auth=none
- References: <20150525101505 dot GA11233 at domone> <20150525114545 dot GC11233 at domone> <20150525121634 dot GF11233 at domone> <20150525175219 dot GB15258 at domone>
I raised this issue before but didn't write a patch, so I should do it now.
I will say nothing about glibc, as it shares the same flaw as gcc.
The main problem is that these functions try to be branchless, which causes
a performance regression for most applications compared with branched code.
The reason is that a correctly predicted branch is essentially free, while a
conditional store always costs a cycle. So you need an unpredictable branch
to get a performance gain. If a misprediction costs 20 cycles and the branch
is predicted correctly 95% of the time, the expected misprediction cost is
0.05 * 20 = 1 cycle per call, so branchless code that adds one cycle over the
branched version does not pay for itself.
And NaN is quite an exceptional value, so these branches will almost always
be predicted correctly. Otherwise the user has other problems: if 5% of the
data are NaNs, then the result will likely be garbage.
Then there is the problem that with modern gcc you are unlikely to save a
branch anyway. Most calls to these functions are surrounded by an if. Since
gcc-4.9 the compiler will optimize out that branch, as it is predicated, and
this results in simpler code.
As further evidence, I took the assembly of the benchmark below and changed
the conditional move to a jump, which wins performance back by 10%.
To demonstrate this, I wrote a simple example of a branched isinf that is
around 10% faster than the builtin.
#ifdef BRANCHED
/* Branched replacement for isinf.

   Returns 1 if DX is positive infinity, -1 if DX is negative infinity and
   0 for every other value (finite numbers, zeros and NaNs).

   Assumes double is IEEE-754 binary64 and unsigned long long is 64 bits
   wide, so that both union members overlay the same eight bytes.  */
static inline int
isinf (double dx)
{
  union
  {
    double d;
    unsigned long long bits;
  } u;

  u.d = dx;

  /* Shift the sign bit out.  What remains (exponent and mantissa moved up
     by one bit) equals 0xffe0000000000000 only for +/-infinity.  The shift
     is done on an unsigned type, so it cannot overflow the way `2 * x' on
     a signed long does for any value with magnitude >= 2.0.  */
  if ((u.bits << 1) != 0xffe0000000000000ULL)
    return 0;

  /* Bit 63 is the sign bit: set means negative infinity.  */
  return (u.bits >> 63) != 0 ? -1 : 1;
}
#endif
#include <stdlib.h>

/* Benchmark driver: scans the first NSCAN elements of a buffer NPASSES
   times and adds 42 to the result for every element that isinf reports
   as infinite.  The accumulated count is returned as the exit status so
   the compiler cannot discard the loop.

   The buffer is filled with finite values, so the isinf branch is never
   taken; this models the "infinity is exceptional" case the benchmark is
   about.  */
int
main (void)
{
  /* NALLOC doubles is 800000 bytes with an 8-byte double, the allocation
     size used by the original benchmark.  */
  enum { NALLOC = 100000, NSCAN = 1000, NPASSES = 1000000 };

  int ret = 0;
  int i, j;
  double *d = malloc (NALLOC * sizeof *d);

  if (d == NULL)
    return EXIT_FAILURE;

  /* Give every element a defined, finite value; reading the buffer
     straight from malloc would use indeterminate data.  */
  for (i = 0; i < NALLOC; i++)
    d[i] = (double) i;

  for (j = 0; j < NPASSES; j++)
    for (i = 0; i < NSCAN; i++)
      if (__builtin_expect (isinf (d[i]), 0))
        ret += 42;

  free (d);
  return ret;
}
# Hand-edited GCC output (x86-64, AT&T syntax) for the isinf benchmark loop.
# The conditional move of the compiler output has been replaced by a
# conditional jump around the accumulation.
#
# Register use inside main:
#   rax  base of the malloc'ed buffer
#   rsi  end of the scanned range (base + 8000 bytes)
#   rdx  pointer to the current element
#   edi  outer pass counter, counts down from 1000000
#   ebx  accumulated result, returned in eax
#   xmm2 mask .LC0 (clears the sign bit), xmm1 constant .LC1 (largest finite double)
#
# NOTE(review): the buffer returned by malloc is neither checked nor
# initialised before it is read, as in the C source this was generated from.
	.file	"inf.c"
	.section	.text.unlikely,"ax",@progbits
.LCOLDB2:
	.section	.text.startup,"ax",@progbits
.LHOTB2:
	.p2align 4,,15
	.globl	main
	.type	main, @function
main:
.LFB0:
	.cfi_startproc
	pushq	%rbx
	.cfi_def_cfa_offset 16
	.cfi_offset 3, -16
	xorl	%ebx, %ebx		# start the accumulator at zero (rbx survives the call)
	movl	$800000, %edi
	call	malloc
	movsd	.LC0(%rip), %xmm2
	leaq	8000(%rax), %rsi
	movsd	.LC1(%rip), %xmm1
	movl	$1000000, %edi
	.p2align 4,,10
	.p2align 3
.L2:
	movq	%rax, %rdx
	.p2align 4,,10
	.p2align 3
.L3:
	movsd	(%rdx), %xmm0
	andpd	%xmm2, %xmm0		# xmm0 = absolute value of the element
	ucomisd	%xmm1, %xmm0		# compare it with the largest finite double
	jbe	.LC			# not above (finite) or unordered (NaN): skip the add
	leal	42(%rbx), %ebx		# element is infinite: result += 42
.LC:
	addq	$8, %rdx
	cmpq	%rsi, %rdx
	jne	.L3
	subl	$1, %edi
	jne	.L2
	movl	%ebx, %eax
	popq	%rbx
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE0:
	.size	main, .-main
	.section	.text.unlikely
.LCOLDE2:
	.section	.text.startup
.LHOTE2:
	.section	.rodata.cst16,"aM",@progbits,16
	.align 16
# Mask 0x7fffffffffffffff in the low quadword: clears the sign bit.
.LC0:
	.long	4294967295
	.long	2147483647
	.long	0
	.long	0
	.section	.rodata.cst8,"aM",@progbits,8
	.align 8
# Bit pattern 0x7fefffffffffffff: the largest finite double.
.LC1:
	.long	4294967295
	.long	2146435071
	.ident	"GCC: (Debian 4.9.2-9) 4.9.2"
	.section	.note.GNU-stack,"",@progbits