nextafter() about an order of magnitude slower than trivial implementation
Stefan Kanthak
stefan.kanthak@nexgo.de
Mon Aug 16 16:03:39 GMT 2021
Hi,
to test the accuracy of (elementary) floating-point functions
against a higher precision reference I use the following code:
double input = 1.0, output = exp(input), reference = M_E;
if (output == reference)
printf("correctly rounded\n");
else if (nextafter(output, reference) == reference)
printf("faithfully rounded\n");
else
printf("...");
Testing and benchmarking an exponential function that consumes
about 10ns/call on an AMD EPYC 7262, I noticed that nextafter()
itself is DEAD SLOW: it consumes about 8.5ns/call!
The following (quick&dirty) implementation consumes 0.7ns/call,
i.e. is about an order of magnitude faster:
--- after.c ---
double nextafter(double from, double to)
{
if (from == to)
return to;
if (to != to)
return to;
if (from != from)
return from;
if (from == 0.0)
return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;
unsigned long long ull = *(unsigned long long *) &from;
if ((from < to) == (from < 0.0))
ull--;
else
ull++;
return 0.0 + *(double *) &ull;
}
--- EOF ---
JFTR: the code generated by GCC for this function is FAR from
optimal; the assembly implementation shown below is more
than 2x faster and consumes <0.3ns/call, i.e. is about
30x faster than glibc's nextafter()!
The data from 'perf stat' show that glibc's nextafter() executes
about 30 instructions more than the above trivial implementation,
and that it is NOT well suited for modern super-scalar processors.
Stefan
PS: test program and logs
[stefan@rome ~]$ cat bench.c
// Copyright (C) 2005-2021, Stefan Kanthak <stefan.kanthak@nexgo.de>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
__attribute__((noinline))
double nop(double foo, double bar)
{
	/* Benchmark baseline: a call that does (almost) nothing, so the
	   measured loop overhead can be subtracted from real timings.
	   noinline keeps the call itself from being optimized away. */
	const double sum = foo + bar;
	return sum;
}
inline static
double lfsr64(void)
{
	// 64-bit linear feedback shift register (Galois form) using
	// primitive polynomial 0xAD93D23594C935A9 (CRC-64 "Jones"),
	// initialised with 2**64 / golden ratio
	static uint64_t lfsr = 0x9E3779B97F4A7C15;
	// Arithmetic right shift replicates the top bit into an all-ones/
	// all-zeroes mask (implementation-defined for signed >>, but
	// arithmetic on all mainstream compilers).
	const uint64_t sign = (int64_t) lfsr >> 63;
	lfsr = (lfsr << 1) ^ (0xAD93D23594C935A9 & sign);
	// Reinterpret the raw state as a double bit pattern; the original
	// pointer cast (*(double *)&lfsr) is a strict-aliasing violation,
	// so use a union pun, which C11 defines.
	const union { uint64_t u; double d; } pun = { .u = lfsr };
	return pun.d;
}
inline static
double random64(void)
{
	// 64-bit linear congruential generator (Knuth's MMIX multiplier
	// 6364136223846793005 and increment 1442695040888963407); the raw
	// state is returned reinterpreted as a double bit pattern.
	static uint64_t seed = 0x0123456789ABCDEF;
	seed = seed * 6364136223846793005 + 1442695040888963407;
	// Union pun instead of *(double *)&seed: the pointer cast violates
	// strict aliasing (UB at -O3); the union form is defined in C11.
	const union { uint64_t u; double d; } pun = { .u = seed };
	return pun.d;
}
int main(void)
{
	clock_t t0, t1, t2, tt;
	uint32_t n;
	volatile double result;	/* volatile so the calls are not optimized away */

	/* Pass 1: baseline — measure loop + RNG + call overhead via nop(). */
	t0 = clock();
	for (n = 500000000u; n > 0u; n--) {
		result = nop(lfsr64(), 0.0);
		result = nop(random64(), 1.0 / 0.0);	/* 1.0/0.0 == +inf */
	}
	/* Pass 2: same loop, but calling the function under test. */
	t1 = clock();
	for (n = 500000000u; n > 0u; n--) {
		result = nextafter(lfsr64(), 0.0);
		result = nextafter(random64(), 1.0 / 0.0);
	}
	t2 = clock();
	/* tt = total, t2 = nextafter pass, t1 = nop pass, t0 = difference
	   (net cost attributable to nextafter itself). */
	tt = t2 - t0; t2 -= t1; t1 -= t0; t0 = t2 - t1;
	/* clock_t is an implementation-defined arithmetic type (signed long
	   on glibc); passing it straight to %lu is undefined behavior, so
	   cast every argument explicitly to unsigned long. */
	printf("\n"
	       "nop()        %4lu.%06lu 0\n"
	       "nextafter()  %4lu.%06lu %4lu.%06lu\n"
	       "             %4lu.%06lu nano-seconds\n",
	       (unsigned long) (t1 / CLOCKS_PER_SEC),
	       (unsigned long) ((t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC),
	       (unsigned long) (t2 / CLOCKS_PER_SEC),
	       (unsigned long) ((t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC),
	       (unsigned long) (t0 / CLOCKS_PER_SEC),
	       (unsigned long) ((t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC),
	       (unsigned long) (tt / CLOCKS_PER_SEC),
	       (unsigned long) ((tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC));
	return 0;
}
[stefan@rome ~]$ gcc -O3 -lm bench.c
[stefan@rome ~]$ perf stat ./a.out
nop() 1.480000 0
nextafter() 10.060000 8.580000
11.540000 nano-seconds
Performance counter stats for './a.out':
11,548.78 msec task-clock:u # 1.000 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
145 page-faults:u # 0.013 K/sec
38,917,213,536 cycles:u # 3.370 GHz (83.33%)
~~~~~~~~~~~~~~
15,647,656,615 stalled-cycles-frontend:u # 40.21% frontend cycles idle (83.33%)
10,746,044,422 stalled-cycles-backend:u # 27.61% backend cycles idle (83.33%)
69,739,403,870 instructions:u # 1.79 insn per cycle
~~~~~~~~~~~~~~ ~~~~
# 0.22 stalled cycles per insn (83.33%)
16,495,748,110 branches:u # 1428.354 M/sec (83.33%)
500,701,246 branch-misses:u # 3.04% of all branches (83.33%)
11.550414029 seconds time elapsed
11.548265000 seconds user
0.000999000 seconds sys
[stefan@rome ~]$ gcc -O3 bench.c after.c
[stefan@rome ~]$ perf stat ./a.out
nop() 1.490000 0
nextafter() 2.210000 0.720000
3.700000 nano-seconds
Performance counter stats for './a.out':
3,702.89 msec task-clock:u # 1.000 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
122 page-faults:u # 0.033 K/sec
12,407,345,183 cycles:u # 3.351 GHz (83.32%)
~~~~~~~~~~~~~~
135,817 stalled-cycles-frontend:u # 0.00% frontend cycles idle (83.34%)
5,498,895,906 stalled-cycles-backend:u # 44.32% backend cycles idle (83.34%)
38,002,430,460 instructions:u # 3.06 insn per cycle
~~~~~~~~~~~~~~ ~~~~
# 0.14 stalled cycles per insn (83.34%)
7,497,381,393 branches:u # 2024.735 M/sec (83.34%)
497,462 branch-misses:u # 0.01% of all branches (83.32%)
3.703648875 seconds time elapsed
3.703294000 seconds user
0.000000000 seconds sys
[stefan@rome ~]$ cat after.s
# Copyright © 2004-2021, Stefan Kanthak <stefan.kanthak@nexgo.de>
# Hand-written x86-64 nextafter(from, to): returns the next representable
# double after 'from' in the direction of 'to'.  NaN arguments and
# from == to are passed through; +/-0.0 yields the smallest subnormal.
# Relies on ucomisd setting CF = "below" and PF = "unordered (NaN)";
# note every sbb below consumes the CF of the most recent ucomisd.
.arch generic64
.code64
.intel_syntax noprefix
.text
# xmm0 = from
# xmm1 = to
nextafter:
xorpd xmm2, xmm2 # xmm2 = 0.0
ucomisd xmm1, xmm2 # CF = (to < 0.0)
jp .Lto # to = NAN?
sbb rax, rax # rax = (to < 0.0) ? -1 : 0
ucomisd xmm0, xmm1 # CF = (from < to)
jp .Lfrom # from = NAN?
je .Lto # from = to?
.Lnotequal:
sbb rcx, rcx # rcx = (from < to) ? -1 : 0
ucomisd xmm0, xmm2 # CF = (from < 0.0)
jz .Lzero # from = 0.0?
.Lnext:
movq rdx, xmm0 # rdx = from
sbb rax, rax # rax = (from < 0.0) ? -1 : 0
xor rax, rcx # rax = (from < 0.0) = (from < to) ? 0 : -1
or rax, 1 # rax = (from < 0.0) = (from < to) ? 1 : -1
sub rdx, rax # step the bit pattern one ULP toward 'to'
movq xmm0, rdx
addsd xmm0, xmm2 # + 0.0 — presumably to raise IEEE exception flags; TODO confirm
ret
.Lzero:
# from is +/-0.0: build the smallest subnormal with the sign of 'to'
# (rax still holds the (to < 0.0) mask from the first sbb above)
shl rax, 63 # rax = (to < -0.0) ? 0x8000000000000000 : 0
or rax, 1 # rax = (to < -0.0) ? 0x8000000000000001 : 1
movq xmm0, rax # xmm0 = (to < -0.0) ? -0x1.0p-1074 : 0x1.0p-1074
addsd xmm0, xmm2
ret
.Lto:
movsd xmm0, xmm1 # xmm0 = to
.Lfrom:
ret
.size nextafter, .-nextafter
.type nextafter, @function
.global nextafter
.end
[stefan@rome ~]$ perf stat ./a.out
nop() 1.630000 0
nextafter() 1.910000 0.280000
3.540000 nano-seconds
Performance counter stats for './a.out':
3,547.12 msec task-clock:u # 1.000 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
123 page-faults:u # 0.035 K/sec
11,949,840,797 cycles:u # 3.369 GHz (83.32%)
~~~~~~~~~~~~~~
129,627 stalled-cycles-frontend:u # 0.00% frontend cycles idle (83.34%)
3,998,960,716 stalled-cycles-backend:u # 33.46% backend cycles idle (83.34%)
37,493,523,285 instructions:u # 3.14 insn per cycle
~~~~~~~~~~~~~~ ~~~~
# 0.11 stalled cycles per insn (83.34%)
7,998,559,192 branches:u # 2254.945 M/sec (83.34%)
497,565 branch-misses:u # 0.01% of all branches (83.32%)
3.547999008 seconds time elapsed
3.546671000 seconds user
0.000999000 seconds sys
[stefan@rome ~]$
More information about the Libc-help
mailing list