rint/nearbyint for x86-64
- From: John Reiser <jreiser at BitWagon dot com>
- To: libc-alpha at sourceware dot org
- Date: Wed, 12 Mar 2008 10:24:30 -0700
- Subject: rint/nearbyint for x86-64
Regarding rint and nearbyint for x86_64:
> Take absolute value, ADD then SUBtract the power of 2 whose LSB in f.p.
> representation has the place value 1 (namely: 2**23 or 2**52), copy the
> sign of the original argument.
This implementation uses a data dependency (indexing the add/subtract constant
by the sign of the input) instead of the absolute-value and copy-sign steps,
so it avoids any branching.
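In C terms, the trick looks roughly like the sketch below (round_by_magic and
its magic[] table are illustrative names, not part of the patch). It assumes
|x| < 2**52 and glosses over the sign of zero; NaN and infinity fall through
the add/subtract unchanged.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

double round_by_magic(double x)
{
    static const double magic[2] = { 0x1.0p52, -0x1.0p52 }; /* +2**52, -2**52 */
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);         /* grab the sign bit, like movmskpd */
    unsigned sign = (unsigned)(bits >> 63); /* 0 for +x, 1 for -x */
    /* volatile keeps -ffast-math from folding (x + c) - c back into x */
    volatile double t = x + magic[sign];    /* fraction bits round off the mantissa */
    return t - magic[sign];                 /* step back to the rounded value */
}

int main(void)
{
    /* prints 2 and -1 under the default round-to-nearest-even mode */
    printf("%g %g\n", round_by_magic(2.5), round_by_magic(-1.25));
    return 0;
}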
-----
// nearbyint, rint, nearbyintf, rintf for x86_64 using SSE instructions.
// Convert to the rounded integer value, in floating-point representation.
// rint temporarily enables the Precision exception (SIGFPE if result != input);
// nearbyint uses the existing PrecisionMask (usually 1, so SIGFPE is inhibited).
// Copyright 2008 BitWagon Software LLC. All rights reserved.
// Licensed under GNU Lesser General Public License (LGPL) version 2.1.
// Written by John Reiser, March 2008.
//#include <machine/asm.h>
#define ENTRY(sym) sym: .globl sym; .type sym,@function
#define END(sym) .size sym, . - sym
#define weak_alias(a, b) .weak b; b = a
P_PRECISION_MASK= 12 // bit number of the PrecisionMask bit in MXCSR
.section .rodata
.align 16 // SSE memory operands demand 16-byte alignment
two52p:
.long 0,0x43300000 // +(2**52); lsb has place value +1
.long 0,0
.long 0,0xc3300000 // -(2**52); must be 16 bytes from two52p
.long 0,0 // zero pad so the 16-byte addpd load at two52p+16 stays inside this table
.text
ENTRY(__nearbyint)
movmskpd %xmm0,%eax // extract packed signs to low 2 bits of eax
lea two52p(%rip),%rdx // no double index with %rip as base
and $1,%eax // sign of scalar in xmm0
add %eax,%eax // 2*
addpd (%rdx,%rax,8),%xmm0 // align binary point and integer point
subpd (%rdx,%rax,8),%xmm0 // go back near original value
ret
END(__nearbyint)
weak_alias(__nearbyint, nearbyint)
ENTRY(__rint)
movmskpd %xmm0,%eax // extract packed signs to low 2 bits of eax
stmxcsr -4(%rsp); movl -4(%rsp),%ecx // original mxcsr
lea two52p(%rip),%rdx // no double index with %rip as base
andl $~(1<<P_PRECISION_MASK),-4(%rsp)
and $1,%eax // sign of scalar in xmm0
ldmxcsr -4(%rsp) // enable Precision exception
add %eax,%eax // 2*
movl %ecx,-4(%rsp) // mxcsr at entry
addpd (%rdx,%rax,8),%xmm0 // align binary point and integer point
subpd (%rdx,%rax,8),%xmm0 // go back near original value
ldmxcsr -4(%rsp) // restore original mxcsr
ret
END(__rint)
weak_alias(__rint,rint)
.section .rodata
.align 16 // SSE memory operands demand 16-byte alignment
two23p:
.long 0x4b000000 // +(2**23); lsb has place value +1
.long 0,0,0
.long 0xcb000000 // -(2**23); must be 16 bytes from two23p
.long 0,0,0 // zero pad so the 16-byte addps load at two23p+16 stays inside this table
.text
ENTRY(__nearbyintf)
movmskps %xmm0,%eax // extract packed signs to low 4 bits of eax
lea two23p(%rip),%rdx // no double index with %rip as base
and $1,%eax // sign of scalar in xmm0
add %eax,%eax // 2*
addps (%rdx,%rax,8),%xmm0 // align binary point and integer point
subps (%rdx,%rax,8),%xmm0 // go back near original value
ret
END(__nearbyintf)
weak_alias(__nearbyintf,nearbyintf)
ENTRY(__rintf)
movmskps %xmm0,%eax // extract packed signs to low 4 bits of eax
stmxcsr -4(%rsp); movl -4(%rsp),%ecx // original mxcsr
lea two23p(%rip),%rdx // no double index with %rip as base
andl $~(1<<P_PRECISION_MASK),-4(%rsp)
and $1,%eax // sign of scalar in xmm0
ldmxcsr -4(%rsp) // enable Precision exception
add %eax,%eax // 2*
movl %ecx,-4(%rsp) // mxcsr at entry
addps (%rdx,%rax,8),%xmm0 // align binary point and integer point
subps (%rdx,%rax,8),%xmm0 // go back near original value
ldmxcsr -4(%rsp) // restore original mxcsr
ret
END(__rintf)
weak_alias(__rintf,rintf)
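For context on the Precision-exception handling above: C99 says nearbyint must
not raise the inexact exception, while rint may raise it when the result
differs from the argument. A small standalone test against the C library (not
against this patch) shows that distinction; illustrative only, compile without
-ffast-math, e.g. "gcc -O0 rint-test.c -lm":

#include <fenv.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    volatile double x = 2.5;  /* volatile: keep the calls from being constant-folded */

    feclearexcept(FE_INEXACT);
    double a = nearbyint(x);
    int after_nearbyint = fetestexcept(FE_INEXACT); /* expected: 0 */

    feclearexcept(FE_INEXACT);
    double b = rint(x);
    int after_rint = fetestexcept(FE_INEXACT);      /* typically FE_INEXACT on glibc */

    printf("nearbyint(%g)=%g inexact=%d  rint(%g)=%g inexact=%d\n",
           x, a, after_nearbyint != 0, x, b, after_rint != 0);
    return 0;
}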
--
John Reiser, jreiser@BitWagon.com