[RFC][PATCH 1/2] aarch64: Add optimized ASIMD version of sinf


This patch adds an optimized ASIMD version of sinf for AArch64.
The algorithm and code flow are based on the SSE implementation of
sinf for x86_64 in sysdeps/x86_64/fpu/s_sinf.S.
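
For reference (not part of the patch), below is a minimal, standalone C
sketch of the Pi/4 <= |x| < 9*Pi/4 path, i.e. steps 5.1/5.2 of the
algorithm description in s_sinf-asimd.S.  The name sinf_sketch is
hypothetical, and the sin(t)/cos(t) polynomial evaluations are stood in
for by libm's double-precision sin()/cos(); the assembly instead
evaluates the minimax polynomials stored at L(DP_S)/L(DP_C).  Compile
with -lm.

#include <math.h>
#include <stdio.h>

static const double pio4 = 0x1.921fb54442d18p-1;  /* Pi/4, as in L(DP_PIO4).  */

static float
sinf_sketch (float x)
{
  double ax = fabs ((double) x);

  /* 5.1) Range reduction: k = trunc(|x|/(Pi/4)), n = k + 1, j = n & 0x0e,
     t = |x| - j*Pi/4, so that |t| <= Pi/4.  */
  unsigned int k = (unsigned int) (ax / pio4);
  unsigned int n = k + 1;
  unsigned int j = n & 0x0e;
  double t = ax - (double) j * pio4;

  /* 5.2) Reconstruction: s = sign(x) * (-1.0)^((n>>2)&1); use the cos(t)
     polynomial when n&2 != 0, the sin(t) polynomial otherwise.  */
  unsigned int neg = signbit (x) ? 1 : 0;
  double s = (neg ^ ((n >> 2) & 1)) ? -1.0 : 1.0;
  double r = (n & 2) ? cos (t) : sin (t);
  return (float) (s * r);
}

int
main (void)
{
  for (float x = -7.0f; x <= 7.0f; x += 1.25f)
    printf ("x=%+.4f  sketch=%+.7f  sinf=%+.7f\n",
            x, sinf_sketch (x), sinf (x));
  return 0;
}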

        * sysdeps/aarch64/fpu/multiarch/Makefile: New file.
        * sysdeps/aarch64/fpu/multiarch/s_sinf.c: Likewise.
        * sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S: Likewise.
---
 sysdeps/aarch64/fpu/multiarch/Makefile       |   3 +
 sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S | 382 +++++++++++++++++++++++++++
 sysdeps/aarch64/fpu/multiarch/s_sinf.c       |  31 +++
 3 files changed, 416 insertions(+)
 create mode 100644 sysdeps/aarch64/fpu/multiarch/Makefile
 create mode 100644 sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S
 create mode 100644 sysdeps/aarch64/fpu/multiarch/s_sinf.c

diff --git a/sysdeps/aarch64/fpu/multiarch/Makefile b/sysdeps/aarch64/fpu/multiarch/Makefile
new file mode 100644
index 0000000000..2092e9a885
--- /dev/null
+++ b/sysdeps/aarch64/fpu/multiarch/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),math)
+libm-sysdep_routines += s_sinf-asimd
+endif
diff --git a/sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S b/sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S
new file mode 100644
index 0000000000..83f26c0c33
--- /dev/null
+++ b/sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S
@@ -0,0 +1,382 @@
+/* Optimized ASIMD version of sinf
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ *  1) if |x| == 0: return x.
+ *  2) if |x| <  2^-27: return x-x*DP_SMALL, raise underflow only when needed.
+ *  3) if |x| <  2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1.
+ *  4) if |x| <   Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).
+ *  5) if |x| < 9*Pi/4:
+ *      5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1,
+ *           t=|x|-j*Pi/4.
+ *      5.2) Reconstruction:
+ *          s = sign(x) * (-1.0)^((n>>2)&1)
+ *          if(n&2 != 0) {
+ *              using cos(t) polynomial for |t|<Pi/4, result is
+ *              s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ *          } else {
+ *              using sin(t) polynomial for |t|<Pi/4, result is
+ *              s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ *          }
+ *  6) if |x| < 2^23, large args:
+ *      6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ *           t=|x|-j*Pi/4.
+ *      6.2) Reconstruction same as (5.2).
+ *  7) if |x| >= 2^23, very large args:
+ *      7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ *           t=|x|-j*Pi/4.
+ *      7.2) Reconstruction same as (5.2).
+ *  8) if x is Inf, return x-x, and set errno=EDOM.
+ *  9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ *  sin(+-0) = +-0 not raising inexact/underflow,
+ *  sin(subnormal) raises inexact/underflow,
+ *  sin(min_normalized) raises inexact/underflow,
+ *  sin(normalized) raises inexact,
+ *  sin(Inf) = NaN, raises invalid, sets errno to EDOM,
+ *  sin(NaN) = NaN.
+ */
+
+sp_x		.req	s0			/* SP x */
+dp_x		.req	d1			/* DP x */
+sp_abs_x	.req	s2			/* SP |x| */
+dp_abs_x	.req	d3			/* DP |x| */
+t_val		.req	d4			/* DP t */
+dp_pio4		.req	d5			/* DP Pi/4 */
+dp_zero		.req	d6			/* DP 0.0 */
+dp_one		.req	d7			/* DP 1.0 */
+
+bits_abs_x	.req	w0			/* Bits of SP |x| */
+sign_x		.req	w1			/* Sign bit of x */
+n_val		.req	w2			/* n */
+bits_x		.req	w3			/* Bits of SP x */
+
+ENTRY_ALIGN(__sinf_asimd, 6)
+	/* Input: single precision x in s0 */
+	ldr	w9, L(SP_PIO4)			/* Pi/4 */
+	fmov	bits_x, sp_x			/* Bits of x */
+	ubfx	bits_abs_x, bits_x, #0, #31	/* Bits of |x| */
+	cmp	bits_abs_x, w9			/* |x|<Pi/4?  */
+	blt	L(arg_less_pio4)
+
+	/* Here if |x|>=Pi/4 */
+	ldr	w10, L(SP_9PIO4)		/* 9*Pi/4 */
+	fmov	sp_abs_x, bits_abs_x		/* SP |x| */
+	cmp	bits_abs_x, w10			/* |x|>9*Pi/4?  */
+	bge	L(large_args)
+
+	/* Here if Pi/4<=|x|<9*Pi/4 */
+	ldr	s16, L(SP_INVPIO4)		/* SP 1/(Pi/4) */
+	fcvt	dp_abs_x, sp_abs_x		/* DP |x| */
+	ldr	dp_pio4, L(DP_PIO4)
+	fmul	s16, sp_abs_x, s16		/* SP |x|/(Pi/4) */
+	fcvtzu	w12, s16			/* k=trunc(|x|/(Pi/4)) */
+	add	n_val, w12, #1			/* n=k+1 */
+	and	w12, n_val, #0x0e		/* j=n&0x0e */
+	ucvtf	d16, w12			/* DP j */
+	fmsub	t_val, d16, dp_pio4, dp_abs_x	/* t=|x|-j*Pi/4 */
+
+	.p2align	3
+L(reconstruction):
+	/* Input: w2=n, d4=t, w1=sign_x */
+	tst	n_val, #2			/* n&2? */
+	fmov	dp_one, #1.0			/* DP 1.0 */
+	adr	x14, L(DP_C)			/* Cos Poly Coefficients */
+	adr	x15, L(DP_S)			/* Sin Poly Coefficients */
+	fcsel	d17, t_val, dp_one, EQ		/* q=t or 1.0 */
+	lsr	sign_x, bits_x, #31		/* Sign bit of x */
+	eor	w9, sign_x, n_val, LSR #2	/* (n>>2) XOR sign(x) */
+	lsl	x9, x9, #63			/* sign_sin */
+	fmov	d20, x9				/* sign_sin */
+	csel	x14, x15, x14, EQ		/* K=Sin or Cos Coefficients */
+	eor	v17.8b, v17.8b, v20.8b		/* r=sign_sin XOR (1.0 or t) */
+
+	.p2align	3
+L(sin_cos_poly):
+	/*
+	 * Here if sin(x) is evaluated by the sin(t)/cos(t) polynomial for |t|<Pi/4:
+	 * y = t*t; z = y*y;
+	 * s = sign(x) * (-1.0)^((n>>2)&1)
+	 * result = r * (1.0+t^2*(K0+t^2*(K1+t^2*(K2+t^2*(K3+t^2*K4)))))
+	 * where r=s*t, Kx=Sx (Sin Polynomial Coefficients) if n&2==0
+	 *	 r=s,   Kx=Cx (Cos Polynomial Coefficients) otherwise
+	 */
+	fmul	d18, t_val, t_val		/* y=t^2 */
+	fmul	d19, d18, d18			/* z=t^4 */
+	ldr	d21, [x14, #0*8]		/* K0 */
+	ldp	d22, d23, [x14, #1*8]		/* K1,K2 */
+	ldp	d24, d25, [x14, #3*8]		/* K3,K4 */
+	fmadd	d23, d25, d19, d23		/* K2+z*K4 */
+	fmadd	d22, d24, d19, d22		/* K1+z*K3 */
+	fmadd	d21, d23, d19, d21		/* K0+z*(K2+z*K4) */
+	fmul	d22, d22, d19			/* z*(K1+z*K3) */
+	/* y*(K0+y*(K1+y*(K2+y*(K3+y*K4)))) */
+	fmadd	d22, d21, d18, d22
+	fmadd	d22, d22, d17, d17		/* DP result */
+	fcvt	s0, d22				/* SP result */
+	ret
+
+	.p2align	3
+L(large_args):
+	/* Here if |x|>=9*Pi/4 */
+	mov	w8, #0x7f8			/* InfNan>>20 */
+	cmp	bits_abs_x, w8, LSL #20		/* x is Inf or NaN?  */
+	bge	L(arg_inf_or_nan)
+
+	/* Here if finite |x|>=9*Pi/4 */
+	fcvt	dp_abs_x, sp_abs_x		/* DP |x| */
+	fmov	dp_one, #1.0			/* DP 1.0 */
+	mov	w11, #0x4b0			/* 2^23>>20 */
+	cmp	bits_abs_x, w11, LSL #20	/* |x|>=2^23?  */
+	bge	L(very_large_args)
+
+	/* Here if 9*Pi/4<=|x|<2^23 */
+	adr	x14, L(DP_PIO4HILO)
+	ldr	d16, L(DP_INVPIO4)		/* 1/(Pi/4) */
+	ldp	d17, d18, [x14] 		/* -PIO4HI,-PIO4LO */
+	fmadd	d16, dp_abs_x, d16, dp_one	/* |x|/(Pi/4)+1.0 */
+	fcvtzu	n_val, d16			/* n=trunc(|x|/(Pi/4)+1.0) */
+	and	w10, n_val, #0xfffffffe		/* j=n&0xfffffffe */
+	ucvtf	d16, w10			/* DP j */
+	fmadd	d17, d16, d17, dp_abs_x		/* |x|-j*PIO4HI */
+	fmadd	t_val, d16, d18, d17		/* t=|x|-j*PIO4HI-j*PIO4LO */
+	b	L(reconstruction)
+
+L(very_large_args):
+	/* Here if finite |x|>=2^23 */
+	movz	x11, #0x4330, LSL #48		/* 2^52 */
+	fmov	d21, x11			/* DP 2^52 */
+	ldr	dp_pio4, L(DP_PIO4)		/* Pi/4 */
+	fmov	dp_zero, xzr			/* 0.0 */
+	adr	x14, L(_FPI)
+	lsr	w8, bits_abs_x, #23		/* eb = biased exponent of x */
+	add	w8, w8, #-0x7f+59		/* bitpos=eb-BIAS_32+59 */
+	mov	w9, #28				/* =28 */
+	udiv	w10, w8, w9			/* j=bitpos/28 */
+	mov	x11, #0xffffffff00000000	/* DP_HI_MASK */
+	add	x14, x14, x10, LSL #3
+	ldr	d16, [x14, #-2*8]		/* FPI[j-2] */
+	ldr	d17, [x14, #-1*8]		/* FPI[j-1] */
+	ldr	q18, [x14]			/* FPI[j+1]|FPI[j] */
+	mul	w10, w10, w9			/* j*28 */
+	add	w10, w10, #19			/* j*28+19 */
+	fmov	d20, x11			/* DP_HI_MASK */
+	fmul	d16, dp_abs_x, d16		/* tmp3 */
+	fmul	d17, dp_abs_x, d17		/* tmp2 */
+	fmul	v18.2d, v18.2d, v3.d[0]		/* tmp1|tmp0 */
+	cmp	w8, w10				/* bitpos>=j*28+19?  */
+	and	v20.8b, v16.8b, v20.8b		/* HI(tmp3) */
+	fcsel	d20, dp_zero, d20, LT		/* d=HI(tmp3) OR 0.0 */
+	fsub	d16, d16, d20			/* tmp3=tmp3-d */
+	fadd	d22, d16, d17			/* tmp5=tmp3+tmp2 */
+	fadd	d20, d22, d21			/* tmp6=tmp5+2^52 */
+	fsub	d21, d20, d21			/* tmp4=tmp6-2^52 */
+	faddp	d18, v18.2d			/* tmp7=tmp0+tmp1 */
+	fmov	w10, s20			/* k=I64_LO(tmp6) */
+	fcmp	d21, d22			/* tmp4>tmp5?  */
+	cset	w9, GT				/* c=1 or 0 */
+	fcsel	d22, dp_one, dp_zero, GT	/* d=1.0 or 0.0 */
+	sub	w10, w10, w9			/* k-=c */
+	fsub	d21, d21, d22			/* tmp4-=d */
+	and	w11, w10, #1			/* k&1 */
+	ucvtf	d20, w11			/* DP k&1 */
+	fsub	d16, d16, d21			/* tmp3-=tmp4 */
+	fmsub	d20, d20, dp_one, d16		/* t=-1.0*[k&1]+tmp3 */
+	fadd	d20, d20, d17			/* t+=tmp2 */
+	add	n_val, w10, #1			/* n=k+1 */
+	fadd	d20, d20, d18			/* t+=tmp7 */
+	fmul	t_val, d20, dp_pio4		/* t*=PI/4 */
+	b	L(reconstruction)
+
+	.p2align	3
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4 */
+	fcvt	dp_x, sp_x			/* DP x */
+	mov	w10, #0x3d0			/* 2^-5>>20 */
+	cmp	bits_abs_x, w10, LSL #20	/* |x|<2^-5? */
+	blt	L(arg_less_2pn5)
+
+	/* Here if 2^-5<=|x|<Pi/4 */
+	fmov	t_val, dp_x			/* DP t=x */
+	adr	x14, L(DP_S)			/* Kx=Sx */
+	fmov	d17, dp_x			/* r=x */
+	b	L(sin_cos_poly)
+
+L(arg_less_2pn5):
+	/* Here if |x|<2^-5 */
+	mov	w11, #0x320			/* 2^-27>>20 */
+	cmp	bits_abs_x, w11, LSL #20	/* |x|<2^-27? */
+	blt	L(arg_less_2pn27)
+
+	/* Here if 2^-27<=|x|<2^-5 */
+	adr	x14, L(DP_SIN2)
+	ldp	d16, d17, [x14]			/* DP SIN2_0,SIN2_1 */
+	fmul	d18, dp_x, dp_x			/* y=x^2 */
+	fmadd	d16, d17, d18, d16		/* DP SIN2_0+x^2*SIN2_1 */
+	fmul	d16, d16, d18			/* DP x^2*SIN2_0+x^4*SIN2_1 */
+	fmadd	d16, d16, dp_x, dp_x		/* DP result */
+	fcvt	s0, d16				/* SP result */
+	ret
+
+L(arg_less_2pn27):
+	/* Here if |x|<2^-27 */
+	cbnz	bits_abs_x, L(arg_not_zero)
+	/* Here if |x|==0 */
+	ret
+
+L(arg_not_zero):
+	/* Here if 0<|x|<2^-27 */
+	/*
+	 * Special cases here:
+	 *  sin(subnormal) raises inexact/underflow
+	 *  sin(min_normalized) raises inexact/underflow
+	 *  sin(normalized) raises inexact
+	 */
+	movz	x8, #0x3cd0, LSL #48		/* DP_SMALL=2^-50 */
+	fmov	d16, x8				/* DP_SMALL */
+	fmsub	dp_x, dp_x, d16, dp_x		/* Result is x-x*DP_SMALL */
+	fcvt	s0, dp_x
+	ret
+
+	.p2align	3
+L(arg_inf_or_nan):
+	/* Here if |x| is Inf or NAN */
+	bne	L(skip_errno_setting)		/* Branch if x is NaN */
+
+	/* Here if x is Inf. Set errno to EDOM.  */
+	adrp	x14, :gottprel: errno
+	ldr	PTR_REG(14), [x14, #:gottprel_lo12:errno]
+	mrs	x15, tpidr_el0
+	mov	w8, #EDOM			/* EDOM */
+	str	w8, [x15, x14]			/* Store EDOM in errno */
+
+L(skip_errno_setting):
+	/* Here if |x| is Inf or NAN. Continued.  */
+	fsub	s0, sp_x, sp_x			/* Result is NaN */
+	ret
+
+END(__sinf_asimd)
+
+	.section .rodata, "a"
+	.p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+	.long	0x00000000,0x00000000
+	.long	0x6c000000,0x3ff45f30
+	.long	0x2a000000,0x3e3c9c88
+	.long	0xa8000000,0x3c54fe13
+	.long	0xd0000000,0x3aaf47d4
+	.long	0x6c000000,0x38fbb81b
+	.long	0xe0000000,0x3714acc9
+	.long	0x7c000000,0x3560e410
+	.long	0x56000000,0x33bca2c7
+	.long	0xac000000,0x31fbd778
+	.long	0xe0000000,0x300b7246
+	.long	0xe8000000,0x2e5d2126
+	.long	0x48000000,0x2c970032
+	.long	0xe8000000,0x2ad77504
+	.long	0xe0000000,0x290921cf
+	.long	0xb0000000,0x274deb1c
+	.long	0xe0000000,0x25829a73
+	.long	0xbe000000,0x23fd1046
+	.long	0x10000000,0x2224baed
+	.long	0x8e000000,0x20709d33
+	.long	0x80000000,0x1e535a2f
+	.long	0x64000000,0x1cef904e
+	.long	0x30000000,0x1b0d6398
+	.long	0x24000000,0x1964ce7d
+	.long	0x16000000,0x17b908bf
+	.type L(_FPI), @object
+	ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomial
+   for sin(x)~=x+x^3*DP_SIN2_0+x^5*DP_SIN2_1, |x|<2^-5.  */
+	.p2align 3
+L(DP_SIN2):
+	.long	0x5543d49d,0xbfc55555		/* DP_SIN2_0 */
+	.long	0x75cec8c5,0x3f8110f4		/* DP_SIN2_1 */
+	.type L(DP_SIN2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SIN2))
+
+/* Coefficients of polynomial
+   for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4.  */
+	.p2align 3
+L(DP_S):
+	.long	0x55551cd9,0xbfc55555		/* S0 */
+	.long	0x10c2688b,0x3f811111		/* S1 */
+	.long	0x8b4bd1f9,0xbf2a019f		/* S2 */
+	.long	0x64e6b5b4,0x3ec71d72		/* S3 */
+	.long	0x1674b58a,0xbe5a947e		/* S4 */
+	.type L(DP_S), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S))
+
+	.p2align 3
+/* Coefficients of polynomial
+   for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4.  */
+L(DP_C):
+	.long	0xfffe98ae,0xbfdfffff		/* C0 */
+	.long	0x545c50c7,0x3fa55555		/* C1 */
+	.long	0x348b6874,0xbf56c16b		/* C2 */
+	.long	0x9ac43cc0,0x3efa00eb		/* C3 */
+	.long	0xdd8844d7,0xbe923c97		/* C4 */
+	.type L(DP_C), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C))
+
+	.p2align 3
+L(DP_PIO4):
+	.long	0x54442d18,0x3fe921fb		/* Pi/4 */
+	.type L(DP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+	.p2align 3
+L(DP_INVPIO4):
+	.long	0x6dc9c883,0x3ff45f30		/* 4/Pi */
+	.type L(DP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+	.p2align 3
+L(DP_PIO4HILO):
+	.long	0x54000000,0xbfe921fb		/* High part of Pi/4 */
+	.long	0x11A62633,0xbe010b46		/* Low part of Pi/4 */
+	.type L(DP_PIO4HILO), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4HILO))
+
+	.p2align 2
+L(SP_PIO4):
+	.long	0x3f490fdb			/* Pi/4 */
+	.type L(SP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_PIO4))
+
+	.p2align 2
+L(SP_9PIO4):
+	.long	0x40e231d6			/* 9*Pi/4 */
+	.type L(SP_9PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_9PIO4))
+
+	.p2align 2
+L(SP_INVPIO4):
+	.long	0x3fa2f983			/* 4/Pi */
+	.type L(SP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+weak_alias (__sinf, sinf)
diff --git a/sysdeps/aarch64/fpu/multiarch/s_sinf.c b/sysdeps/aarch64/fpu/multiarch/s_sinf.c
new file mode 100644
index 0000000000..78fdc9e5d0
--- /dev/null
+++ b/sysdeps/aarch64/fpu/multiarch/s_sinf.c
@@ -0,0 +1,31 @@
+/* Multiple versions of sinf.  AARCH64 Version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+#include <sys/auxv.h>
+
+extern float __sinf_asimd (float);
+extern float __sinf_aarch64 (float);
+float __sinf (float);
+
+libm_ifunc (__sinf,
+	    (GLRO (dl_hwcap) & HWCAP_ASIMD) ? __sinf_asimd : __sinf_aarch64);
+weak_alias (__sinf, sinf);
+
+#define SINF __sinf_aarch64
+#include <sysdeps/ieee754/flt-32/s_sinf.c>
-- 
2.12.2

