This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [RFC][PATCH 0/2] aarch64: Add optimized ASIMD versions of sinf/cosf


On Tue, 2017-06-13 at 15:49 -0300, Adhemerval Zanella wrote:
> 
> On 13/06/2017 14:49, Wilco Dijkstra wrote:
> > 
> > Adhemerval Zanella wrote:
> > 
> > > 
> > > I think a good starting point I would be if Ashwin in could
> > > provide us with a C skeleton with same implementation done in
> > > assembly.
> > I don't see the point of asking him to do that. It would be a
> > significant amount of
> > work which would be wasted once Szabolcs posts his implementation.
> It was not a demand, but rather a suggestion if it were the case he
> has a
> starting implementation based on C (which I know might not be the
> case).
After your suggestion, I tried implementing a C skeleton (attached with
this mail) of sinf using the same algorithm. I am able to see same kind
of speedup with the C version.

But I am not sure whether to spend time addressing other technical
comments on the current patch and cleaning this C version for
submission as there is likely another patch coming from Szabolcs that
supposedly can supersede my work.

Ashwin
> 
> Also, it would be helpful which kind of algorithm strategies Szabolcs
> is
> planing to do different than Ashwin and current implementation that
> is
> intended to supersede both implementation (like short algorithm
> description
> sent by Ashwin).
I would also like know about the same.

In my work, I only used algorithms that are already in libm in other
architectures' sinf/cosf implementations. So I guess the issue
that Szabolcs raised about math code licensing doesn't really apply to
my patch??

Ashwin
> 
> > 
> > 
/* Optimized ASIMD version of sinf
   Copyright (C) 2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <errno.h>
#include <math.h>
#include <math_private.h>

/* Short algorithm description:
 *
 *  1) if |x| == 0: return x.
 *  2) if |x| <  2^-27: return x-x*DP_SMALL, raise underflow only when needed.
 *  3) if |x| <  2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1.
 *  4) if |x| <   Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).
 *  5) if |x| < 9*Pi/4:
 *      5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1,
 *           t=|x|-j*Pi/4.
 *      5.2) Reconstruction:
 *          s = sign(x) * (-1.0)^((n>>2)&1)
 *          if(n&2 != 0) {
 *              using cos(t) polynomial for |t|<Pi/4, result is
 *              s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
 *          } else {
 *              using sin(t) polynomial for |t|<Pi/4, result is
 *              s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
 *          }
 *  6) if |x| < 2^23, large args:
 *      6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
 *           t=|x|-j*Pi/4.
 *      6.2) Reconstruction same as (5.2).
 *  7) if |x| >= 2^23, very large args:
 *      7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
 *           t=|x|-j*Pi/4.
 *      7.2) Reconstruction same as (5.2).
 *  8) if x is Inf, return x-x, and set errno=EDOM.
 *  9) if x is NaN, return x-x.
 *
 * Special cases:
 *  sin(+-0) = +-0 not raising inexact/underflow,
 *  sin(subnormal) raises inexact/underflow,
 *  sin(min_normalized) raises inexact/underflow,
 *  sin(normalized) raises inexact,
 *  sin(Inf) = NaN, raises invalid, sets errno to EDOM,
 *  sin(NaN) = NaN.
 */

static double invpio4_table[25] = {
	0.00000000000000000000000000000000e+00,
	1.27323953807353973388671875000000e+00,
	6.66162294771233121082332218065858e-09,
	4.55202009562027200395410188316081e-18,
	5.05365203780056430379174094616698e-26,
	3.33657353140390501092355175630980e-34,
	2.31774265657771014271759915567725e-43,
	1.41079183488085906188916890478151e-51,
	1.78201357714620429447917285584916e-59,
	6.45440934111020426845713721490652e-68,
	2.96289605657163538186678664280422e-77,
	2.34290278673081796193027756956770e-85,
	6.89165747744598758512920848666612e-94,
	2.61827895738527799147711877424548e-102,
	5.22516501694879285329083609946870e-111,
	2.31723558129677581012126324525784e-119,
	5.36762980505877895920512518100769e-128,
	2.49914273179058265651805723374739e-135,
	3.32028340088181692034775714274009e-144,
	1.98261407071607720791943444409774e-152,
	1.34423330943468258263688817309715e-162,
	2.61360711509865349546753597258351e-169,
	2.26640747502086603170125306310953e-178,
	2.39096791372273354372455558796085e-186,
	2.14336400443697767564443045577643e-194
};

float __sinf (float x)
{
	uint64_t ix, n;
	double   k0, k1, k2, k3, k4, w, r, y, z, t;

	GET_FLOAT_WORD(ix, x);
	ix &= 0x7fffffff;

	if (ix < 0x3d000000)
		goto small_args;

	if (ix >= 0x3f490fdb)
		goto large_args;

	/* Here if 2^-5<=|x|<Pi/4 */
	/* Sin Polynomial Coefficients */
	k0 = -1.66666666666265311791406134034332e-01;
	k1 = 8.33333332439092043519845987020744e-03;
	k2 = -1.98412633515623692105969699817081e-04;
	k3 = 2.75552591873811586009688362475245e-06;
	k4 = -2.47545996176987174320511533257699e-08;
	y  = x * x;
	z  = y * y;
	r  = x * (1.0 + y * (k0 + z * (k2 + z * k4)) + z * (k1 + z * k3));
	return r;

large_args:
	if (ix < 0x40e231d6) {
		/* Here if Pi/4<=|x|<9*Pi/4 */
		double   pio4, invpio4, j;

		pio4     = 7.85398163397448278999490867136046e-01;
		invpio4  = 1.27323954473516276486577680771006e+00;
		t        = fabs (x);
		n        = t * invpio4 + 1.0;
		j        = n & 0x0e;
		t        = t - j * pio4;
	} else if (ix < 0x4b000000) {
		/* Here if 9*Pi/4<=|x|<2^23 */
		double   pio4hi, pio4lo, invpio4, j;

		pio4hi   = -7.85398162901401519775390625000000e-01;
		pio4lo   = -4.96046789840270212596747252887163e-10;
		invpio4  = 1.27323954473516276486577680771006e+00;
		t        = fabs (x);
		n        = t * invpio4 + 1.0;
		j        = n & 0xfffffffe;
		t        = t + j * pio4hi + j * pio4lo;
	} else if (ix < 0x7f800000) {
		/* Here if 2^23<=|x|<=Inf */
		uint64_t bitpos, j;
		double   pio4, tmp0, tmp1, tmp2, tmp3, tmp4;

		pio4     = 7.85398163397448278999490867136046e-01;
		t        = fabs (x);
		bitpos   = (ix >> 23) - 0x7f + 59;
		j        = (bitpos * ((0x100000000 / 28) + 1)) >> 32;
		tmp0     = invpio4_table[j - 2] * t;
		tmp1     = invpio4_table[j - 1] * t;
		tmp2     = invpio4_table[j]     * t;
		tmp3     = invpio4_table[j + 1] * t;
		tmp0     = tmp0 - ((uint64_t)tmp0 & ~0x7);
		tmp4     = tmp0 + tmp1;
		n        = tmp4;
		tmp0     = tmp0 - n;
		t        = tmp0 - (n & 0x1) + tmp1 + tmp2 + tmp3;

		if (t > 1.0) {
			n  = n + 1;
			t -= 2.0;
		}

		n        = n + 1;
		t        = t * pio4;
	} else {
		/* Here if x is Inf or Nan */
		if (ix == 0x7f800000)
			__set_errno (EDOM);
		return x - x;
	}
	switch (n & 0x2) {
	case 2:
		/* Use Cos Polynomial */
		k0 = -4.99999999994893751242841517523630e-01;
		k1 = 4.16666665534264832326805105822132e-02;
		k2 = -1.38888806593809050610177635576292e-03;
		k3 = 2.47989607241011055654977823792251e-05;
		k4 = -2.71747891329266278019784327732444e-07;
		w  = 1.0;
		break;
	default:
		/* Use Sin Polynomial */
		k0 = -1.66666666666265311791406134034332e-01;
		k1 = 8.33333332439092043519845987020744e-03;
		k2 = -1.98412633515623692105969699817081e-04;
		k3 = 2.75552591873811586009688362475245e-06;
		k4 = -2.47545996176987174320511533257699e-08;
		w  = t;
	}
	GET_FLOAT_WORD(ix, x);
	if (((ix >> 31) ^ (n >> 2)) & 0x1)
		w *= -1.0;

	y = t * t;
	z = y * y;
	r = w * (1.0 + y * (k0 + z * (k2 + z * k4)) + z * (k1 + z * k3));
	return r;

small_args:
	if (ix >= 0x32000000) {
		/* Here if 2^-27<=|x|<2^-5 */
		k0 = -1.66666666634829235826842364076583e-01;
		k1 = 8.33312019844746100505350483445000e-03;
		y  = x * x;
		z  = y * y;
		r  = x + x * y * k0 + x * z * k1;
		return r;
	} else if (ix != 0) {
		/* Here if 0<=|x|<2^-27 */
		double small;

		small  = 8.88178419700125232338905334472656e-16;
		r      = x - x * small;
		return r;
	}

	/* Here if |x| == 0 */
	return x;
}

weak_alias (__sinf, sinf);

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]