Index: src/newlib/libm/machine/spu/headers/acosd2.h =================================================================== --- src.orig/newlib/libm/machine/spu/headers/acosd2.h +++ src/newlib/libm/machine/spu/headers/acosd2.h @@ -40,9 +40,8 @@ #ifndef _ACOSD2_H_ #define _ACOSD2_H_ 1 -#include - #include "simdmath.h" +#include #include "sqrtd2.h" #include "divd2.h" Index: src/newlib/libm/machine/spu/headers/asind2.h =================================================================== --- src.orig/newlib/libm/machine/spu/headers/asind2.h +++ src/newlib/libm/machine/spu/headers/asind2.h @@ -41,9 +41,8 @@ #ifndef _ASIND2_H_ #define _ASIND2_H_ 1 -#include - #include "simdmath.h" +#include #include "sqrtd2.h" #include "divd2.h" Index: src/newlib/libm/machine/spu/headers/asinhf4.h =================================================================== --- src.orig/newlib/libm/machine/spu/headers/asinhf4.h +++ src/newlib/libm/machine/spu/headers/asinhf4.h @@ -96,8 +96,8 @@ #define ASINH_MAC25 6.4472103118896484375000000000000000000000000000000000000000000000000000E-3 #define ASINH_MAC27 -5.7400376708419234664351851851851851851851851851851851851851851851851852E-3 #define ASINH_MAC29 5.1533096823199041958512931034482758620689655172413793103448275862068966E-3 -#if 0 #define ASINH_MAC31 -4.6601434869150961599042338709677419354838709677419354838709677419354839E-3 +#if 0 #define ASINH_MAC33 4.2409070936793630773370916193181818181818181818181818181818181818181818E-3 #define ASINH_MAC35 -3.8809645588376692363194056919642857142857142857142857142857142857142857E-3 #define ASINH_MAC37 3.5692053938259345454138678473395270270270270270270270270270270270270270E-3 @@ -107,34 +107,43 @@ #endif - static __inline vector float _asinhf4(vector float x) { vec_float4 sign_mask = spu_splats(-0.0f); vec_float4 onef = spu_splats(1.0f); - vec_float4 result, fresult, mresult;; + vec_uint4 oneu = spu_splats(1u); + vec_uint4 twou = spu_splats(2u); + vec_uint4 threeu = spu_splats(3u); + vec_float4 ln2 = spu_splats(6.931471805599453094172321E-1f); + vec_float4 largef = spu_splats(9.21e18f); + vec_float4 result, fresult, mresult; vec_float4 xabs, xsqu; /* Where we switch from maclaurin to formula */ - vec_float4 switch_approx = spu_splats(0.685f); + vec_float4 switch_approx = spu_splats(0.74f); + vec_float4 trunc_part2 = spu_splats(20.0f); + vec_uint4 truncadd; + vec_uint4 islarge; vec_uint4 use_form; - xabs = spu_andc(x, sign_mask); xsqu = spu_mul(x, x); + islarge = spu_cmpgt(xabs, largef); /* * Formula: * asinh = ln(|x| + sqrt(x^2 + 1)) */ - fresult = _sqrtf4(spu_madd(x, x, onef)); - fresult = spu_add(xabs, fresult); - fresult = _logf4(fresult); + vec_float4 logarg = spu_add(xabs, _sqrtf4(spu_madd(xabs, xabs, onef))); + logarg = spu_sel(logarg, xabs, islarge); + fresult = _logf4(logarg); + fresult = spu_sel(fresult, spu_add(fresult, ln2), islarge); /* * Maclaurin Series */ - mresult = spu_madd(xsqu, spu_splats((float)ASINH_MAC29), spu_splats((float)ASINH_MAC27)); + mresult = spu_madd(xsqu, spu_splats((float)ASINH_MAC31), spu_splats((float)ASINH_MAC29)); + mresult = spu_madd(xsqu, mresult, spu_splats((float)ASINH_MAC27)); mresult = spu_madd(xsqu, mresult, spu_splats((float)ASINH_MAC25)); mresult = spu_madd(xsqu, mresult, spu_splats((float)ASINH_MAC23)); mresult = spu_madd(xsqu, mresult, spu_splats((float)ASINH_MAC21)); @@ -150,13 +159,18 @@ static __inline vector float _asinhf4(ve mresult = spu_madd(xsqu, mresult, spu_splats((float)ASINH_MAC01)); mresult = spu_mul(xabs, mresult); - /* * Choose between series and formula */ - use_form = 
spu_cmpgt(xabs, switch_approx); + use_form = spu_cmpgt(xabs, switch_approx); result = spu_sel(mresult, fresult, use_form); + /* + * Truncation correction on spu + */ + truncadd = spu_sel(oneu, threeu, use_form); + truncadd = spu_sel(truncadd, twou, spu_cmpgt(xabs, trunc_part2)); + result = (vec_float4)spu_add((vec_uint4)result, truncadd); /* Preserve sign - asinh is anti-symmetric */ result = spu_sel(result, x, (vec_uint4)sign_mask); Index: src/newlib/libm/machine/spu/headers/divd2.h =================================================================== --- src.orig/newlib/libm/machine/spu/headers/divd2.h +++ src/newlib/libm/machine/spu/headers/divd2.h @@ -1,232 +1,237 @@ -/* -------------------------------------------------------------- */ -/* (C)Copyright 2001,2008, */ -/* International Business Machines Corporation, */ -/* Sony Computer Entertainment, Incorporated, */ -/* Toshiba Corporation, */ -/* */ -/* All Rights Reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the */ -/* following conditions are met: */ -/* */ -/* - Redistributions of source code must retain the above copyright*/ -/* notice, this list of conditions and the following disclaimer. */ -/* */ -/* - Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* - Neither the name of IBM Corporation nor the names of its */ -/* contributors may be used to endorse or promote products */ -/* derived from this software without specific prior written */ -/* permission. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */ -/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR */ -/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ -/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT */ -/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; */ -/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */ -/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */ -/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR */ -/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, */ -/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* -------------------------------------------------------------- */ -/* PROLOG END TAG zYx */ -#ifdef __SPU__ - -#ifndef _DIVD2_H_ -#define _DIVD2_H_ 1 - -#include - -/* - * FUNCTION - * vector double _divd2(vector double a, vector double b) - * - * DESCRIPTION - * _divd2 divides the vector dividend a by the vector divisor b and - * returns the resulting vector quotient. Maximum error 0.5 ULPS for - * normalized results, 1ulp for denorm results, over entire double - * range including denorms, compared to true result in round-to-nearest - * rounding mode. Handles Inf or NaN operands and results correctly. 
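The _asinhf4 rework above chooses between three evaluations: a Maclaurin series for |x| below the new 0.74 switch point, the identity ln(|x| + sqrt(x*x + 1)) in the middle range, and ln(|x|) + ln(2) once |x| exceeds 9.21e18, where the x*x + 1 form gains nothing from the +1 and approaches single precision overflow; the result is then bumped by one to three units in the last place (truncadd) to absorb systematic truncation. A scalar sketch of that selection, assuming plain C; the function and helper names are illustrative, only the thresholds come from the patch, and the series is cut to three terms for brevity:

    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    /* Nudge a float upward by a few units in the last place, mirroring the
     * integer add the patch performs on the result bits (truncadd). */
    static float add_ulps(float r, uint32_t ulps)
    {
        uint32_t bits;
        memcpy(&bits, &r, sizeof bits);
        bits += ulps;
        memcpy(&r, &bits, sizeof r);
        return r;
    }

    static float asinh_sketch(float x)
    {
        float ax = fabsf(x);
        float r;

        if (ax > 9.21e18f)          /* huge: x*x + 1 == x*x, so asinh(x) ~ ln(2|x|) */
            r = logf(ax) + 0.6931471805599453f;
        else if (ax > 0.74f)        /* formula region */
            r = logf(ax + sqrtf(ax * ax + 1.0f));
        else                        /* Maclaurin series about 0, first terms only */
            r = ax * (1.0f + ax * ax * (-1.0f / 6.0f + ax * ax * (3.0f / 40.0f)));

        r = add_ulps(r, 1);         /* the patch uses 1, 2 or 3 depending on region */
        return copysignf(r, x);     /* asinh is odd, so restore the sign of x */
    }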
- */ -static __inline vector double _divd2(vector double a, vector double b) -{ - - - /* Variables - */ - vec_float4 inv_bf, mant_bf; - vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult; - vec_int4 exp, tmp; - vec_uint4 exp_a, exp_b, exp_q1, overflow, nounderflow, normal, utmp, - sign_a, sign_b, a_frac, b_frac, a_frac_0, b_frac_0, a_exp_0, b_exp_0, - a_exp_ones, b_exp_ones, a_nan, b_nan, a_inf, b_inf, a_zero, b_zero, - res_nan, sign_res; - - /* Constants - */ - vec_float4 onef = spu_splats(1.0f); - vec_double2 one = spu_splats(1.0); - vec_uint4 exp_mask = (vec_uint4) { 0x7FF00000, 0, 0x7FF00000, 0 }; - vec_uint4 sign_mask = (vec_uint4) { 0x80000000, 0, 0x80000000, 0}; - vec_uint4 sign_exp_mask = (vec_uint4) { 0xFFF00000, 0, 0xFFF00000,0}; - vec_uint4 frac_mask =(vec_uint4) { 0x000FFFFF, 0xFFFFFFFF, 0x000FFFFF, 0xFFFFFFFF }; - vec_uchar16 swap32 = (vec_uchar16) ((vec_uint4) { 0x04050607, 0x00010203, 0x0C0D0E0F, 0x08090A0B} ); - vec_uint4 zero = (vec_uint4) { 0, 0, 0, 0 }; - vec_int4 e1022 = (vec_int4) { 0x000003FE, 0, 0x000003FE, 0 }; - vec_int4 emax = (vec_int4) { 0x000007FE, 0, 0x000007FE, 0 }; - vec_int4 e1 = (vec_int4) { 0x00000001, 0, 0x00000001, 0 }; - - vec_uint4 nan = (vec_uint4) { 0x7FF80000, 0, 0x7FF80000, 0}; - - /* Extract exponents and underflow denorm arguments to signed zero. - */ - exp_a = spu_and((vec_uint4)a, exp_mask); - exp_b = spu_and((vec_uint4)b, exp_mask); - - sign_a = spu_and((vec_uint4)a, sign_mask); - sign_b = spu_and((vec_uint4)b, sign_mask); - - a_exp_0 = spu_cmpeq (exp_a, 0); - utmp = spu_shuffle (a_exp_0, a_exp_0, swap32); - a_exp_0 = spu_and (a_exp_0, utmp); - b_exp_0 = spu_cmpeq (exp_b, 0); - utmp = spu_shuffle (b_exp_0, b_exp_0, swap32); - b_exp_0 = spu_and (b_exp_0, utmp); - - a = spu_sel(a, (vec_double2)sign_a, (vec_ullong2)a_exp_0); - b = spu_sel(b, (vec_double2)sign_b, (vec_ullong2)b_exp_0); - - /* Force the divisor and dividend into the range [1.0,2.0). - (Unless they're zero.) - */ - mant_a = spu_sel(a, one, (vec_ullong2)sign_exp_mask); - mant_b = spu_sel(b, one, (vec_ullong2)sign_exp_mask); - - /* Approximate the single reciprocal of b by using - * the single precision reciprocal estimate followed by one - * single precision iteration of Newton-Raphson. - */ - mant_bf = spu_roundtf(mant_b); - inv_bf = spu_re(mant_bf); - inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf); - - /* Perform 2 more Newton-Raphson iterations in double precision. - */ - inv_b = spu_extend(inv_bf); - inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b); - q0 = spu_mul(mant_a, inv_b); - q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0); - - /* Compute the quotient's expected exponent. If the exponent - * is out of range, then force the resulting exponent to 0. - * (1023 with the bias). We correct for the out of range - * values by computing a multiplier (mult) that will force the - * result to the correct out of range value and set the - * correct exception flag (UNF, OVF, or neither). - */ - exp_q1 = spu_and((vec_uint4)q1, exp_mask); - exp = spu_sub((vec_int4)exp_a, (vec_int4)exp_b); - exp = spu_rlmaska(exp, -20); // shift right to allow enough bits for working - tmp = spu_rlmaska((vec_int4)exp_q1, -20); - exp = spu_add(exp, tmp); // biased exponent of result (right justified) - - /* The default multiplier is 1.0. If an underflow is detected (the computed - * exponent is less than or equal to a biased 0), force the multiplier to 0.0. 
- * If exp<=0 set mult = 2**(unbiased exp + 1022) and unbiased exp = -1022 - * = biased 1, the smallest normalized exponent. If exp<-51 set - * mult = 2**(-1074) to ensure underflowing result. Otherwise mult=1. - */ - normal = spu_cmpgt(exp, 0); - nounderflow = spu_cmpgt(exp, -52); - tmp = spu_add(exp, e1022); - mult = (vec_double2)spu_sl(tmp, 20); - mult = spu_sel(mult, one, (vec_ullong2)normal); - mult = spu_sel((vec_double2)e1, mult, (vec_ullong2)nounderflow); - exp = spu_sel(e1, exp, normal); // unbiased -1022 is biased 1 - - /* Force the multiplier to positive infinity (exp_mask) and the biased - * exponent to 1022, if the computed biased exponent is > emax. - */ - overflow = spu_cmpgt(exp, (vec_int4)emax); - exp = spu_sel(exp, (vec_int4)e1022, overflow); - mult = spu_sel(mult, (vec_double2)exp_mask, (vec_ullong2)overflow); - - /* Determine if a, b are Inf, NaN, or zero. - * Since these are rare, it would improve speed if these could be detected - * quickly and a branch used to avoid slowing down the main path. However - * most of the work seems to be in the detection. - */ - a_exp_ones = spu_cmpeq (exp_a, exp_mask); - utmp = spu_shuffle (a_exp_ones, a_exp_ones, swap32); - a_exp_ones = spu_and (a_exp_ones, utmp); - - a_frac = spu_and ((vec_uint4)a, frac_mask); - a_frac_0 = spu_cmpeq (a_frac, 0); - utmp = spu_shuffle (a_frac_0, a_frac_0, swap32); - a_frac_0 = spu_and (a_frac_0, utmp); - - a_zero = spu_and (a_exp_0, a_frac_0); - a_inf = spu_and (a_exp_ones, a_frac_0); - a_nan = spu_andc (a_exp_ones, a_frac_0); - - b_exp_ones = spu_cmpeq (exp_b, exp_mask); - utmp = spu_shuffle (b_exp_ones, b_exp_ones, swap32); - b_exp_ones = spu_and (b_exp_ones, utmp); - - b_frac = spu_and ((vec_uint4)b, frac_mask); - b_frac_0 = spu_cmpeq (b_frac, 0); - utmp = spu_shuffle (b_frac_0, b_frac_0, swap32); - b_frac_0 = spu_and (b_frac_0, utmp); - - b_zero = spu_and (b_exp_0, b_frac_0); - b_inf = spu_and (b_exp_ones, b_frac_0); - b_nan = spu_andc (b_exp_ones, b_frac_0); - - /* Handle exception cases */ - - /* Result is 0 for 0/x, x!=0, or x/Inf, x!=Inf. - * Set mult=0 for 0/0 or Inf/Inf now, since it will be replaced - * with NaN later. - */ - utmp = spu_or (a_zero, b_inf); - mult = spu_sel(mult, (vec_double2)zero, (vec_ullong2)utmp); - - /* Result is Inf for x/0, x!=0. Set mult=Inf for 0/0 now, since it - * will be replaced with NaN later. - */ - mult = spu_sel(mult, (vec_double2)exp_mask, (vec_ullong2)b_zero); - - /* Result is NaN if either operand is, or Inf/Inf, or 0/0. - */ - res_nan = spu_or (a_nan, b_nan); - utmp = spu_and (a_inf, b_inf); - res_nan = spu_or (res_nan, utmp); - utmp = spu_and (a_zero, b_zero); - res_nan = spu_or (res_nan, utmp); - mult = spu_sel(mult, (vec_double2)nan, (vec_ullong2)res_nan); - - /* Insert sign of result into mult. - */ - sign_res = spu_xor (sign_a, sign_b); - mult = spu_or (mult, (vec_double2)sign_res); - - /* Insert the sign and exponent into the result and perform the - * final multiplication. - */ - exp = spu_sl(exp, 20); - q2 = spu_sel(q1, (vec_double2)exp, (vec_ullong2)exp_mask); - q2 = spu_mul(q2, mult); - - return (q2); -} - -#endif /* _DIVD2_H_ */ -#endif /* __SPU__ */ +/* -------------------------------------------------------------- */ +/* (C)Copyright 2001,2008, */ +/* International Business Machines Corporation, */ +/* Sony Computer Entertainment, Incorporated, */ +/* Toshiba Corporation, */ +/* */ +/* All Rights Reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the */ +/* following conditions are met: */ +/* */ +/* - Redistributions of source code must retain the above copyright*/ +/* notice, this list of conditions and the following disclaimer. */ +/* */ +/* - Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* - Neither the name of IBM Corporation nor the names of its */ +/* contributors may be used to endorse or promote products */ +/* derived from this software without specific prior written */ +/* permission. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */ +/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR */ +/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ +/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT */ +/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; */ +/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */ +/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */ +/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR */ +/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, */ +/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* -------------------------------------------------------------- */ +/* PROLOG END TAG zYx */ +#ifdef __SPU__ + +#ifndef _DIVD2_H_ +#define _DIVD2_H_ 1 + +#include + +/* + * FUNCTION + * vector double _divd2(vector double a, vector double b) + * + * DESCRIPTION + * _divd2 divides the vector dividend a by the vector divisor b and + * returns the resulting vector quotient. Maximum error about 0.5 ulp + * over entire double range including denorms, compared to true result + * in round-to-nearest rounding mode. Handles Inf or NaN operands and + * results correctly. + */ +static __inline vector double _divd2(vector double a_in, vector double b_in) +{ + /* Variables */ + vec_int4 exp, exp_bias; + vec_uint4 no_underflow, overflow; + vec_float4 mant_bf, inv_bf; + vec_ullong2 exp_a, exp_b; + vec_ullong2 a_nan, a_zero, a_inf, a_denorm; + vec_ullong2 b_nan, b_zero, b_inf, b_denorm; + vec_ullong2 nan; + vec_double2 a, b; + vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult; + + /* Constants */ + vec_float4 onef = spu_splats(1.0f); + vec_ullong2 exp_mask = spu_splats(0x7FF0000000000000ULL); + vec_double2 one = spu_splats(1.0); + +#ifdef __SPU_EDP__ + vec_double2 denorm_scale = (vec_double2)spu_splats(0x4330000000000000ULL); + + /* Identify all possible special values that must be accomodated including: + * +-0, +-infinity, +-denorm, and NaNs. 
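Under __SPU_EDP__ the operand classification just below is a single spu_testsv per operand. A rough scalar equivalent using C99 classification macros, purely for illustration (the struct and function names are not part of the patch):

    #include <math.h>
    #include <stdbool.h>

    struct dclass {
        bool is_nan, is_zero, is_inf, is_denorm;
    };

    /* Classify a double into the same buckets the patch tests with spu_testsv:
     * NaN, +/-0, +/-infinity, +/-denormal. */
    static struct dclass classify(double v)
    {
        struct dclass c;
        c.is_nan    = (fpclassify(v) == FP_NAN);
        c.is_zero   = (fpclassify(v) == FP_ZERO);
        c.is_inf    = (fpclassify(v) == FP_INFINITE);
        c.is_denorm = (fpclassify(v) == FP_SUBNORMAL);
        return c;
    }

Denormal operands are then multiplied by the denorm_scale constant (0x4330000000000000, i.e. 2^52) to bring them into the normal range before the reciprocal estimate, since double arithmetic on Cell/B.E. flushes denormal operands.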
+ */ + a_nan = spu_testsv(a_in, (SPU_SV_NAN)); + a_zero = spu_testsv(a_in, (SPU_SV_NEG_ZERO | SPU_SV_POS_ZERO)); + a_inf = spu_testsv(a_in, (SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY)); + a_denorm = spu_testsv(a_in, (SPU_SV_NEG_DENORM | SPU_SV_POS_DENORM)); + + b_nan = spu_testsv(b_in, (SPU_SV_NAN)); + b_zero = spu_testsv(b_in, (SPU_SV_NEG_ZERO | SPU_SV_POS_ZERO)); + b_inf = spu_testsv(b_in, (SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY)); + b_denorm = spu_testsv(b_in, (SPU_SV_NEG_DENORM | SPU_SV_POS_DENORM)); + + /* Scale denorm inputs to into normalized numbers by conditionally scaling the + * input parameters. + */ + a = spu_sel(a_in, spu_mul(a_in, denorm_scale), a_denorm); + b = spu_sel(b_in, spu_mul(b_in, denorm_scale), b_denorm); + +#else /* !__SPU_EDP__ */ + vec_uint4 a_exp, b_exp; + vec_ullong2 a_mant_0, b_mant_0; + vec_ullong2 a_exp_1s, b_exp_1s; + vec_ullong2 sign_exp_mask; + + vec_uint4 exp_mask_u32 = spu_splats((unsigned int)0x7FF00000); + vec_uchar16 splat_hi = (vec_uchar16){0,1,2,3, 0,1,2,3, 8, 9,10,11, 8,9,10,11}; + vec_uchar16 swap_32 = (vec_uchar16){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; + vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL); + vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL); + + sign_exp_mask = spu_or(sign_mask, exp_mask); + + /* Extract the floating point components from each of the operands including + * exponent and mantissa. + */ + a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32); + a_exp = spu_shuffle(a_exp, a_exp, splat_hi); + b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32); + b_exp = spu_shuffle(b_exp, b_exp, splat_hi); + + a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0); + a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32)); + + b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0); + b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32)); + + a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32); + b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32); + + /* Identify all possible special values that must be accomodated including: + * +-denorm, +-0, +-infinity, and NaNs. + */ + a_denorm = (vec_ullong2)spu_cmpeq(a_exp, 0); /* really is a_exp_0 */ + a_nan = spu_andc(a_exp_1s, a_mant_0); + a_zero = spu_and (a_denorm, a_mant_0); + a_inf = spu_and (a_exp_1s, a_mant_0); + + b_denorm = (vec_ullong2)spu_cmpeq(b_exp, 0); /* really is b_exp_0 */ + b_nan = spu_andc(b_exp_1s, b_mant_0); + b_zero = spu_and (b_denorm, b_mant_0); + b_inf = spu_and (b_exp_1s, b_mant_0); + + /* Scale denorm inputs to into normalized numbers by conditionally scaling the + * input parameters. + */ + a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask)); + a = spu_sel(a_in, a, a_denorm); + + b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask)); + b = spu_sel(b_in, b, b_denorm); + +#endif /* __SPU_EDP__ */ + + /* Extract the divisor and dividend exponent and force parameters into the signed + * range [1.0,2.0) or [-1.0,2.0). + */ + exp_a = spu_and((vec_ullong2)a, exp_mask); + exp_b = spu_and((vec_ullong2)b, exp_mask); + + mant_a = spu_sel(a, one, (vec_ullong2)exp_mask); + mant_b = spu_sel(b, one, (vec_ullong2)exp_mask); + + /* Approximate the single reciprocal of b by using + * the single precision reciprocal estimate followed by one + * single precision iteration of Newton-Raphson. 
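The refinement described above operates on mantissas already forced into [1.0, 2.0), so the estimate stays well inside single precision range. A scalar sketch of the same sequence, assuming plain C; a truncated float division stands in for the roughly 12-bit spu_re estimate, and the names are illustrative:

    /* Newton-Raphson refinement of a/b following the patch's structure:
     * estimate of 1/b, one single precision step, one double precision step
     * on the reciprocal, then a final correction applied to the quotient. */
    static double divide_sketch(double mant_a, double mant_b)
    {
        float  bf    = (float)mant_b;
        float  inv_f = 1.0f / bf;                        /* stand-in for spu_re */

        inv_f = inv_f + inv_f * (1.0f - bf * inv_f);     /* SP Newton-Raphson step */

        double inv = (double)inv_f;
        inv = inv + inv * (1.0 - mant_b * inv);          /* DP step on 1/b */
        double q0 = mant_a * inv;
        return q0 + inv * (mant_a - mant_b * q0);        /* DP correction of a/b */
    }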
+ */ + mant_bf = spu_roundtf(mant_b); + inv_bf = spu_re(mant_bf); + inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf); + + /* Perform 2 more Newton-Raphson iterations in double precision. The + * result (q1) is in the range (0.5, 2.0). + */ + inv_b = spu_extend(inv_bf); + inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b); + q0 = spu_mul(mant_a, inv_b); + q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0); + + + /* Determine the exponent correction factor that must be applied + * to q1 by taking into account the exponent of the normalized inputs + * and the scale factors that were applied to normalize them. + */ + exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20); + exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34), spu_and((vec_int4)b_denorm, 0x34))); + + /* Bias the quotient exponent depending on the sign of the exponent correction + * factor so that a single multiplier will ensure the entire double precision + * domain (including denorms) can be achieved. + * + * exp bias q1 adjust exp + * ===== ======== ========== + * positive 2^+65 -65 + * negative 2^-64 +64 + */ + exp_bias = spu_xor(spu_rlmaska(exp, -31), 64); + + + exp = spu_sub(exp, exp_bias); + + q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask); + + /* Compute a multiplier (mult) to applied to the quotient (q1) to produce the + * expected result. + */ + exp = spu_add(exp, 0x3FF); + no_underflow = spu_cmpgt(exp, 0); + overflow = spu_cmpgt(exp, 0x7FF); + exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow); + exp = spu_and(exp, (vec_int4)exp_mask); + mult = spu_sel((vec_double2)exp, (vec_double2)exp_mask, (vec_ullong2)overflow); + + /* Handle special value conditions. These include: + * + * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN + * results. + * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN a INFINITY results. + * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results. 
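A scalar view of the three rules above, shown as a plain decision function (illustrative C, not the selection-mask code the patch uses; normal_quotient stands for the already computed finite result):

    #include <math.h>

    /* Resolve special operands for a/b per the rules above: NaN operands,
     * 0/0 and Inf/Inf give NaN; Inf/x and x/0 give a signed Inf;
     * 0/x and x/Inf give a signed 0. */
    static double div_special_cases(double a, double b, double normal_quotient)
    {
        double sign = copysign(1.0, a) * copysign(1.0, b);

        if (isnan(a) || isnan(b) || (a == 0.0 && b == 0.0) || (isinf(a) && isinf(b)))
            return NAN;
        if (isinf(a) || b == 0.0)
            return sign * INFINITY;
        if (a == 0.0 || isinf(b))
            return sign * 0.0;
        return normal_quotient;
    }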
+ */ + mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf)); + mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero)); + + nan = spu_or(a_nan, b_nan); + nan = spu_or(nan, spu_and(a_zero, b_zero)); + nan = spu_or(nan, spu_and(a_inf, b_inf)); + + mult = spu_or(mult, (vec_double2)nan); + + /* Scale the final quotient */ + + q2 = spu_mul(q1, mult); + + return (q2); +} + +#endif /* _DIVD2_H_ */ +#endif /* __SPU__ */ Index: src/newlib/libm/machine/spu/headers/erf_utils.h =================================================================== --- src.orig/newlib/libm/machine/spu/headers/erf_utils.h +++ src/newlib/libm/machine/spu/headers/erf_utils.h @@ -166,57 +166,6 @@ _tresult = spu_mul(_tresult, spu_splats(TWO_OVER_SQRT_PI)); \ } -#define TAYLOR_ERFF4(_xabs, _xsqu, _tresult) { \ - _tresult = spu_madd(_xsqu, spu_splats((float)TAYLOR_ERF_45), spu_splats((float)TAYLOR_ERF_44)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_43)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_42)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_41)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_40)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_39)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_38)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_37)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_36)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_35)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_34)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_33)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_32)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_31)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_30)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_29)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_28)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_27)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_26)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_25)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_24)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_23)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_22)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_21)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_20)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_19)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_18)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_17)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_16)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_15)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_14)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_13)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_12)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_11)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_10)); \ - _tresult = spu_madd(_tresult, _xsqu, 
spu_splats((float)TAYLOR_ERF_09)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_08)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_07)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_06)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_05)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_04)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_03)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_02)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_01)); \ - _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_00)); \ - _tresult = spu_mul(_tresult, _xabs); \ - _tresult = spu_mul(_tresult, spu_splats((float)TWO_OVER_SQRT_PI)); \ -} - - /* * Continued Fractions Approximation of Erfc() @@ -266,43 +215,7 @@ vec_float4 inv_xsqu; \ inv_xsqu = _recipf4(_xsqu); \ v = spu_mul(inv_xsqu, onehalff); \ - p = spu_splats(3.025f); q = onef; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(40.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(39.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(38.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(37.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(36.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(35.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(34.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(33.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(32.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(31.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(30.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(29.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(28.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(27.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(26.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(25.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(24.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(23.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(22.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(21.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(20.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(19.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(18.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(17.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(16.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(15.0f)), plast); q = plast; plast = p; qlast = q; \ 
- p = spu_madd(qlast, spu_mul(v, spu_splats(14.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(13.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(12.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(11.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats(10.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats( 9.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats( 8.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats( 7.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats( 6.0f)), plast); q = plast; plast = p; qlast = q; \ - p = spu_madd(qlast, spu_mul(v, spu_splats( 5.0f)), plast); q = plast; plast = p; qlast = q; \ + p = spu_splats(1.945f); q = onef; plast = p; qlast = q; \ p = spu_madd(qlast, spu_mul(v, spu_splats( 4.0f)), plast); q = plast; plast = p; qlast = q; \ p = spu_madd(qlast, spu_mul(v, spu_splats( 3.0f)), plast); q = plast; plast = p; qlast = q; \ p = spu_madd(qlast, spu_mul(v, spu_splats( 2.0f)), plast); q = plast; plast = p; qlast = q; \ Index: src/newlib/libm/machine/spu/headers/erfcd2.h =================================================================== --- src.orig/newlib/libm/machine/spu/headers/erfcd2.h +++ src/newlib/libm/machine/spu/headers/erfcd2.h @@ -80,15 +80,10 @@ static __inline vector double _erfcd2(ve vec_float4 approx_point = spu_splats(1.71f); vec_double2 xabs, xsqu, xsign; - vec_uint4 xhigh, xabshigh; - vec_uint4 isnan, isneg; + vec_uint4 isneg; vec_double2 tresult, presult, result; xsign = spu_and(x, sign_mask); - - /* Force Denorms to 0 */ - x = spu_add(x, zerod); - xabs = spu_andc(x, sign_mask); xsqu = spu_mul(x, x); @@ -118,18 +113,11 @@ static __inline vector double _erfcd2(ve result = spu_sel(tresult, presult, (vec_ullong2)spu_cmpgt(xf, approx_point)); /* - * Special cases/errors. 
+ * Special cases */ - xhigh = (vec_uint4)spu_shuffle(x, x, dup_even); - xabshigh = (vec_uint4)spu_shuffle(xabs, xabs, dup_even); - - /* x = +/- infinite */ - result = spu_sel(result, zerod, (vec_ullong2)spu_cmpeq(xhigh, 0x7FF00000)); - result = spu_sel(result, twod, (vec_ullong2)spu_cmpeq(xhigh, 0xFFF00000)); - - /* x = nan, return x */ - isnan = spu_cmpgt(xabshigh, 0x7FF00000); - result = spu_sel(result, x, (vec_ullong2)isnan); + result = spu_sel(result, twod, spu_testsv(x, SPU_SV_NEG_INFINITY)); + result = spu_sel(result, zerod, spu_testsv(x, SPU_SV_POS_INFINITY)); + result = spu_sel(result, x, spu_testsv(x, SPU_SV_NEG_DENORM | SPU_SV_POS_DENORM)); return result; } Index: src/newlib/libm/machine/spu/headers/erfcf4.h =================================================================== --- src.orig/newlib/libm/machine/spu/headers/erfcf4.h +++ src/newlib/libm/machine/spu/headers/erfcf4.h @@ -40,11 +40,11 @@ #define _ERFCF4_H_ 1 #include - -#include "expf4.h" +#include "erff4.h" +#include "erf_utils.h" #include "recipf4.h" +#include "expf4.h" #include "divf4.h" -#include "erf_utils.h" /* * FUNCTION @@ -63,56 +63,374 @@ static __inline vector float _erfcf4(vector float x) { - vec_float4 onehalff = spu_splats(0.5f); - vec_float4 zerof = spu_splats(0.0f); - vec_float4 onef = spu_splats(1.0f); - vec_float4 twof = spu_splats(2.0f); - vec_float4 sign_mask = spu_splats(-0.0f); + vec_float4 sign_maskf = spu_splats(-0.0f); + vec_float4 zerof = spu_splats(0.0f); + vec_float4 onehalff = spu_splats(0.5f); + vec_float4 onef = spu_splats(1.0f); + vec_float4 twof = spu_splats(2.0f); + vec_float4 clamp = spu_splats(10.0542f); // Erfc = 0 above this (in single precision) + vec_float4 xabs = spu_andc(x, sign_maskf); + vec_float4 result; - /* This is where we switch from near zero approx. */ - vec_float4 approx_point = spu_splats(0.89f); + /* + * First thing we do is setup the description of each partition. + * This consists of: + * - Start x of partition + * - Offset (used for evaluating power series expanded around a point) + * - Truncation adjustment. 
+ */ - vec_float4 xabs, xsqu, xsign; - vec_uint4 isneg; - vec_float4 tresult, presult, result; - xsign = spu_and(x, sign_mask); + /*************************************************************** + * REGION 0: Approximation Near 0 from Above + * + */ +#define SDM_ERFCF4_0_START 0.0f +#define SDM_ERFCF4_0_OFF 0.0f +#define SDM_ERFCF4_0_TRUNC 1u + +#define SDM_ERFCF4_0_00 0.9999999999999949135f +#define SDM_ERFCF4_0_01 -1.1283791670931702608f +#define SDM_ERFCF4_0_02 -1.8051894620430502228e-10f +#define SDM_ERFCF4_0_03 0.37612639455729408814f +#define SDM_ERFCF4_0_04 -8.8929793006257568262e-8f +#define SDM_ERFCF4_0_05 -0.11283705324835578294f +#define SDM_ERFCF4_0_06 -5.4670494993502827210e-6f +#define SDM_ERFCF4_0_07 0.026889802515535093351f +#define SDM_ERFCF4_0_08 -0.000071498114084857387620f +#define SDM_ERFCF4_0_09 -0.0050714210985129775210f +#define SDM_ERFCF4_0_10 -0.00022683372291701701701f +#define SDM_ERFCF4_0_11 0.0010796064437231401311f +#define SDM_ERFCF4_0_12 -0.00012982218714593684809f +#define SDM_ERFCF4_0_13 -0.00010102962499433144847f +#define SDM_ERFCF4_0_14 0.000025784829228223517886f - /* Force Denorms to 0 */ - x = spu_add(x, zerof); - xabs = spu_andc(x, sign_mask); - xsqu = spu_mul(x, x); + /*************************************************************** + * REGION 1: Near 1 + */ +#define SDM_ERFCF4_1_START 0.88f +#define SDM_ERFCF4_1_OFF 1.125f +#define SDM_ERFCF4_1_TRUNC 1u + +#define SDM_ERFCF4_1_00 0.111611768298292224f +#define SDM_ERFCF4_1_01 -0.318273958500769283f +#define SDM_ERFCF4_1_02 0.358058203313365464f +#define SDM_ERFCF4_1_03 -0.162452332984767661f +#define SDM_ERFCF4_1_04 -0.0279732971338566734f +#define SDM_ERFCF4_1_05 0.0613236836056658061f +#define SDM_ERFCF4_1_06 -0.0155368354497628942f +#define SDM_ERFCF4_1_07 -0.00960689422582997228f +#define SDM_ERFCF4_1_08 0.00603126088310672760f +#define SDM_ERFCF4_1_09 0.000360191989801368303f +#define SDM_ERFCF4_1_10 -0.00115326735470205975f +#define SDM_ERFCF4_1_11 0.000176955087857924673f +#define SDM_ERFCF4_1_12 0.000141558399011799664f +#define SDM_ERFCF4_1_13 -0.0000494556968345700811f +#define SDM_ERFCF4_1_14 0.0f - /* - * Use Taylor Series for x near 0 - * Preserve sign of x in result, since erf(-x) = -erf(x) - * This approximation is for erf, so adjust for erfc. - */ - TAYLOR_ERFF4(xabs, xsqu, tresult); - tresult = spu_or(tresult, xsign); - tresult = spu_sub(onef, tresult); - /* - * Now, use the Continued Fractions approximation away - * from 0. 
If x < 0, use erfc(-x) = 2 - erfc(x) + /*************************************************************** + * REGION 2: */ - CONTFRAC_ERFCF4(xabs, xsqu, presult); - isneg = spu_rlmaska((vec_uint4)x, -32); - presult = spu_sel(presult, spu_sub(twof, presult), isneg); +#define SDM_ERFCF4_2_START 1.50f +#define SDM_ERFCF4_2_OFF 1.75f +#define SDM_ERFCF4_2_TRUNC 0u + +#define SDM_ERFCF4_2_00 0.0133283287808175777f +#define SDM_ERFCF4_2_01 -0.0527749959301503715f +#define SDM_ERFCF4_2_02 0.0923562428777631589f +#define SDM_ERFCF4_2_03 -0.0901572847140068856f +#define SDM_ERFCF4_2_04 0.0481022098321682995f +#define SDM_ERFCF4_2_05 -0.00662436146831574865f +#define SDM_ERFCF4_2_06 -0.00896304509872736070f +#define SDM_ERFCF4_2_07 0.00605875147039124009f +#define SDM_ERFCF4_2_08 -0.000730051247140304322f +#define SDM_ERFCF4_2_09 -0.000894181745354844871f +#define SDM_ERFCF4_2_10 0.000442750499254694174f +#define SDM_ERFCF4_2_11 5.44549038611738718e-6f +#define SDM_ERFCF4_2_12 -0.0000686716770072681921f +#define SDM_ERFCF4_2_13 0.0000177205746526325771f +#define SDM_ERFCF4_2_14 0.0f + + + /*************************************************************** + * REGION 3: + */ +#define SDM_ERFCF4_3_START 2.0f +#define SDM_ERFCF4_3_OFF 2.25f +#define SDM_ERFCF4_3_TRUNC 1u + +#define SDM_ERFCF4_3_00 0.00146271658668117865f +#define SDM_ERFCF4_3_01 -0.00714231902201798319f +#define SDM_ERFCF4_3_02 0.0160702177995404628f +#define SDM_ERFCF4_3_03 -0.0217245536919713662f +#define SDM_ERFCF4_3_04 0.0190833836369542972f +#define SDM_ERFCF4_3_05 -0.0106576791656674587f +#define SDM_ERFCF4_3_06 0.00290435707106278173f +#define SDM_ERFCF4_3_07 0.000670455969951892490f +#define SDM_ERFCF4_3_08 -0.000999493712611392590f +#define SDM_ERFCF4_3_09 0.000369380417703939461f +#define SDM_ERFCF4_3_10 0.0000114665831641414663f +#define SDM_ERFCF4_3_11 -0.0000651349432823388933f +#define SDM_ERFCF4_3_12 0.0000226882426454011034f +#define SDM_ERFCF4_3_13 1.33207467538330703e-6f +#define SDM_ERFCF4_3_14 0.0f + + + /*************************************************************** + * REGION 4: + */ +#define SDM_ERFCF4_4_START 2.46f +#define SDM_ERFCF4_4_OFF 2.75f +#define SDM_ERFCF4_4_TRUNC 1u + +#define SDM_ERFCF4_4_00 0.000100621922119681351f +#define SDM_ERFCF4_4_01 -0.000586277247093792324f +#define SDM_ERFCF4_4_02 0.00161226242950792873f +#define SDM_ERFCF4_4_03 -0.00276038870506660526f +#define SDM_ERFCF4_4_04 0.00325811365963060576f +#define SDM_ERFCF4_4_05 -0.00275580841407368484f +#define SDM_ERFCF4_4_06 0.00165732740366604948f +#define SDM_ERFCF4_4_07 -0.000646040956672447276f +#define SDM_ERFCF4_4_08 0.0000890115712124397128f +#define SDM_ERFCF4_4_09 0.0000712231147231515843f +#define SDM_ERFCF4_4_10 -0.0000549969924243893176f +#define SDM_ERFCF4_4_11 0.0000158438047120425837f +#define SDM_ERFCF4_4_12 1.07113381370613701e-6f +#define SDM_ERFCF4_4_13 0.0f +#define SDM_ERFCF4_4_14 0.0f + + + /*************************************************************** + * REGION 5: + */ +#define SDM_ERFCF4_5_START 2.95f +#define SDM_ERFCF4_5_OFF 3.25f +#define SDM_ERFCF4_5_TRUNC 1u + +#define SDM_ERFCF4_5_00 4.30277946372736864e-6f +#define SDM_ERFCF4_5_01 -0.0000291890253835816989f +#define SDM_ERFCF4_5_02 0.0000948643324966405230f +#define SDM_ERFCF4_5_03 -0.000195809711948193862f +#define SDM_ERFCF4_5_04 0.000286569337750268210f +#define SDM_ERFCF4_5_05 -0.000313797225490890491f +#define SDM_ERFCF4_5_06 0.000263528504215059911f +#define SDM_ERFCF4_5_07 -0.000169991414511391200f +#define SDM_ERFCF4_5_08 0.0000816476305301353867f 
+#define SDM_ERFCF4_5_09 -0.0000259138470056606003f +#define SDM_ERFCF4_5_10 2.32886623721087698e-6f +#define SDM_ERFCF4_5_11 2.86429946075621661e-6f +#define SDM_ERFCF4_5_12 0.0f +#define SDM_ERFCF4_5_13 0.0f +#define SDM_ERFCF4_5_14 0.0f + + + /*************************************************************** + * REGION 6: + */ +#define SDM_ERFCF4_6_START 3.45f +#define SDM_ERFCF4_6_OFF 3.625f +#define SDM_ERFCF4_6_TRUNC 1u + +#define SDM_ERFCF4_6_00 2.95140192507759025e-7f +#define SDM_ERFCF4_6_01 -2.21592028463311237e-6f +#define SDM_ERFCF4_6_02 8.03271103179503198e-6f +#define SDM_ERFCF4_6_03 -0.0000186737448986269582f +#define SDM_ERFCF4_6_04 0.0000311685922848296785f +#define SDM_ERFCF4_6_05 -0.0000395923353434149457f +#define SDM_ERFCF4_6_06 0.0000395291139306718091f +#define SDM_ERFCF4_6_07 -0.0000315141214892874786f +#define SDM_ERFCF4_6_08 0.0000200891481859513911f +#define SDM_ERFCF4_6_09 -0.0000100551790824327187f +#define SDM_ERFCF4_6_10 3.71860071281680690e-6f +#define SDM_ERFCF4_6_11 -8.05502983594814356e-7f +#define SDM_ERFCF4_6_12 -7.67662978382552699e-8f +#define SDM_ERFCF4_6_13 1.56408548403936681e-7f +#define SDM_ERFCF4_6_14 0.0f +#define SDM_ERFCF4_6_15 0.0f +#define SDM_ERFCF4_6_16 0.0f +#define SDM_ERFCF4_6_17 0.0f + + + /*************************************************************** + * REGION 7: + */ +#define SDM_ERFCF4_7_START 3.55f +#define SDM_ERFCF4_7_OFF 4.0f +#define SDM_ERFCF4_7_TRUNC 2u + +#define SDM_ERFCF4_7_00 1.54172579002800189e-8f +#define SDM_ERFCF4_7_01 -1.2698234671866558e-7f +#define SDM_ERFCF4_7_02 5.0792938687466233e-7f +#define SDM_ERFCF4_7_03 -1.3121509160928777e-6f +#define SDM_ERFCF4_7_04 2.4549920365608679e-6f +#define SDM_ERFCF4_7_05 -3.5343419836695254e-6f +#define SDM_ERFCF4_7_06 4.0577914351431357e-6f +#define SDM_ERFCF4_7_07 -3.7959659297660776e-6f +#define SDM_ERFCF4_7_08 2.9264391936639771e-6f +#define SDM_ERFCF4_7_09 -1.8631747969134646e-6f +#define SDM_ERFCF4_7_10 9.702839808793979e-7f +#define SDM_ERFCF4_7_11 -4.0077792841735885e-7f +#define SDM_ERFCF4_7_12 1.2017256123590621e-7f +#define SDM_ERFCF4_7_13 -1.7432381111955779e-8f +#define SDM_ERFCF4_7_14 0.0f + + + /*************************************************************** + * Now we load the description of each partition. 
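The vector literals loaded below lay the eight partitions out so that, later on, a single spu_shuffle per constant pulls each lane's own region data. Reduced to scalar terms the selection is just an index computation, sketched here with a hypothetical region_start[] array standing in for the SDM_ERFCF4_*_START constants:

    /* Scalar equivalent of the shuffle-pattern selection further below:
     * an input belongs to the last of the eight regions whose start point
     * it exceeds, and that index then picks the coefficient set, the x
     * offset and the truncation correction for the lane. */
    static int select_region(float xabs, const float region_start[8])
    {
        int r = 0;
        for (int i = 1; i < 8; i++)
            if (xabs > region_start[i])
                r = i;
        return r;
    }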
+ */ + + /* Start point for each partition */ + vec_float4 r1start = spu_splats(SDM_ERFCF4_1_START); + vec_float4 r2start = spu_splats(SDM_ERFCF4_2_START); + vec_float4 r3start = spu_splats(SDM_ERFCF4_3_START); + vec_float4 r4start = spu_splats(SDM_ERFCF4_4_START); + vec_float4 r5start = spu_splats(SDM_ERFCF4_5_START); + vec_float4 r6start = spu_splats(SDM_ERFCF4_6_START); + vec_float4 r7start = spu_splats(SDM_ERFCF4_7_START); + + /* X Offset for each partition */ + vec_float4 xoffseta = (vec_float4) {SDM_ERFCF4_0_OFF, SDM_ERFCF4_1_OFF, SDM_ERFCF4_2_OFF, SDM_ERFCF4_3_OFF}; + vec_float4 xoffsetb = (vec_float4) {SDM_ERFCF4_4_OFF, SDM_ERFCF4_5_OFF, SDM_ERFCF4_6_OFF, SDM_ERFCF4_7_OFF}; + + /* Truncation Correction for each partition */ + vec_uint4 tcorra = (vec_uint4) {SDM_ERFCF4_0_TRUNC, SDM_ERFCF4_1_TRUNC, SDM_ERFCF4_2_TRUNC, SDM_ERFCF4_3_TRUNC}; + vec_uint4 tcorrb = (vec_uint4) {SDM_ERFCF4_4_TRUNC, SDM_ERFCF4_5_TRUNC, SDM_ERFCF4_6_TRUNC, SDM_ERFCF4_7_TRUNC}; + + /* The coefficients for each partition */ + vec_float4 c00a = (vec_float4) {SDM_ERFCF4_0_00, SDM_ERFCF4_1_00, SDM_ERFCF4_2_00, SDM_ERFCF4_3_00}; + vec_float4 c01a = (vec_float4) {SDM_ERFCF4_0_01, SDM_ERFCF4_1_01, SDM_ERFCF4_2_01, SDM_ERFCF4_3_01}; + vec_float4 c02a = (vec_float4) {SDM_ERFCF4_0_02, SDM_ERFCF4_1_02, SDM_ERFCF4_2_02, SDM_ERFCF4_3_02}; + vec_float4 c03a = (vec_float4) {SDM_ERFCF4_0_03, SDM_ERFCF4_1_03, SDM_ERFCF4_2_03, SDM_ERFCF4_3_03}; + vec_float4 c04a = (vec_float4) {SDM_ERFCF4_0_04, SDM_ERFCF4_1_04, SDM_ERFCF4_2_04, SDM_ERFCF4_3_04}; + vec_float4 c05a = (vec_float4) {SDM_ERFCF4_0_05, SDM_ERFCF4_1_05, SDM_ERFCF4_2_05, SDM_ERFCF4_3_05}; + vec_float4 c06a = (vec_float4) {SDM_ERFCF4_0_06, SDM_ERFCF4_1_06, SDM_ERFCF4_2_06, SDM_ERFCF4_3_06}; + vec_float4 c07a = (vec_float4) {SDM_ERFCF4_0_07, SDM_ERFCF4_1_07, SDM_ERFCF4_2_07, SDM_ERFCF4_3_07}; + vec_float4 c08a = (vec_float4) {SDM_ERFCF4_0_08, SDM_ERFCF4_1_08, SDM_ERFCF4_2_08, SDM_ERFCF4_3_08}; + vec_float4 c09a = (vec_float4) {SDM_ERFCF4_0_09, SDM_ERFCF4_1_09, SDM_ERFCF4_2_09, SDM_ERFCF4_3_09}; + vec_float4 c10a = (vec_float4) {SDM_ERFCF4_0_10, SDM_ERFCF4_1_10, SDM_ERFCF4_2_10, SDM_ERFCF4_3_10}; + vec_float4 c11a = (vec_float4) {SDM_ERFCF4_0_11, SDM_ERFCF4_1_11, SDM_ERFCF4_2_11, SDM_ERFCF4_3_11}; + vec_float4 c12a = (vec_float4) {SDM_ERFCF4_0_12, SDM_ERFCF4_1_12, SDM_ERFCF4_2_12, SDM_ERFCF4_3_12}; + vec_float4 c13a = (vec_float4) {SDM_ERFCF4_0_13, SDM_ERFCF4_1_13, SDM_ERFCF4_2_13, SDM_ERFCF4_3_13}; + vec_float4 c14a = (vec_float4) {SDM_ERFCF4_0_14, SDM_ERFCF4_1_14, SDM_ERFCF4_2_14, SDM_ERFCF4_3_14}; + + vec_float4 c00b = (vec_float4) {SDM_ERFCF4_4_00, SDM_ERFCF4_5_00, SDM_ERFCF4_6_00, SDM_ERFCF4_7_00}; + vec_float4 c01b = (vec_float4) {SDM_ERFCF4_4_01, SDM_ERFCF4_5_01, SDM_ERFCF4_6_01, SDM_ERFCF4_7_01}; + vec_float4 c02b = (vec_float4) {SDM_ERFCF4_4_02, SDM_ERFCF4_5_02, SDM_ERFCF4_6_02, SDM_ERFCF4_7_02}; + vec_float4 c03b = (vec_float4) {SDM_ERFCF4_4_03, SDM_ERFCF4_5_03, SDM_ERFCF4_6_03, SDM_ERFCF4_7_03}; + vec_float4 c04b = (vec_float4) {SDM_ERFCF4_4_04, SDM_ERFCF4_5_04, SDM_ERFCF4_6_04, SDM_ERFCF4_7_04}; + vec_float4 c05b = (vec_float4) {SDM_ERFCF4_4_05, SDM_ERFCF4_5_05, SDM_ERFCF4_6_05, SDM_ERFCF4_7_05}; + vec_float4 c06b = (vec_float4) {SDM_ERFCF4_4_06, SDM_ERFCF4_5_06, SDM_ERFCF4_6_06, SDM_ERFCF4_7_06}; + vec_float4 c07b = (vec_float4) {SDM_ERFCF4_4_07, SDM_ERFCF4_5_07, SDM_ERFCF4_6_07, SDM_ERFCF4_7_07}; + vec_float4 c08b = (vec_float4) {SDM_ERFCF4_4_08, SDM_ERFCF4_5_08, SDM_ERFCF4_6_08, SDM_ERFCF4_7_08}; + vec_float4 c09b = (vec_float4) {SDM_ERFCF4_4_09, 
SDM_ERFCF4_5_09, SDM_ERFCF4_6_09, SDM_ERFCF4_7_09}; + vec_float4 c10b = (vec_float4) {SDM_ERFCF4_4_10, SDM_ERFCF4_5_10, SDM_ERFCF4_6_10, SDM_ERFCF4_7_10}; + vec_float4 c11b = (vec_float4) {SDM_ERFCF4_4_11, SDM_ERFCF4_5_11, SDM_ERFCF4_6_11, SDM_ERFCF4_7_11}; + vec_float4 c12b = (vec_float4) {SDM_ERFCF4_4_12, SDM_ERFCF4_5_12, SDM_ERFCF4_6_12, SDM_ERFCF4_7_12}; + vec_float4 c13b = (vec_float4) {SDM_ERFCF4_4_13, SDM_ERFCF4_5_13, SDM_ERFCF4_6_13, SDM_ERFCF4_7_13}; + vec_float4 c14b = (vec_float4) {SDM_ERFCF4_4_14, SDM_ERFCF4_5_14, SDM_ERFCF4_6_14, SDM_ERFCF4_7_14}; + + vec_uchar16 shuffle0 = (vec_uchar16) spu_splats(0x00010203); + vec_uchar16 shuffle1 = (vec_uchar16) spu_splats(0x04050607); + vec_uchar16 shuffle2 = (vec_uchar16) spu_splats(0x08090A0B); + vec_uchar16 shuffle3 = (vec_uchar16) spu_splats(0x0C0D0E0F); + vec_uchar16 shuffle4 = (vec_uchar16) spu_splats(0x10111213); + vec_uchar16 shuffle5 = (vec_uchar16) spu_splats(0x14151617); + vec_uchar16 shuffle6 = (vec_uchar16) spu_splats(0x18191A1B); + vec_uchar16 shuffle7 = (vec_uchar16) spu_splats(0x1C1D1E1F); + /* - * Select the appropriate approximation. + * Determine the shuffle pattern based on which partition + * each element of x is in. */ - result = spu_sel(tresult, presult, spu_cmpgt(xabs, approx_point)); + vec_uchar16 gt_r1start = (vec_uchar16)spu_cmpabsgt(x, r1start); + vec_uchar16 gt_r2start = (vec_uchar16)spu_cmpabsgt(x, r2start); + vec_uchar16 gt_r3start = (vec_uchar16)spu_cmpabsgt(x, r3start); + vec_uchar16 gt_r4start = (vec_uchar16)spu_cmpabsgt(x, r4start); + vec_uchar16 gt_r5start = (vec_uchar16)spu_cmpabsgt(x, r5start); + vec_uchar16 gt_r6start = (vec_uchar16)spu_cmpabsgt(x, r6start); + vec_uchar16 gt_r7start = (vec_uchar16)spu_cmpabsgt(x, r7start); + + vec_uchar16 shufflepattern; + shufflepattern = spu_sel(shuffle0, shuffle1, gt_r1start); + shufflepattern = spu_sel(shufflepattern, shuffle2, gt_r2start); + shufflepattern = spu_sel(shufflepattern, shuffle3, gt_r3start); + shufflepattern = spu_sel(shufflepattern, shuffle4, gt_r4start); + shufflepattern = spu_sel(shufflepattern, shuffle5, gt_r5start); + shufflepattern = spu_sel(shufflepattern, shuffle6, gt_r6start); + shufflepattern = spu_sel(shufflepattern, shuffle7, gt_r7start); + + + /* Use the shuffle pattern to select the coefficients */ + vec_float4 coeff_14 = spu_shuffle(c14a, c14b, shufflepattern); + vec_float4 coeff_13 = spu_shuffle(c13a, c13b, shufflepattern); + vec_float4 coeff_12 = spu_shuffle(c12a, c12b, shufflepattern); + vec_float4 coeff_11 = spu_shuffle(c11a, c11b, shufflepattern); + vec_float4 coeff_10 = spu_shuffle(c10a, c10b, shufflepattern); + vec_float4 coeff_09 = spu_shuffle(c09a, c09b, shufflepattern); + vec_float4 coeff_08 = spu_shuffle(c08a, c08b, shufflepattern); + vec_float4 coeff_07 = spu_shuffle(c07a, c07b, shufflepattern); + vec_float4 coeff_06 = spu_shuffle(c06a, c06b, shufflepattern); + vec_float4 coeff_05 = spu_shuffle(c05a, c05b, shufflepattern); + vec_float4 coeff_04 = spu_shuffle(c04a, c04b, shufflepattern); + vec_float4 coeff_03 = spu_shuffle(c03a, c03b, shufflepattern); + vec_float4 coeff_02 = spu_shuffle(c02a, c02b, shufflepattern); + vec_float4 coeff_01 = spu_shuffle(c01a, c01b, shufflepattern); + vec_float4 coeff_00 = spu_shuffle(c00a, c00b, shufflepattern); + + vec_float4 xoffset = spu_shuffle(xoffseta, xoffsetb, shufflepattern); + vec_uint4 tcorrection = spu_shuffle(tcorra, tcorrb, shufflepattern); + /* - * Special cases/errors. + * We've completed the coeff. setup. Now we actually do the + * approximation below. 
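Once each lane holds its region's constants, the evaluation that follows is a Horner polynomial in (|x| - offset) plus a small integer bump of the result bits to absorb the systematic truncation error. A scalar sketch of those two steps, with coeff, offset and trunc standing for the per-region values selected above (names are illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Horner evaluation about the region's expansion point, followed by the
     * ULP-level truncation correction applied to the float's bit pattern,
     * as in spu_add((vec_uint4)result, tcorrection). */
    static float eval_region(float xabs, const float coeff[15], float offset,
                             uint32_t trunc)
    {
        float xa = xabs - offset;
        float r = coeff[14];
        for (int k = 13; k >= 0; k--)
            r = r * xa + coeff[k];

        uint32_t bits;
        memcpy(&bits, &r, sizeof bits);
        bits += trunc;
        memcpy(&r, &bits, sizeof r);
        return r;
    }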
*/ - /* x = +/- infinite */ - result = spu_sel(result, zerof, spu_cmpeq((vec_uint4)xabs, 0x7F800000)); - result = spu_sel(result, twof, spu_cmpeq((vec_uint4)xabs, 0xFF800000)); + /* Adjust x value here (for approximations about a point) */ + vec_float4 xappr = spu_sub(xabs, xoffset); + + + /* Now we do the multiplies. + * Use Horner's method. + */ + result = coeff_14; + result = spu_madd(xappr, result, coeff_13); + result = spu_madd(xappr, result, coeff_12); + result = spu_madd(xappr, result, coeff_11); + result = spu_madd(xappr, result, coeff_10); + result = spu_madd(xappr, result, coeff_09); + result = spu_madd(xappr, result, coeff_08); + result = spu_madd(xappr, result, coeff_07); + result = spu_madd(xappr, result, coeff_06); + result = spu_madd(xappr, result, coeff_05); + result = spu_madd(xappr, result, coeff_04); + result = spu_madd(xappr, result, coeff_03); + result = spu_madd(xappr, result, coeff_02); + result = spu_madd(xappr, result, coeff_01); + result = spu_madd(xappr, result, coeff_00); + + /* Adjust due to systematic truncation. */ + result = (vec_float4)spu_add((vec_uint4)result, tcorrection); + + /* Use the continued fraction approximation for x above approx. 4 + * and below approx. 10 + */ + vec_float4 presult, xsqu; + xsqu = spu_mul(x, x); + CONTFRAC_ERFCF4(xabs, xsqu, presult); + + /* Select between polynomial and continued fraction */ + result = spu_sel(presult, result, spu_cmpgt(spu_splats(4.3f), xabs)); + + /* Above clamp value, set erfc = 0 */ + result = spu_sel(result, zerof, spu_cmpgt(xabs, clamp)); + + /* Negative x values */ + vec_uint4 gt0 = spu_cmpgt(x, zerof); + result = spu_sel(spu_sub(twof, result), result, gt0); return result; } Index: src/newlib/libm/machine/spu/headers/erfd2.h =================================================================== --- src.orig/newlib/libm/machine/spu/headers/erfd2.h +++ src/newlib/libm/machine/spu/headers/erfd2.h @@ -68,7 +68,6 @@ static __inline vector double _erfd2(vec { vec_uchar16 dup_even = ((vec_uchar16) { 0,1,2,3, 0,1,2,3, 8, 9,10,11, 8, 9,10,11 }); vec_double2 onehalfd = spu_splats(0.5); - vec_double2 zerod = spu_splats(0.0); vec_double2 oned = spu_splats(1.0); vec_double2 sign_mask = spu_splats(-0.0); @@ -76,15 +75,9 @@ static __inline vector double _erfd2(vec vec_float4 approx_point = spu_splats(1.77f); vec_double2 xabs, xsqu, xsign; - vec_uint4 xabshigh; - vec_uint4 isinf, isnan; vec_double2 tresult, presult, result; xsign = spu_and(x, sign_mask); - - /* Force Denorms to 0 */ - x = spu_add(x, zerod); - xabs = spu_andc(x, sign_mask); xsqu = spu_mul(x, x); @@ -112,15 +105,11 @@ static __inline vector double _erfd2(vec /* * Special cases/errors. 
*/ - xabshigh = (vec_uint4)spu_shuffle(xabs, xabs, dup_even); /* x = +/- infinite, return +/-1 */ - isinf = spu_cmpeq(xabshigh, 0x7FF00000); - result = spu_sel(result, oned, (vec_ullong2)isinf); - /* x = nan, return x */ - isnan = spu_cmpgt(xabshigh, 0x7FF00000); - result = spu_sel(result, x, (vec_ullong2)isnan); + result = spu_sel(result, oned, spu_testsv(x, SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY)); + result = spu_sel(result, x, spu_testsv(x, SPU_SV_NEG_DENORM | SPU_SV_POS_DENORM)); /* * Preserve sign in result, since erf(-x) = -erf(x) Index: src/newlib/libm/machine/spu/headers/lgammaf4.h =================================================================== --- src.orig/newlib/libm/machine/spu/headers/lgammaf4.h +++ src/newlib/libm/machine/spu/headers/lgammaf4.h @@ -426,7 +426,7 @@ static __inline vector float _lgammaf4(v vec_float4 xappr = spu_sub(xabs, xoffset); /* If in Stirling partition, do some setup before the madds */ - xappr = spu_sel(xappr, inv_xsqu, (vector unsigned int)gt_r7start); + xappr = spu_sel(xappr, inv_xsqu, gt_r7start); @@ -463,13 +463,13 @@ static __inline vector float _lgammaf4(v */ /* Finish the Near 0 formula */ - result = spu_sel(spu_sub(result, ln_x), result, (vector unsigned int)gt_r1start); + result = spu_sel(spu_sub(result, ln_x), result, gt_r1start); /* Finish Stirling's Approximation */ vec_float4 resultstirling = spu_madd(spu_sub(xabs, spu_splats(0.5f)), ln_x, halflog2pi); resultstirling = spu_sub(resultstirling, xabs); resultstirling = spu_add(spu_mul(result,inv_x), resultstirling); - result = spu_sel(result, resultstirling, (vector unsigned int)gt_r7start); + result = spu_sel(result, resultstirling, gt_r7start); /* Adjust due to systematic truncation */ Index: src/newlib/libm/machine/spu/headers/recipd2.h =================================================================== --- src.orig/newlib/libm/machine/spu/headers/recipd2.h +++ src/newlib/libm/machine/spu/headers/recipd2.h @@ -1,113 +1,168 @@ -/* -------------------------------------------------------------- */ -/* (C)Copyright 2001,2008, */ -/* International Business Machines Corporation, */ -/* Sony Computer Entertainment, Incorporated, */ -/* Toshiba Corporation, */ -/* */ -/* All Rights Reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the */ -/* following conditions are met: */ -/* */ -/* - Redistributions of source code must retain the above copyright*/ -/* notice, this list of conditions and the following disclaimer. */ -/* */ -/* - Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* - Neither the name of IBM Corporation nor the names of its */ -/* contributors may be used to endorse or promote products */ -/* derived from this software without specific prior written */ -/* permission. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */ -/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR */ -/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ -/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT */ -/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; */ -/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */ -/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */ -/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR */ -/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, */ -/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* -------------------------------------------------------------- */ -/* PROLOG END TAG zYx */ -#ifdef __SPU__ - -#ifndef _RECIPD2_H_ -#define _RECIPD2_H_ 1 - -#include - - -/* - * FUNCTION - * vector double _recipd2(vector double value) - * - * DESCRIPTION - * The _recipd2 function inverts "value" and returns the result. - * Computation is performed using the single precision reciprocal - * estimate and interpolate instructions to produce a 12 accurate - * estimate. - * - * One (1) iteration of a Newton-Raphson is performed to improve - * accuracy to single precision floating point. Two additional double - * precision iterations are needed to achieve a full double - * preicision result. - * - * The Newton-Raphson iteration is of the form: - * X[i+1] = X[i] * (2.0 - b*X[i]) - * where b is the input value to be inverted - * - */ -static __inline vector double _recipd2(vector double value_d) -{ - vector unsigned long long expmask = (vector unsigned long long) { 0x7FF0000000000000ULL, 0x7FF0000000000000ULL }; - vector float x0; - vector float value; - vector float two = spu_splats(2.0f); - vector double two_d = spu_splats(2.0); - vector double x1, x2, x3; - vector double bias; - - /* Bias the divisor to correct for double precision floating - * point exponents that are out of single precision range. - */ - bias = spu_xor(spu_and(value_d, (vector double)expmask), (vector double)expmask); - - value = spu_roundtf(spu_mul(value_d, bias)); - x0 = spu_re(value); - x1 = spu_extend(spu_mul(x0, spu_nmsub(value, x0, two))); - x1 = spu_mul(x1, bias); - x2 = spu_mul(x1, spu_nmsub(value_d, x1, two_d)); - x3 = spu_mul(x2, spu_nmsub(value_d, x2, two_d)); - - /* Handle input = +/- infinity or +/-0. */ - -#ifdef __SPU_EDP__ - vec_ullong2 is0inf = spu_testsv(value_d, SPU_SV_NEG_ZERO | SPU_SV_POS_ZERO | - SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY); -#else - vec_double2 nzero = spu_splats(-0.0); - vec_double2 xabs = spu_andc(value_d, nzero); - vector unsigned char swap = (vector unsigned char) {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; - vec_uint4 isinf = spu_cmpeq((vec_uint4)xabs, (vec_uint4)expmask); - vec_uint4 iszero = spu_cmpeq((vec_uint4)xabs, 0); - isinf = spu_and(isinf, spu_shuffle(isinf, isinf, swap)); - iszero = spu_and(iszero, spu_shuffle(iszero, iszero, swap)); - vec_ullong2 is0inf = (vec_ullong2)spu_or(isinf, iszero); -#endif /* __SPU_EDP__ */ - - x3 = spu_sel(x3, spu_xor(value_d, (vector double)expmask), is0inf); - - return (x3); -} - -#endif /* _RECIPD2_H_ */ -#endif /* __SPU__ */ +/* -------------------------------------------------------------- */ +/* (C)Copyright 2001,2008, */ +/* International Business Machines Corporation, */ +/* Sony Computer Entertainment, Incorporated, */ +/* Toshiba Corporation, */ +/* */ +/* All Rights Reserved. 
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the */
+/* following conditions are met: */
+/* */
+/* - Redistributions of source code must retain the above copyright*/
+/* notice, this list of conditions and the following disclaimer. */
+/* */
+/* - Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* - Neither the name of IBM Corporation nor the names of its */
+/* contributors may be used to endorse or promote products */
+/* derived from this software without specific prior written */
+/* permission. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
+/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR */
+/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
+/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT */
+/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; */
+/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
+/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */
+/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR */
+/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, */
+/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+/* -------------------------------------------------------------- */
+/* PROLOG END TAG zYx */
+#ifdef __SPU__
+
+#ifndef _RECIPD2_H_
+#define _RECIPD2_H_ 1
+
+#include <spu_intrinsics.h>
+
+
+/*
+ * FUNCTION
+ *      vector double _recipd2(vector double value)
+ *
+ * DESCRIPTION
+ *      The _recipd2 function inverts "value" and returns the result.
+ *      Computation is performed using the single precision reciprocal
+ *      estimate and interpolate instructions to produce a 12-bit accurate
+ *      estimate.
+ *
+ *      One (1) Newton-Raphson iteration is performed to improve
+ *      accuracy to single precision floating point. Two additional double
+ *      precision iterations are needed to achieve a full double
+ *      precision result.
+ *
+ *      The Newton-Raphson iteration is of the form:
+ *        a) X[i+1] = X[i] * (2.0 - b*X[i])
+ *           or
+ *        b) X[i+1] = X[i] + X[i]*(1.0 - X[i]*b)
+ *      where b is the input value to be inverted
+ *
+ *      The latter (b) form improves the accuracy to 99.95% correctly rounded.
+ */
+static __inline vector double _recipd2(vector double value_in)
+{
+  vec_float4 x0;
+  vec_float4 value;
+  vec_float4 one = spu_splats(1.0f);
+  vec_double2 one_d = spu_splats(1.0);
+  vec_double2 x1, x2, x3;
+  vec_double2 scale;
+  vec_double2 exp, value_d;
+  vec_ullong2 expmask = spu_splats(0x7FF0000000000000ULL);
+  vec_ullong2 is0inf;
+
+#ifdef __SPU_EDP__
+  vec_ullong2 isdenorm;
+  vec_ullong2 expmask_minus1 = spu_splats(0x7FE0000000000000ULL);
+
+  /* Determine special input values, i.e. whether the input is a denorm, infinity or 0. */
+
+  isdenorm = spu_testsv(value_in, (SPU_SV_POS_DENORM   | SPU_SV_NEG_DENORM));
+  is0inf   = spu_testsv(value_in, (SPU_SV_NEG_ZERO     | SPU_SV_POS_ZERO |
+                                   SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY));
+
+  /* Scale the divisor to correct for double precision floating
+   * point exponents that are out of single precision range.
+   */
+  exp = spu_and(value_in, (vec_double2)expmask);
+  scale = spu_xor(exp, (vec_double2)spu_sel(expmask, expmask_minus1, isdenorm));
+  value_d = spu_mul(value_in, scale);
+  value = spu_roundtf(value_d);
+
+  /* Perform reciprocal with 1 single precision and 2 double precision
+   * Newton-Raphson iterations.
+   */
+  x0 = spu_re(value);
+  x1 = spu_extend(spu_madd(spu_nmsub(value, x0, one), x0, x0));
+  x2 = spu_madd(spu_nmsub(value_d, x1, one_d), x1, x1);
+  x3 = spu_madd(spu_nmsub(value_d, x2, one_d), x2, x2);
+  x3 = spu_sel(spu_mul(x3, scale), spu_xor(value_in, (vector double)expmask), is0inf);
+
+#else /* !__SPU_EDP__ */
+
+  vec_uint4 isinf, iszero, isdenorm0;
+  vec_double2 value_abs;
+  vec_double2 sign = spu_splats(-0.0);
+  vec_double2 denorm_scale = (vec_double2)spu_splats(0x4330000000000000ULL);
+  vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);
+  vec_uchar16 splat_hi = (vec_uchar16){0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11};
+  vec_uchar16 swap = (vec_uchar16){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
+
+  value_abs = spu_andc(value_in, sign);
+  exp = spu_and(value_in, (vec_double2)expmask);
+
+  /* Determine if the input is a special value. These include:
+   *  denorm   - then we must coerce it to a normal value.
+   *  zero     - then we must return an infinity
+   *  infinity - then we must return a zero.
+   */
+  isdenorm0 = spu_cmpeq(spu_shuffle((vec_uint4)exp, (vec_uint4)exp, splat_hi), 0);
+
+  isinf  = spu_cmpeq((vec_uint4)value_abs, (vec_uint4)expmask);
+  iszero = spu_cmpeq((vec_uint4)value_abs, 0);
+  isinf  = spu_and(isinf, spu_shuffle(isinf, isinf, swap));
+  iszero = spu_and(iszero, spu_shuffle(iszero, iszero, swap));
+  is0inf = (vec_ullong2)spu_or(isinf, iszero);
+
+  /* If the input is a denorm, we must first convert it to a normal number, since
+   * arithmetic operations on denorms produce 0 on Cell/B.E.
+   */
+  value_d = spu_sub(spu_or(value_abs, exp_53), exp_53);
+  value_d = spu_sel(value_abs, value_d, (vec_ullong2)isdenorm0);
+
+  /* Scale the divisor to correct for double precision floating
+   * point exponents that are out of single precision range.
+   */
+  scale = spu_xor(spu_and(value_d, (vec_double2)expmask), (vec_double2)expmask);
+  value_d = spu_mul(value_d, scale);
+  value = spu_roundtf(value_d);
+
+  /* Perform reciprocal with 1 single precision and 2 double precision
+   * Newton-Raphson iterations. The scale factor, carrying the sign of the
+   * input, is re-applied after the final iteration.
+   */
+  x0 = spu_re(value);
+  x1 = spu_extend(spu_madd(spu_nmsub(value, x0, one), x0, x0));
+  x2 = spu_madd(spu_nmsub(value_d, x1, one_d), x1, x1);
+  x3 = spu_madd(spu_nmsub(value_d, x2, one_d), x2, x2);
+  x3 = spu_mul(x3, spu_sel(scale, value_in, (vec_ullong2)sign));
+  x3 = spu_sel(x3, spu_mul(x3, denorm_scale), (vec_ullong2)isdenorm0);
+  x3 = spu_sel(x3, spu_xor(value_in, (vector double)expmask), is0inf);
+
+#endif /* __SPU_EDP__ */
+
+  return (x3);
+}
+
+#endif /* _RECIPD2_H_ */
+#endif /* __SPU__ */
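
Note: the new _recipd2 header comment above describes two Newton-Raphson refinement forms for a reciprocal. The stand-alone sketch below is plain host C, not the SPU code; the function names recip_step_a/recip_step_b and the test value are illustrative only. It shows the algebra of both forms and how form (b) builds the small residual (1.0 - b*y) with a fused multiply-add, mirroring the spu_nmsub/spu_madd pairing in the new code, before adding the correction to y.

#include <math.h>
#include <stdio.h>

/* Two Newton-Raphson refinement steps for an estimate y of 1/b:
 *   (a) y' = y * (2.0 - b*y)
 *   (b) y' = y + y*(1.0 - b*y)
 * Form (b) keeps the correction separate from y, which is gentler on the
 * final rounding than form (a).
 */
static double recip_step_a(double b, double y)
{
  return y * (2.0 - b * y);
}

static double recip_step_b(double b, double y)
{
  return fma(fma(-b, y, 1.0), y, y);   /* y + y*(1 - b*y), residual via fma */
}

int main(void)
{
  double b = 3.0;
  double ya = 0.3, yb = 0.3;           /* crude initial estimates of 1/3 */
  int i;

  for (i = 0; i < 4; i++) {            /* quadratic convergence: a few steps suffice */
    ya = recip_step_a(b, ya);
    yb = recip_step_b(b, yb);
  }
  printf("form (a): %.17g  form (b): %.17g  1/b: %.17g\n", ya, yb, 1.0 / b);
  return 0;
}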
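Both paths of the new code scale the operand before calling spu_re, because a double precision exponent can lie outside single precision range. The XOR of the exponent field with 0x7FF0000000000000 yields a power of two such that, for a normal input, value * scale lands in [2, 4), and 1/value == (1/(value*scale)) * scale. A hedged scalar illustration of that step (host C with memcpy bit-punning; the test value is arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double from_bits(uint64_t u) { double d; memcpy(&d, &u, sizeof d); return d; }
static uint64_t to_bits(double d)   { uint64_t u; memcpy(&u, &d, sizeof u); return u; }

int main(void)
{
  const uint64_t expmask = 0x7FF0000000000000ULL;
  double value  = 3.0e200;                               /* far outside float range */
  double scale  = from_bits((to_bits(value) & expmask) ^ expmask);
  double scaled = value * scale;                         /* now in [2, 4) */
  double recip  = (1.0 / scaled) * scale;                /* equals 1/value */

  printf("scaled = %g  recip = %g  1/value = %g\n", scaled, recip, 1.0 / value);
  return 0;
}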
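The non-EDP path additionally coerces denormal inputs into the normal range, since denorm arithmetic flushes to zero on the SPU: OR-ing the denormal's bits with 2^-970 (0x0350000000000000) and subtracting 2^-970 leaves the denormal scaled up by 2^52, and the final reciprocal is rescaled by 2^52 (0x4330000000000000). The following scalar illustration of that trick again uses host C rather than SPU intrinsics, and the chosen denormal input is arbitrary:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double from_bits(uint64_t u) { double d; memcpy(&d, &u, sizeof d); return d; }
static uint64_t to_bits(double d)   { uint64_t u; memcpy(&u, &d, sizeof u); return u; }

int main(void)
{
  double x            = from_bits(0x000C000000000000ULL); /* 0.75 * 2^-1022, a denormal */
  double exp_53       = from_bits(0x0350000000000000ULL); /* 2^-970 */
  double denorm_scale = from_bits(0x4330000000000000ULL); /* 2^52 */

  /* Coerce the denormal into the normal range: the result equals x * 2^52. */
  double normalized = from_bits(to_bits(x) | to_bits(exp_53)) - exp_53;

  /* 1/(x * 2^52) * 2^52 == 1/x, computed entirely on normal numbers. */
  double recip = (1.0 / normalized) * denorm_scale;

  printf("x = %a\n1/x (direct) = %a\nrecip        = %a\n", x, 1.0 / x, recip);
  return 0;
}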