This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [patch] SPU libm


On Tuesday 25 November 2008 17:35, Ken Werner wrote:
> Hi,
>
> the SPU implementation of libm incorporates header files of the simdmath
> library. The attached patch rebases these headers against the current
> version.
>
> Ken
>
> newlib/ChangeLog:
>
> 2008-11-25  Ken Werner  <ken.werner@de.ibm.com>
>
>         * libm/machine/spu/headers/acosd2.h: Rebase against current
> simdmath. * libm/machine/spu/headers/asind2.h: Likewise.
>         * libm/machine/spu/headers/asinhf4.h: Likewise.
>         * libm/machine/spu/headers/divd2.h: Likewise.
>         * libm/machine/spu/headers/erf_utils.h: Likewise.
>         * libm/machine/spu/headers/erfcd2.h: Likewise.
>         * libm/machine/spu/headers/erfcf4.h: Likewise.
>         * libm/machine/spu/headers/erfd2.h: Likewise.
>         * libm/machine/spu/headers/lgammaf4.h: Likewise.
>         * libm/machine/spu/headers/recipd2.h: Likewise.

The posted patch reintroduces the compile problem if lgammaf4.h gets compiled 
with GCC 4.3 or higher. This was already fixed 
(http://sourceware.org/ml/newlib/2008/msg00458.html). Here is a revised 
version of the patch.

Regards
Ken

newlib/ChangeLog:

2008-11-26 ÂKen Werner Â<ken.werner@de.ibm.com>

    * libm/machine/spu/headers/acosd2.h: Rebase against current simdmath.
    * libm/machine/spu/headers/asind2.h: Likewise.
    * libm/machine/spu/headers/asinhf4.h: Likewise.
    * libm/machine/spu/headers/divd2.h: Likewise.
    * libm/machine/spu/headers/erf_utils.h: Likewise.
    * libm/machine/spu/headers/erfcd2.h: Likewise.
    * libm/machine/spu/headers/erfcf4.h: Likewise.
    * libm/machine/spu/headers/erfd2.h: Likewise.
    * libm/machine/spu/headers/recipd2.h: Likewise.
Index: src/newlib/libm/machine/spu/headers/acosd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/acosd2.h
+++ src/newlib/libm/machine/spu/headers/acosd2.h
@@ -40,9 +40,8 @@
 #ifndef _ACOSD2_H_
 #define _ACOSD2_H_	1
 
-#include <spu_intrinsics.h>
-
 #include "simdmath.h"
+#include <spu_intrinsics.h>
 #include "sqrtd2.h"
 #include "divd2.h"
 
Index: src/newlib/libm/machine/spu/headers/asind2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/asind2.h
+++ src/newlib/libm/machine/spu/headers/asind2.h
@@ -41,9 +41,8 @@
 #ifndef _ASIND2_H_
 #define _ASIND2_H_	1
 
-#include <spu_intrinsics.h>
-
 #include "simdmath.h"
+#include <spu_intrinsics.h>
 #include "sqrtd2.h"
 #include "divd2.h"
 
Index: src/newlib/libm/machine/spu/headers/asinhf4.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/asinhf4.h
+++ src/newlib/libm/machine/spu/headers/asinhf4.h
@@ -96,8 +96,8 @@
 #define ASINH_MAC25     6.4472103118896484375000000000000000000000000000000000000000000000000000E-3
 #define ASINH_MAC27     -5.7400376708419234664351851851851851851851851851851851851851851851851852E-3
 #define ASINH_MAC29     5.1533096823199041958512931034482758620689655172413793103448275862068966E-3
-#if 0
 #define ASINH_MAC31     -4.6601434869150961599042338709677419354838709677419354838709677419354839E-3
+#if 0
 #define ASINH_MAC33     4.2409070936793630773370916193181818181818181818181818181818181818181818E-3
 #define ASINH_MAC35     -3.8809645588376692363194056919642857142857142857142857142857142857142857E-3
 #define ASINH_MAC37     3.5692053938259345454138678473395270270270270270270270270270270270270270E-3
@@ -107,34 +107,43 @@
 #endif
 
 
-
 static __inline vector float _asinhf4(vector float x)
 {
     vec_float4 sign_mask = spu_splats(-0.0f);
     vec_float4 onef      = spu_splats(1.0f);
-    vec_float4 result, fresult, mresult;;
+    vec_uint4  oneu      = spu_splats(1u);
+    vec_uint4  twou      = spu_splats(2u);
+    vec_uint4  threeu    = spu_splats(3u);
+    vec_float4 ln2       = spu_splats(6.931471805599453094172321E-1f);
+    vec_float4 largef    = spu_splats(9.21e18f);
+    vec_float4 result, fresult, mresult;
     vec_float4 xabs, xsqu;
     /* Where we switch from maclaurin to formula */
-    vec_float4  switch_approx = spu_splats(0.685f);
+    vec_float4  switch_approx = spu_splats(0.74f);
+    vec_float4  trunc_part2   = spu_splats(20.0f);
+    vec_uint4   truncadd;
+    vec_uint4   islarge;
     vec_uint4   use_form;
 
-   
     xabs = spu_andc(x, sign_mask);
     xsqu = spu_mul(x, x);
+    islarge = spu_cmpgt(xabs, largef);
 
     /*
      * Formula:
      *   asinh = ln(|x| + sqrt(x^2 + 1))
      */
-    fresult = _sqrtf4(spu_madd(x, x, onef));
-    fresult = spu_add(xabs, fresult);
-    fresult = _logf4(fresult);
 
+    vec_float4 logarg = spu_add(xabs, _sqrtf4(spu_madd(xabs, xabs, onef)));
+    logarg = spu_sel(logarg, xabs, islarge);
+    fresult = _logf4(logarg);
+    fresult = spu_sel(fresult, spu_add(fresult, ln2), islarge);
 
     /*
      * Maclaurin Series
      */
-    mresult = spu_madd(xsqu, spu_splats((float)ASINH_MAC29), spu_splats((float)ASINH_MAC27));
+    mresult = spu_madd(xsqu, spu_splats((float)ASINH_MAC31), spu_splats((float)ASINH_MAC29));
+    mresult = spu_madd(xsqu, mresult, spu_splats((float)ASINH_MAC27));
     mresult = spu_madd(xsqu, mresult, spu_splats((float)ASINH_MAC25));
     mresult = spu_madd(xsqu, mresult, spu_splats((float)ASINH_MAC23));
     mresult = spu_madd(xsqu, mresult, spu_splats((float)ASINH_MAC21));
@@ -150,13 +159,18 @@ static __inline vector float _asinhf4(ve
     mresult = spu_madd(xsqu, mresult, spu_splats((float)ASINH_MAC01));
     mresult = spu_mul(xabs, mresult);
 
-
     /*
      * Choose between series and formula
      */
-    use_form = spu_cmpgt(xabs, switch_approx);
+    use_form  = spu_cmpgt(xabs, switch_approx);
     result = spu_sel(mresult, fresult, use_form);
 
+    /*
+     * Truncation correction on spu
+     */
+    truncadd = spu_sel(oneu, threeu, use_form);
+    truncadd = spu_sel(truncadd, twou, spu_cmpgt(xabs, trunc_part2));
+    result = (vec_float4)spu_add((vec_uint4)result, truncadd);
 
     /* Preserve sign - asinh is anti-symmetric */
     result = spu_sel(result, x, (vec_uint4)sign_mask);
Index: src/newlib/libm/machine/spu/headers/divd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/divd2.h
+++ src/newlib/libm/machine/spu/headers/divd2.h
@@ -1,232 +1,237 @@
-/* --------------------------------------------------------------  */
-/* (C)Copyright 2001,2008,                                         */
-/* International Business Machines Corporation,                    */
-/* Sony Computer Entertainment, Incorporated,                      */
-/* Toshiba Corporation,                                            */
-/*                                                                 */
-/* All Rights Reserved.                                            */
-/*                                                                 */
-/* Redistribution and use in source and binary forms, with or      */
-/* without modification, are permitted provided that the           */
-/* following conditions are met:                                   */
-/*                                                                 */
-/* - Redistributions of source code must retain the above copyright*/
-/*   notice, this list of conditions and the following disclaimer. */
-/*                                                                 */
-/* - Redistributions in binary form must reproduce the above       */
-/*   copyright notice, this list of conditions and the following   */
-/*   disclaimer in the documentation and/or other materials        */
-/*   provided with the distribution.                               */
-/*                                                                 */
-/* - Neither the name of IBM Corporation nor the names of its      */
-/*   contributors may be used to endorse or promote products       */
-/*   derived from this software without specific prior written     */
-/*   permission.                                                   */
-/*                                                                 */
-/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          */
-/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     */
-/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        */
-/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        */
-/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            */
-/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    */
-/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT    */
-/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    */
-/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)        */
-/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN       */
-/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR    */
-/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  */
-/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              */
-/* --------------------------------------------------------------  */
-/* PROLOG END TAG zYx                                              */
-#ifdef __SPU__
-
-#ifndef _DIVD2_H_
-#define _DIVD2_H_		 1
-
-#include <spu_intrinsics.h>
-
-/*
- * FUNCTION
- * 	vector double _divd2(vector double a, vector double b)
- * 
- * DESCRIPTION
- * 	_divd2 divides the vector dividend a by the vector divisor b and 
- *      returns the resulting vector quotient.  Maximum error 0.5 ULPS for 
- *      normalized results, 1ulp for denorm results, over entire double 
- *      range including denorms, compared to true result in round-to-nearest
- *      rounding mode.  Handles Inf or NaN operands and results correctly.
- */
-static __inline vector double _divd2(vector double a, vector double b)
-{
-
-
-  /* Variables
-   */
-  vec_float4 inv_bf, mant_bf;
-  vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;
-  vec_int4 exp, tmp;
-  vec_uint4 exp_a, exp_b, exp_q1, overflow, nounderflow, normal, utmp,
-    sign_a, sign_b, a_frac, b_frac, a_frac_0, b_frac_0, a_exp_0, b_exp_0, 
-    a_exp_ones, b_exp_ones, a_nan, b_nan, a_inf, b_inf, a_zero, b_zero, 
-    res_nan, sign_res;
-
-  /* Constants
-   */
-  vec_float4 onef = spu_splats(1.0f);
-  vec_double2 one = spu_splats(1.0);
-  vec_uint4 exp_mask = (vec_uint4) { 0x7FF00000, 0, 0x7FF00000, 0 };
-  vec_uint4 sign_mask = (vec_uint4) { 0x80000000, 0, 0x80000000, 0};
-  vec_uint4 sign_exp_mask = (vec_uint4) { 0xFFF00000, 0, 0xFFF00000,0};
-  vec_uint4 frac_mask =(vec_uint4) { 0x000FFFFF, 0xFFFFFFFF, 0x000FFFFF, 0xFFFFFFFF };
-  vec_uchar16 swap32 = (vec_uchar16) ((vec_uint4) { 0x04050607, 0x00010203, 0x0C0D0E0F, 0x08090A0B} );
-  vec_uint4 zero = (vec_uint4) { 0, 0, 0, 0 };
-  vec_int4 e1022 = (vec_int4) { 0x000003FE, 0, 0x000003FE, 0 };
-  vec_int4 emax  = (vec_int4) { 0x000007FE, 0, 0x000007FE, 0 };
-  vec_int4 e1    = (vec_int4) { 0x00000001, 0, 0x00000001, 0 };
-
-  vec_uint4 nan  = (vec_uint4) { 0x7FF80000, 0, 0x7FF80000, 0};
-
-  /* Extract exponents and underflow denorm arguments to signed zero.
-   */
-  exp_a = spu_and((vec_uint4)a, exp_mask);
-  exp_b = spu_and((vec_uint4)b, exp_mask);
-
-  sign_a = spu_and((vec_uint4)a, sign_mask);
-  sign_b = spu_and((vec_uint4)b, sign_mask);
-
-  a_exp_0 = spu_cmpeq (exp_a, 0);
-  utmp = spu_shuffle (a_exp_0, a_exp_0, swap32);
-  a_exp_0 = spu_and (a_exp_0, utmp);
-  b_exp_0 = spu_cmpeq (exp_b, 0);
-  utmp = spu_shuffle (b_exp_0, b_exp_0, swap32);
-  b_exp_0 = spu_and (b_exp_0, utmp);
-
-  a = spu_sel(a, (vec_double2)sign_a, (vec_ullong2)a_exp_0);
-  b = spu_sel(b, (vec_double2)sign_b, (vec_ullong2)b_exp_0);
-
-  /* Force the divisor and dividend into the range [1.0,2.0).
-     (Unless they're zero.)
-  */
-  mant_a = spu_sel(a, one, (vec_ullong2)sign_exp_mask);
-  mant_b = spu_sel(b, one, (vec_ullong2)sign_exp_mask);
-
-  /* Approximate the single reciprocal of b by using
-   * the single precision reciprocal estimate followed by one 
-   * single precision iteration of Newton-Raphson.
-   */
-  mant_bf = spu_roundtf(mant_b);
-  inv_bf = spu_re(mant_bf);
-  inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);
-
-  /* Perform 2 more Newton-Raphson iterations in double precision.
-   */
-  inv_b = spu_extend(inv_bf);
-  inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
-  q0 = spu_mul(mant_a, inv_b);
-  q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);
-
-  /* Compute the quotient's expected exponent. If the exponent
-   * is out of range, then force the resulting exponent to 0.
-   * (1023 with the bias). We correct for the out of range 
-   * values by computing a multiplier (mult) that will force the 
-   * result to the correct out of range value and set the 
-   * correct exception flag (UNF, OVF, or neither).
-   */
-  exp_q1 = spu_and((vec_uint4)q1, exp_mask);
-  exp = spu_sub((vec_int4)exp_a, (vec_int4)exp_b);
-  exp = spu_rlmaska(exp, -20);  // shift right to allow enough bits for working
-  tmp = spu_rlmaska((vec_int4)exp_q1, -20);
-  exp = spu_add(exp, tmp);  // biased exponent of result (right justified)
-
-  /* The default multiplier is 1.0. If an underflow is detected (the computed 
-   * exponent is less than or equal to a biased 0), force the multiplier to 0.0.
-   * If exp<=0 set mult = 2**(unbiased exp + 1022) and unbiased exp = -1022
-   * = biased 1, the smallest normalized exponent.  If exp<-51 set 
-   * mult = 2**(-1074) to ensure underflowing result.  Otherwise mult=1.
-   */
-  normal = spu_cmpgt(exp, 0);
-  nounderflow = spu_cmpgt(exp, -52);
-  tmp = spu_add(exp, e1022);
-  mult = (vec_double2)spu_sl(tmp, 20);
-  mult = spu_sel(mult, one, (vec_ullong2)normal);
-  mult = spu_sel((vec_double2)e1, mult, (vec_ullong2)nounderflow);
-  exp = spu_sel(e1, exp, normal);  // unbiased -1022 is biased 1
-
-  /* Force the multiplier to positive infinity (exp_mask) and the biased 
-   * exponent to 1022, if the computed biased exponent is > emax.
-   */
-  overflow = spu_cmpgt(exp, (vec_int4)emax);
-  exp = spu_sel(exp, (vec_int4)e1022, overflow);
-  mult = spu_sel(mult, (vec_double2)exp_mask, (vec_ullong2)overflow);
-
-  /* Determine if a, b are Inf, NaN, or zero.
-   * Since these are rare, it would improve speed if these could be detected
-   * quickly and a branch used to avoid slowing down the main path.  However
-   * most of the work seems to be in the detection.
-   */
-  a_exp_ones = spu_cmpeq (exp_a, exp_mask);
-  utmp = spu_shuffle (a_exp_ones, a_exp_ones, swap32);
-  a_exp_ones = spu_and (a_exp_ones, utmp);
-
-  a_frac = spu_and ((vec_uint4)a, frac_mask);
-  a_frac_0 = spu_cmpeq (a_frac, 0);
-  utmp = spu_shuffle (a_frac_0, a_frac_0, swap32);
-  a_frac_0 = spu_and (a_frac_0, utmp);
-
-  a_zero = spu_and (a_exp_0, a_frac_0);
-  a_inf = spu_and (a_exp_ones, a_frac_0);
-  a_nan = spu_andc (a_exp_ones, a_frac_0);
-
-  b_exp_ones = spu_cmpeq (exp_b, exp_mask);
-  utmp = spu_shuffle (b_exp_ones, b_exp_ones, swap32);
-  b_exp_ones = spu_and (b_exp_ones, utmp);
-
-  b_frac = spu_and ((vec_uint4)b, frac_mask);
-  b_frac_0 = spu_cmpeq (b_frac, 0);
-  utmp = spu_shuffle (b_frac_0, b_frac_0, swap32);
-  b_frac_0 = spu_and (b_frac_0, utmp);
-
-  b_zero = spu_and (b_exp_0, b_frac_0);
-  b_inf = spu_and (b_exp_ones, b_frac_0);
-  b_nan = spu_andc (b_exp_ones, b_frac_0);
-
-  /* Handle exception cases */
-
-  /* Result is 0 for 0/x, x!=0, or x/Inf, x!=Inf.
-   * Set mult=0 for 0/0 or Inf/Inf now, since it will be replaced 
-   * with NaN later.
-   */
-  utmp = spu_or (a_zero, b_inf);
-  mult = spu_sel(mult, (vec_double2)zero, (vec_ullong2)utmp);
-
-  /* Result is Inf for x/0, x!=0.  Set mult=Inf for 0/0 now, since it
-   * will be replaced with NaN later.
-   */
-  mult = spu_sel(mult, (vec_double2)exp_mask, (vec_ullong2)b_zero);
-
-  /* Result is NaN if either operand is, or Inf/Inf, or 0/0.
-   */
-  res_nan = spu_or (a_nan, b_nan);
-  utmp = spu_and (a_inf, b_inf);
-  res_nan = spu_or (res_nan, utmp);
-  utmp = spu_and (a_zero, b_zero);
-  res_nan = spu_or (res_nan, utmp);
-  mult = spu_sel(mult, (vec_double2)nan, (vec_ullong2)res_nan);
-  
-  /* Insert sign of result into mult.
-   */
-  sign_res = spu_xor (sign_a, sign_b);
-  mult = spu_or (mult, (vec_double2)sign_res);
-
-  /* Insert the sign and exponent into the result and perform the
-   * final multiplication.
-   */
-  exp = spu_sl(exp, 20);
-  q2 = spu_sel(q1, (vec_double2)exp, (vec_ullong2)exp_mask);
-  q2 = spu_mul(q2, mult);
-
-  return (q2);
-}
-
-#endif /* _DIVD2_H_ */
-#endif /* __SPU__ */
+/* --------------------------------------------------------------  */
+/* (C)Copyright 2001,2008,                                         */
+/* International Business Machines Corporation,                    */
+/* Sony Computer Entertainment, Incorporated,                      */
+/* Toshiba Corporation,                                            */
+/*                                                                 */
+/* All Rights Reserved.                                            */
+/*                                                                 */
+/* Redistribution and use in source and binary forms, with or      */
+/* without modification, are permitted provided that the           */
+/* following conditions are met:                                   */
+/*                                                                 */
+/* - Redistributions of source code must retain the above copyright*/
+/*   notice, this list of conditions and the following disclaimer. */
+/*                                                                 */
+/* - Redistributions in binary form must reproduce the above       */
+/*   copyright notice, this list of conditions and the following   */
+/*   disclaimer in the documentation and/or other materials        */
+/*   provided with the distribution.                               */
+/*                                                                 */
+/* - Neither the name of IBM Corporation nor the names of its      */
+/*   contributors may be used to endorse or promote products       */
+/*   derived from this software without specific prior written     */
+/*   permission.                                                   */
+/*                                                                 */
+/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          */
+/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        */
+/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            */
+/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    */
+/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT    */
+/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    */
+/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)        */
+/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN       */
+/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR    */
+/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  */
+/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              */
+/* --------------------------------------------------------------  */
+/* PROLOG END TAG zYx                                              */
+#ifdef __SPU__
+
+#ifndef _DIVD2_H_
+#define _DIVD2_H_		 1
+
+#include <spu_intrinsics.h>
+
+/*
+ * FUNCTION
+ * 	vector double _divd2(vector double a, vector double b)
+ * 
+ * DESCRIPTION
+ * 	_divd2 divides the vector dividend a by the vector divisor b and 
+ *      returns the resulting vector quotient.  Maximum error about 0.5 ulp 
+ *      over entire double range including denorms, compared to true result
+ *      in round-to-nearest rounding mode.  Handles Inf or NaN operands and 
+ *	results correctly.
+ */
+static __inline vector double _divd2(vector double a_in, vector double b_in)
+{
+  /* Variables */
+  vec_int4    exp, exp_bias;
+  vec_uint4   no_underflow, overflow;
+  vec_float4  mant_bf, inv_bf;
+  vec_ullong2 exp_a, exp_b;
+  vec_ullong2 a_nan, a_zero, a_inf, a_denorm;
+  vec_ullong2 b_nan, b_zero, b_inf, b_denorm;
+  vec_ullong2 nan;
+  vec_double2 a, b;
+  vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;
+
+  /* Constants */
+  vec_float4  onef = spu_splats(1.0f);
+  vec_ullong2 exp_mask = spu_splats(0x7FF0000000000000ULL);
+  vec_double2 one = spu_splats(1.0);
+
+#ifdef __SPU_EDP__  
+  vec_double2 denorm_scale = (vec_double2)spu_splats(0x4330000000000000ULL);
+
+  /* Identify all possible special values that must be accomodated including:
+   * +-0, +-infinity, +-denorm, and NaNs.
+   */
+  a_nan    = spu_testsv(a_in, (SPU_SV_NAN));
+  a_zero   = spu_testsv(a_in, (SPU_SV_NEG_ZERO     | SPU_SV_POS_ZERO));
+  a_inf    = spu_testsv(a_in, (SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY));
+  a_denorm = spu_testsv(a_in, (SPU_SV_NEG_DENORM   | SPU_SV_POS_DENORM));
+
+  b_nan    = spu_testsv(b_in, (SPU_SV_NAN));
+  b_zero   = spu_testsv(b_in, (SPU_SV_NEG_ZERO     | SPU_SV_POS_ZERO));
+  b_inf    = spu_testsv(b_in, (SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY));
+  b_denorm = spu_testsv(b_in, (SPU_SV_NEG_DENORM   | SPU_SV_POS_DENORM));
+
+  /* Scale denorm inputs to into normalized numbers by conditionally scaling the 
+   * input parameters.
+   */
+  a = spu_sel(a_in, spu_mul(a_in, denorm_scale), a_denorm);
+  b = spu_sel(b_in, spu_mul(b_in, denorm_scale), b_denorm);
+
+#else /* !__SPU_EDP__ */
+  vec_uint4   a_exp, b_exp;
+  vec_ullong2 a_mant_0, b_mant_0;
+  vec_ullong2 a_exp_1s, b_exp_1s;
+  vec_ullong2 sign_exp_mask;
+
+  vec_uint4   exp_mask_u32 = spu_splats((unsigned int)0x7FF00000);
+  vec_uchar16 splat_hi = (vec_uchar16){0,1,2,3, 0,1,2,3,  8, 9,10,11, 8,9,10,11};
+  vec_uchar16 swap_32 = (vec_uchar16){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
+  vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL);
+  vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);
+
+  sign_exp_mask = spu_or(sign_mask, exp_mask);
+
+  /* Extract the floating point components from each of the operands including
+   * exponent and mantissa.
+   */
+  a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32);
+  a_exp = spu_shuffle(a_exp, a_exp, splat_hi);
+  b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32);
+  b_exp = spu_shuffle(b_exp, b_exp, splat_hi);
+
+  a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0);
+  a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32));
+
+  b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0);
+  b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32));
+
+  a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32);
+  b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32);
+
+  /* Identify all possible special values that must be accomodated including:
+   * +-denorm, +-0, +-infinity, and NaNs.
+   */
+  a_denorm = (vec_ullong2)spu_cmpeq(a_exp, 0);		/* really is a_exp_0 */
+  a_nan    = spu_andc(a_exp_1s, a_mant_0);
+  a_zero   = spu_and (a_denorm, a_mant_0);
+  a_inf    = spu_and (a_exp_1s, a_mant_0);
+
+  b_denorm = (vec_ullong2)spu_cmpeq(b_exp, 0);		/* really is b_exp_0 */
+  b_nan    = spu_andc(b_exp_1s, b_mant_0);
+  b_zero   = spu_and (b_denorm, b_mant_0);
+  b_inf    = spu_and (b_exp_1s, b_mant_0);
+
+  /* Scale denorm inputs to into normalized numbers by conditionally scaling the 
+   * input parameters.
+   */
+  a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask));
+  a = spu_sel(a_in, a, a_denorm);
+
+  b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask));
+  b = spu_sel(b_in, b, b_denorm);
+
+#endif /* __SPU_EDP__ */
+
+  /* Extract the divisor and dividend exponent and force parameters into the signed 
+   * range [1.0,2.0) or [-1.0,2.0).
+   */
+  exp_a = spu_and((vec_ullong2)a, exp_mask);
+  exp_b = spu_and((vec_ullong2)b, exp_mask);
+
+  mant_a = spu_sel(a, one, (vec_ullong2)exp_mask);
+  mant_b = spu_sel(b, one, (vec_ullong2)exp_mask);
+  
+  /* Approximate the single reciprocal of b by using
+   * the single precision reciprocal estimate followed by one 
+   * single precision iteration of Newton-Raphson.
+   */
+  mant_bf = spu_roundtf(mant_b);
+  inv_bf = spu_re(mant_bf);
+  inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);
+
+  /* Perform 2 more Newton-Raphson iterations in double precision. The
+   * result (q1) is in the range (0.5, 2.0).
+   */
+  inv_b = spu_extend(inv_bf);
+  inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
+  q0 = spu_mul(mant_a, inv_b);
+  q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);
+
+
+  /* Determine the exponent correction factor that must be applied 
+   * to q1 by taking into account the exponent of the normalized inputs
+   * and the scale factors that were applied to normalize them.
+   */
+  exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20);
+  exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34), spu_and((vec_int4)b_denorm, 0x34)));
+  
+  /* Bias the quotient exponent depending on the sign of the exponent correction
+   * factor so that a single multiplier will ensure the entire double precision
+   * domain (including denorms) can be achieved.
+   *
+   *    exp 	       bias q1     adjust exp
+   *   =====	       ========    ==========
+   *   positive         2^+65         -65
+   *   negative         2^-64         +64
+   */
+  exp_bias = spu_xor(spu_rlmaska(exp, -31), 64);
+
+
+  exp = spu_sub(exp, exp_bias);
+
+  q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask);
+
+  /* Compute a multiplier (mult) to applied to the quotient (q1) to produce the 
+   * expected result. 
+   */
+  exp = spu_add(exp, 0x3FF);
+  no_underflow = spu_cmpgt(exp, 0);
+  overflow = spu_cmpgt(exp, 0x7FF);
+  exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow);
+  exp = spu_and(exp, (vec_int4)exp_mask);
+  mult = spu_sel((vec_double2)exp, (vec_double2)exp_mask, (vec_ullong2)overflow);
+
+  /* Handle special value conditions. These include:
+   *
+   * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN 
+   *    results.
+   * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN a INFINITY results.
+   * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results.
+   */
+  mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf));
+  mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero));
+
+  nan = spu_or(a_nan, b_nan);
+  nan = spu_or(nan, spu_and(a_zero, b_zero));
+  nan = spu_or(nan, spu_and(a_inf, b_inf));
+
+  mult = spu_or(mult, (vec_double2)nan);
+
+  /* Scale the final quotient */
+
+  q2 = spu_mul(q1, mult);
+
+  return (q2);
+}
+
+#endif /* _DIVD2_H_ */
+#endif /* __SPU__ */
Index: src/newlib/libm/machine/spu/headers/erf_utils.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/erf_utils.h
+++ src/newlib/libm/machine/spu/headers/erf_utils.h
@@ -166,57 +166,6 @@
   _tresult = spu_mul(_tresult, spu_splats(TWO_OVER_SQRT_PI));                          \
 }
 
-#define TAYLOR_ERFF4(_xabs, _xsqu, _tresult)  {                                          \
-  _tresult = spu_madd(_xsqu, spu_splats((float)TAYLOR_ERF_45), spu_splats((float)TAYLOR_ERF_44));    \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_43));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_42));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_41));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_40));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_39));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_38));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_37));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_36));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_35));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_34));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_33));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_32));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_31));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_30));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_29));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_28));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_27));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_26));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_25));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_24));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_23));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_22));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_21));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_20));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_19));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_18));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_17));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_16));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_15));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_14));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_13));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_12));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_11));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_10));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_09));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_08));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_07));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_06));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_05));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_04));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_03));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_02));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_01));                     \
-  _tresult = spu_madd(_tresult, _xsqu, spu_splats((float)TAYLOR_ERF_00));                     \
-  _tresult = spu_mul(_tresult, _xabs);                                                 \
-  _tresult = spu_mul(_tresult, spu_splats((float)TWO_OVER_SQRT_PI));                          \
-}
-
-
 
   /*
    * Continued Fractions Approximation of Erfc()
@@ -266,43 +215,7 @@
   vec_float4 inv_xsqu;                  \
   inv_xsqu = _recipf4(_xsqu);            \
   v = spu_mul(inv_xsqu, onehalff);       \
-  p = spu_splats(3.025f); q = onef; plast = p; qlast = q;                                        \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(40.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(39.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(38.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(37.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(36.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(35.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(34.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(33.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(32.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(31.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(30.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(29.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(28.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(27.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(26.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(25.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(24.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(23.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(22.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(21.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(20.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(19.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(18.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(17.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(16.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(15.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(14.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(13.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(12.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(11.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats(10.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats( 9.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats( 8.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats( 7.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats( 6.0f)), plast); q = plast; plast = p; qlast = q;    \
-  p = spu_madd(qlast, spu_mul(v, spu_splats( 5.0f)), plast); q = plast; plast = p; qlast = q;    \
+  p = spu_splats(1.945f); q = onef; plast = p; qlast = q;                                        \
   p = spu_madd(qlast, spu_mul(v, spu_splats( 4.0f)), plast); q = plast; plast = p; qlast = q;    \
   p = spu_madd(qlast, spu_mul(v, spu_splats( 3.0f)), plast); q = plast; plast = p; qlast = q;    \
   p = spu_madd(qlast, spu_mul(v, spu_splats( 2.0f)), plast); q = plast; plast = p; qlast = q;    \
Index: src/newlib/libm/machine/spu/headers/erfcd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/erfcd2.h
+++ src/newlib/libm/machine/spu/headers/erfcd2.h
@@ -80,15 +80,10 @@ static __inline vector double _erfcd2(ve
   vec_float4 approx_point = spu_splats(1.71f);
 
   vec_double2 xabs, xsqu, xsign;
-  vec_uint4 xhigh, xabshigh;
-  vec_uint4 isnan, isneg;
+  vec_uint4 isneg;
   vec_double2 tresult, presult, result;
 
   xsign = spu_and(x, sign_mask);
-
-  /* Force Denorms to 0 */
-  x = spu_add(x, zerod);
-
   xabs = spu_andc(x, sign_mask);
   xsqu = spu_mul(x, x);
 
@@ -118,18 +113,11 @@ static __inline vector double _erfcd2(ve
   result = spu_sel(tresult, presult, (vec_ullong2)spu_cmpgt(xf, approx_point));
 
   /*
-   * Special cases/errors.
+   * Special cases
    */
-  xhigh = (vec_uint4)spu_shuffle(x, x, dup_even);
-  xabshigh = (vec_uint4)spu_shuffle(xabs, xabs, dup_even);
-
-  /* x = +/- infinite */
-  result = spu_sel(result, zerod, (vec_ullong2)spu_cmpeq(xhigh, 0x7FF00000));
-  result = spu_sel(result, twod,  (vec_ullong2)spu_cmpeq(xhigh, 0xFFF00000));
-
-  /* x = nan, return x */
-  isnan = spu_cmpgt(xabshigh, 0x7FF00000);
-  result = spu_sel(result, x, (vec_ullong2)isnan);
+  result = spu_sel(result,  twod, spu_testsv(x, SPU_SV_NEG_INFINITY));
+  result = spu_sel(result, zerod, spu_testsv(x, SPU_SV_POS_INFINITY));
+  result = spu_sel(result,     x, spu_testsv(x, SPU_SV_NEG_DENORM | SPU_SV_POS_DENORM));
 
   return result;
 }
Index: src/newlib/libm/machine/spu/headers/erfcf4.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/erfcf4.h
+++ src/newlib/libm/machine/spu/headers/erfcf4.h
@@ -40,11 +40,11 @@
 #define _ERFCF4_H_	1
 
 #include <spu_intrinsics.h>
-
-#include "expf4.h"
+#include "erff4.h"
+#include "erf_utils.h"
 #include "recipf4.h"
+#include "expf4.h"
 #include "divf4.h"
-#include "erf_utils.h"
 
 /*
  * FUNCTION
@@ -63,56 +63,374 @@
 
 static __inline vector float _erfcf4(vector float x)
 {
-  vec_float4 onehalff  = spu_splats(0.5f);
-  vec_float4 zerof     = spu_splats(0.0f);
-  vec_float4 onef      = spu_splats(1.0f);
-  vec_float4 twof      = spu_splats(2.0f);
-  vec_float4 sign_mask = spu_splats(-0.0f);
+  vec_float4 sign_maskf = spu_splats(-0.0f);
+  vec_float4 zerof      = spu_splats(0.0f);
+  vec_float4 onehalff   = spu_splats(0.5f);
+  vec_float4 onef       = spu_splats(1.0f);
+  vec_float4 twof       = spu_splats(2.0f);
+  vec_float4 clamp      = spu_splats(10.0542f);   // Erfc = 0 above this (in single precision)
+  vec_float4 xabs       = spu_andc(x, sign_maskf);
+  vec_float4 result;
 
-  /* This is where we switch from near zero approx. */
-  vec_float4 approx_point = spu_splats(0.89f);
+  /*
+   * First thing we do is setup the description of each partition.
+   * This consists of:
+   * - Start x of partition
+   * - Offset (used for evaluating power series expanded around a point)
+   * - Truncation adjustment.
+   */
 
-  vec_float4 xabs, xsqu, xsign;
-  vec_uint4  isneg;
-  vec_float4 tresult, presult, result;
 
-  xsign = spu_and(x, sign_mask);
+  /***************************************************************
+   * REGION 0: Approximation Near 0 from Above
+   *
+   */
+#define SDM_ERFCF4_0_START     0.0f
+#define SDM_ERFCF4_0_OFF       0.0f
+#define SDM_ERFCF4_0_TRUNC     1u
+
+#define SDM_ERFCF4_0_00      0.9999999999999949135f
+#define SDM_ERFCF4_0_01      -1.1283791670931702608f
+#define SDM_ERFCF4_0_02      -1.8051894620430502228e-10f
+#define SDM_ERFCF4_0_03      0.37612639455729408814f
+#define SDM_ERFCF4_0_04      -8.8929793006257568262e-8f
+#define SDM_ERFCF4_0_05      -0.11283705324835578294f
+#define SDM_ERFCF4_0_06      -5.4670494993502827210e-6f
+#define SDM_ERFCF4_0_07      0.026889802515535093351f
+#define SDM_ERFCF4_0_08      -0.000071498114084857387620f
+#define SDM_ERFCF4_0_09      -0.0050714210985129775210f
+#define SDM_ERFCF4_0_10      -0.00022683372291701701701f
+#define SDM_ERFCF4_0_11      0.0010796064437231401311f
+#define SDM_ERFCF4_0_12      -0.00012982218714593684809f
+#define SDM_ERFCF4_0_13      -0.00010102962499433144847f
+#define SDM_ERFCF4_0_14       0.000025784829228223517886f
 
-  /* Force Denorms to 0 */
-  x = spu_add(x, zerof);
 
-  xabs = spu_andc(x, sign_mask);
-  xsqu = spu_mul(x, x);
+  /***************************************************************
+   * REGION 1: Near 1
+   */
+#define SDM_ERFCF4_1_START     0.88f
+#define SDM_ERFCF4_1_OFF       1.125f
+#define SDM_ERFCF4_1_TRUNC     1u
+
+#define SDM_ERFCF4_1_00     0.111611768298292224f
+#define SDM_ERFCF4_1_01    -0.318273958500769283f
+#define SDM_ERFCF4_1_02     0.358058203313365464f
+#define SDM_ERFCF4_1_03    -0.162452332984767661f
+#define SDM_ERFCF4_1_04    -0.0279732971338566734f
+#define SDM_ERFCF4_1_05     0.0613236836056658061f
+#define SDM_ERFCF4_1_06    -0.0155368354497628942f
+#define SDM_ERFCF4_1_07    -0.00960689422582997228f
+#define SDM_ERFCF4_1_08     0.00603126088310672760f
+#define SDM_ERFCF4_1_09     0.000360191989801368303f
+#define SDM_ERFCF4_1_10    -0.00115326735470205975f
+#define SDM_ERFCF4_1_11     0.000176955087857924673f
+#define SDM_ERFCF4_1_12     0.000141558399011799664f
+#define SDM_ERFCF4_1_13    -0.0000494556968345700811f
+#define SDM_ERFCF4_1_14    0.0f
 
-  /*
-   * Use Taylor Series for x near 0
-   * Preserve sign of x in result, since erf(-x) = -erf(x)
-   * This approximation is for erf, so adjust for erfc.
-   */
-  TAYLOR_ERFF4(xabs, xsqu, tresult);
-  tresult = spu_or(tresult, xsign);
-  tresult = spu_sub(onef, tresult);
 
-  /*
-   * Now, use the Continued Fractions approximation away
-   * from 0. If x < 0, use erfc(-x) = 2 - erfc(x)
+  /***************************************************************
+   * REGION 2:
    */
-  CONTFRAC_ERFCF4(xabs, xsqu, presult);
-  isneg = spu_rlmaska((vec_uint4)x, -32);
-  presult = spu_sel(presult, spu_sub(twof, presult), isneg);
+#define SDM_ERFCF4_2_START     1.50f
+#define SDM_ERFCF4_2_OFF       1.75f
+#define SDM_ERFCF4_2_TRUNC     0u
+
+#define SDM_ERFCF4_2_00     0.0133283287808175777f
+#define SDM_ERFCF4_2_01    -0.0527749959301503715f
+#define SDM_ERFCF4_2_02     0.0923562428777631589f
+#define SDM_ERFCF4_2_03    -0.0901572847140068856f
+#define SDM_ERFCF4_2_04     0.0481022098321682995f
+#define SDM_ERFCF4_2_05    -0.00662436146831574865f
+#define SDM_ERFCF4_2_06    -0.00896304509872736070f
+#define SDM_ERFCF4_2_07     0.00605875147039124009f
+#define SDM_ERFCF4_2_08    -0.000730051247140304322f
+#define SDM_ERFCF4_2_09    -0.000894181745354844871f
+#define SDM_ERFCF4_2_10     0.000442750499254694174f
+#define SDM_ERFCF4_2_11     5.44549038611738718e-6f
+#define SDM_ERFCF4_2_12    -0.0000686716770072681921f
+#define SDM_ERFCF4_2_13     0.0000177205746526325771f
+#define SDM_ERFCF4_2_14    0.0f
+
+
+  /***************************************************************
+   * REGION 3:
+   */
+#define SDM_ERFCF4_3_START     2.0f
+#define SDM_ERFCF4_3_OFF       2.25f
+#define SDM_ERFCF4_3_TRUNC     1u
+
+#define SDM_ERFCF4_3_00      0.00146271658668117865f
+#define SDM_ERFCF4_3_01     -0.00714231902201798319f
+#define SDM_ERFCF4_3_02      0.0160702177995404628f
+#define SDM_ERFCF4_3_03     -0.0217245536919713662f
+#define SDM_ERFCF4_3_04      0.0190833836369542972f
+#define SDM_ERFCF4_3_05     -0.0106576791656674587f
+#define SDM_ERFCF4_3_06      0.00290435707106278173f
+#define SDM_ERFCF4_3_07      0.000670455969951892490f
+#define SDM_ERFCF4_3_08     -0.000999493712611392590f
+#define SDM_ERFCF4_3_09      0.000369380417703939461f
+#define SDM_ERFCF4_3_10      0.0000114665831641414663f
+#define SDM_ERFCF4_3_11     -0.0000651349432823388933f
+#define SDM_ERFCF4_3_12      0.0000226882426454011034f
+#define SDM_ERFCF4_3_13      1.33207467538330703e-6f
+#define SDM_ERFCF4_3_14    0.0f
+
+
+  /***************************************************************
+   * REGION 4:
+   */
+#define SDM_ERFCF4_4_START     2.46f
+#define SDM_ERFCF4_4_OFF       2.75f
+#define SDM_ERFCF4_4_TRUNC     1u
+
+#define SDM_ERFCF4_4_00      0.000100621922119681351f
+#define SDM_ERFCF4_4_01     -0.000586277247093792324f
+#define SDM_ERFCF4_4_02      0.00161226242950792873f
+#define SDM_ERFCF4_4_03     -0.00276038870506660526f
+#define SDM_ERFCF4_4_04      0.00325811365963060576f
+#define SDM_ERFCF4_4_05     -0.00275580841407368484f
+#define SDM_ERFCF4_4_06      0.00165732740366604948f
+#define SDM_ERFCF4_4_07     -0.000646040956672447276f
+#define SDM_ERFCF4_4_08      0.0000890115712124397128f
+#define SDM_ERFCF4_4_09      0.0000712231147231515843f
+#define SDM_ERFCF4_4_10     -0.0000549969924243893176f
+#define SDM_ERFCF4_4_11      0.0000158438047120425837f
+#define SDM_ERFCF4_4_12      1.07113381370613701e-6f
+#define SDM_ERFCF4_4_13     0.0f
+#define SDM_ERFCF4_4_14     0.0f
+
+
+  /***************************************************************
+   * REGION 5:
+   */
+#define SDM_ERFCF4_5_START     2.95f
+#define SDM_ERFCF4_5_OFF       3.25f
+#define SDM_ERFCF4_5_TRUNC     1u
+
+#define SDM_ERFCF4_5_00      4.30277946372736864e-6f
+#define SDM_ERFCF4_5_01     -0.0000291890253835816989f
+#define SDM_ERFCF4_5_02      0.0000948643324966405230f
+#define SDM_ERFCF4_5_03     -0.000195809711948193862f
+#define SDM_ERFCF4_5_04      0.000286569337750268210f
+#define SDM_ERFCF4_5_05     -0.000313797225490890491f
+#define SDM_ERFCF4_5_06      0.000263528504215059911f
+#define SDM_ERFCF4_5_07     -0.000169991414511391200f
+#define SDM_ERFCF4_5_08      0.0000816476305301353867f
+#define SDM_ERFCF4_5_09     -0.0000259138470056606003f
+#define SDM_ERFCF4_5_10      2.32886623721087698e-6f
+#define SDM_ERFCF4_5_11      2.86429946075621661e-6f
+#define SDM_ERFCF4_5_12      0.0f
+#define SDM_ERFCF4_5_13      0.0f
+#define SDM_ERFCF4_5_14      0.0f
+
+
+  /***************************************************************
+   * REGION 6:
+   */
+#define SDM_ERFCF4_6_START     3.45f
+#define SDM_ERFCF4_6_OFF       3.625f
+#define SDM_ERFCF4_6_TRUNC     1u
+
+#define SDM_ERFCF4_6_00     2.95140192507759025e-7f
+#define SDM_ERFCF4_6_01    -2.21592028463311237e-6f
+#define SDM_ERFCF4_6_02     8.03271103179503198e-6f
+#define SDM_ERFCF4_6_03    -0.0000186737448986269582f
+#define SDM_ERFCF4_6_04     0.0000311685922848296785f
+#define SDM_ERFCF4_6_05    -0.0000395923353434149457f
+#define SDM_ERFCF4_6_06     0.0000395291139306718091f
+#define SDM_ERFCF4_6_07    -0.0000315141214892874786f
+#define SDM_ERFCF4_6_08     0.0000200891481859513911f
+#define SDM_ERFCF4_6_09    -0.0000100551790824327187f
+#define SDM_ERFCF4_6_10     3.71860071281680690e-6f
+#define SDM_ERFCF4_6_11    -8.05502983594814356e-7f
+#define SDM_ERFCF4_6_12    -7.67662978382552699e-8f
+#define SDM_ERFCF4_6_13     1.56408548403936681e-7f
+#define SDM_ERFCF4_6_14     0.0f
+#define SDM_ERFCF4_6_15     0.0f
+#define SDM_ERFCF4_6_16     0.0f
+#define SDM_ERFCF4_6_17     0.0f
+
+
+  /***************************************************************
+   * REGION 7:
+   */
+#define SDM_ERFCF4_7_START     3.55f
+#define SDM_ERFCF4_7_OFF       4.0f
+#define SDM_ERFCF4_7_TRUNC     2u
+
+#define SDM_ERFCF4_7_00     1.54172579002800189e-8f
+#define SDM_ERFCF4_7_01    -1.2698234671866558e-7f
+#define SDM_ERFCF4_7_02     5.0792938687466233e-7f
+#define SDM_ERFCF4_7_03    -1.3121509160928777e-6f
+#define SDM_ERFCF4_7_04     2.4549920365608679e-6f
+#define SDM_ERFCF4_7_05    -3.5343419836695254e-6f
+#define SDM_ERFCF4_7_06     4.0577914351431357e-6f
+#define SDM_ERFCF4_7_07    -3.7959659297660776e-6f
+#define SDM_ERFCF4_7_08     2.9264391936639771e-6f
+#define SDM_ERFCF4_7_09    -1.8631747969134646e-6f
+#define SDM_ERFCF4_7_10     9.702839808793979e-7f
+#define SDM_ERFCF4_7_11    -4.0077792841735885e-7f
+#define SDM_ERFCF4_7_12     1.2017256123590621e-7f
+#define SDM_ERFCF4_7_13    -1.7432381111955779e-8f
+#define SDM_ERFCF4_7_14     0.0f
+
+
+  /***************************************************************
+   * Now we load the description of each partition.
+   */
+
+  /* Start point for each partition */
+  vec_float4 r1start = spu_splats(SDM_ERFCF4_1_START);
+  vec_float4 r2start = spu_splats(SDM_ERFCF4_2_START);
+  vec_float4 r3start = spu_splats(SDM_ERFCF4_3_START);
+  vec_float4 r4start = spu_splats(SDM_ERFCF4_4_START);
+  vec_float4 r5start = spu_splats(SDM_ERFCF4_5_START);
+  vec_float4 r6start = spu_splats(SDM_ERFCF4_6_START);
+  vec_float4 r7start = spu_splats(SDM_ERFCF4_7_START);
+
+  /* X Offset for each partition */
+  vec_float4 xoffseta = (vec_float4) {SDM_ERFCF4_0_OFF, SDM_ERFCF4_1_OFF, SDM_ERFCF4_2_OFF, SDM_ERFCF4_3_OFF};
+  vec_float4 xoffsetb = (vec_float4) {SDM_ERFCF4_4_OFF, SDM_ERFCF4_5_OFF, SDM_ERFCF4_6_OFF, SDM_ERFCF4_7_OFF};
+
+  /* Truncation Correction for each partition */
+  vec_uint4 tcorra = (vec_uint4) {SDM_ERFCF4_0_TRUNC, SDM_ERFCF4_1_TRUNC, SDM_ERFCF4_2_TRUNC, SDM_ERFCF4_3_TRUNC};
+  vec_uint4 tcorrb = (vec_uint4) {SDM_ERFCF4_4_TRUNC, SDM_ERFCF4_5_TRUNC, SDM_ERFCF4_6_TRUNC, SDM_ERFCF4_7_TRUNC};
+
+  /* The coefficients for each partition */
+  vec_float4 c00a = (vec_float4) {SDM_ERFCF4_0_00, SDM_ERFCF4_1_00, SDM_ERFCF4_2_00, SDM_ERFCF4_3_00};
+  vec_float4 c01a = (vec_float4) {SDM_ERFCF4_0_01, SDM_ERFCF4_1_01, SDM_ERFCF4_2_01, SDM_ERFCF4_3_01};
+  vec_float4 c02a = (vec_float4) {SDM_ERFCF4_0_02, SDM_ERFCF4_1_02, SDM_ERFCF4_2_02, SDM_ERFCF4_3_02};
+  vec_float4 c03a = (vec_float4) {SDM_ERFCF4_0_03, SDM_ERFCF4_1_03, SDM_ERFCF4_2_03, SDM_ERFCF4_3_03};
+  vec_float4 c04a = (vec_float4) {SDM_ERFCF4_0_04, SDM_ERFCF4_1_04, SDM_ERFCF4_2_04, SDM_ERFCF4_3_04};
+  vec_float4 c05a = (vec_float4) {SDM_ERFCF4_0_05, SDM_ERFCF4_1_05, SDM_ERFCF4_2_05, SDM_ERFCF4_3_05};
+  vec_float4 c06a = (vec_float4) {SDM_ERFCF4_0_06, SDM_ERFCF4_1_06, SDM_ERFCF4_2_06, SDM_ERFCF4_3_06};
+  vec_float4 c07a = (vec_float4) {SDM_ERFCF4_0_07, SDM_ERFCF4_1_07, SDM_ERFCF4_2_07, SDM_ERFCF4_3_07};
+  vec_float4 c08a = (vec_float4) {SDM_ERFCF4_0_08, SDM_ERFCF4_1_08, SDM_ERFCF4_2_08, SDM_ERFCF4_3_08};
+  vec_float4 c09a = (vec_float4) {SDM_ERFCF4_0_09, SDM_ERFCF4_1_09, SDM_ERFCF4_2_09, SDM_ERFCF4_3_09};
+  vec_float4 c10a = (vec_float4) {SDM_ERFCF4_0_10, SDM_ERFCF4_1_10, SDM_ERFCF4_2_10, SDM_ERFCF4_3_10};
+  vec_float4 c11a = (vec_float4) {SDM_ERFCF4_0_11, SDM_ERFCF4_1_11, SDM_ERFCF4_2_11, SDM_ERFCF4_3_11};
+  vec_float4 c12a = (vec_float4) {SDM_ERFCF4_0_12, SDM_ERFCF4_1_12, SDM_ERFCF4_2_12, SDM_ERFCF4_3_12};
+  vec_float4 c13a = (vec_float4) {SDM_ERFCF4_0_13, SDM_ERFCF4_1_13, SDM_ERFCF4_2_13, SDM_ERFCF4_3_13};
+  vec_float4 c14a = (vec_float4) {SDM_ERFCF4_0_14, SDM_ERFCF4_1_14, SDM_ERFCF4_2_14, SDM_ERFCF4_3_14};
+
+  vec_float4 c00b = (vec_float4) {SDM_ERFCF4_4_00, SDM_ERFCF4_5_00, SDM_ERFCF4_6_00, SDM_ERFCF4_7_00};
+  vec_float4 c01b = (vec_float4) {SDM_ERFCF4_4_01, SDM_ERFCF4_5_01, SDM_ERFCF4_6_01, SDM_ERFCF4_7_01};
+  vec_float4 c02b = (vec_float4) {SDM_ERFCF4_4_02, SDM_ERFCF4_5_02, SDM_ERFCF4_6_02, SDM_ERFCF4_7_02};
+  vec_float4 c03b = (vec_float4) {SDM_ERFCF4_4_03, SDM_ERFCF4_5_03, SDM_ERFCF4_6_03, SDM_ERFCF4_7_03};
+  vec_float4 c04b = (vec_float4) {SDM_ERFCF4_4_04, SDM_ERFCF4_5_04, SDM_ERFCF4_6_04, SDM_ERFCF4_7_04};
+  vec_float4 c05b = (vec_float4) {SDM_ERFCF4_4_05, SDM_ERFCF4_5_05, SDM_ERFCF4_6_05, SDM_ERFCF4_7_05};
+  vec_float4 c06b = (vec_float4) {SDM_ERFCF4_4_06, SDM_ERFCF4_5_06, SDM_ERFCF4_6_06, SDM_ERFCF4_7_06};
+  vec_float4 c07b = (vec_float4) {SDM_ERFCF4_4_07, SDM_ERFCF4_5_07, SDM_ERFCF4_6_07, SDM_ERFCF4_7_07};
+  vec_float4 c08b = (vec_float4) {SDM_ERFCF4_4_08, SDM_ERFCF4_5_08, SDM_ERFCF4_6_08, SDM_ERFCF4_7_08};
+  vec_float4 c09b = (vec_float4) {SDM_ERFCF4_4_09, SDM_ERFCF4_5_09, SDM_ERFCF4_6_09, SDM_ERFCF4_7_09};
+  vec_float4 c10b = (vec_float4) {SDM_ERFCF4_4_10, SDM_ERFCF4_5_10, SDM_ERFCF4_6_10, SDM_ERFCF4_7_10};
+  vec_float4 c11b = (vec_float4) {SDM_ERFCF4_4_11, SDM_ERFCF4_5_11, SDM_ERFCF4_6_11, SDM_ERFCF4_7_11};
+  vec_float4 c12b = (vec_float4) {SDM_ERFCF4_4_12, SDM_ERFCF4_5_12, SDM_ERFCF4_6_12, SDM_ERFCF4_7_12};
+  vec_float4 c13b = (vec_float4) {SDM_ERFCF4_4_13, SDM_ERFCF4_5_13, SDM_ERFCF4_6_13, SDM_ERFCF4_7_13};
+  vec_float4 c14b = (vec_float4) {SDM_ERFCF4_4_14, SDM_ERFCF4_5_14, SDM_ERFCF4_6_14, SDM_ERFCF4_7_14};
+
+  vec_uchar16 shuffle0 = (vec_uchar16) spu_splats(0x00010203);
+  vec_uchar16 shuffle1 = (vec_uchar16) spu_splats(0x04050607);
+  vec_uchar16 shuffle2 = (vec_uchar16) spu_splats(0x08090A0B);
+  vec_uchar16 shuffle3 = (vec_uchar16) spu_splats(0x0C0D0E0F);
+  vec_uchar16 shuffle4 = (vec_uchar16) spu_splats(0x10111213);
+  vec_uchar16 shuffle5 = (vec_uchar16) spu_splats(0x14151617);
+  vec_uchar16 shuffle6 = (vec_uchar16) spu_splats(0x18191A1B);
+  vec_uchar16 shuffle7 = (vec_uchar16) spu_splats(0x1C1D1E1F);
+
 
   /*
-   * Select the appropriate approximation.
+   * Determine the shuffle pattern based on which partition
+   * each element of x is in.
    */
-  result = spu_sel(tresult, presult, spu_cmpgt(xabs, approx_point));
+  vec_uchar16 gt_r1start = (vec_uchar16)spu_cmpabsgt(x, r1start);
+  vec_uchar16 gt_r2start = (vec_uchar16)spu_cmpabsgt(x, r2start);
+  vec_uchar16 gt_r3start = (vec_uchar16)spu_cmpabsgt(x, r3start);
+  vec_uchar16 gt_r4start = (vec_uchar16)spu_cmpabsgt(x, r4start);
+  vec_uchar16 gt_r5start = (vec_uchar16)spu_cmpabsgt(x, r5start);
+  vec_uchar16 gt_r6start = (vec_uchar16)spu_cmpabsgt(x, r6start);
+  vec_uchar16 gt_r7start = (vec_uchar16)spu_cmpabsgt(x, r7start);
+
+  vec_uchar16 shufflepattern;
+  shufflepattern = spu_sel(shuffle0, shuffle1, gt_r1start);
+  shufflepattern = spu_sel(shufflepattern, shuffle2, gt_r2start);
+  shufflepattern = spu_sel(shufflepattern, shuffle3, gt_r3start);
+  shufflepattern = spu_sel(shufflepattern, shuffle4, gt_r4start);
+  shufflepattern = spu_sel(shufflepattern, shuffle5, gt_r5start);
+  shufflepattern = spu_sel(shufflepattern, shuffle6, gt_r6start);
+  shufflepattern = spu_sel(shufflepattern, shuffle7, gt_r7start);
+
+
+  /* Use the shuffle pattern to select the coefficients */
+  vec_float4 coeff_14 = spu_shuffle(c14a, c14b, shufflepattern);
+  vec_float4 coeff_13 = spu_shuffle(c13a, c13b, shufflepattern);
+  vec_float4 coeff_12 = spu_shuffle(c12a, c12b, shufflepattern);
+  vec_float4 coeff_11 = spu_shuffle(c11a, c11b, shufflepattern);
+  vec_float4 coeff_10 = spu_shuffle(c10a, c10b, shufflepattern);
+  vec_float4 coeff_09 = spu_shuffle(c09a, c09b, shufflepattern);
+  vec_float4 coeff_08 = spu_shuffle(c08a, c08b, shufflepattern);
+  vec_float4 coeff_07 = spu_shuffle(c07a, c07b, shufflepattern);
+  vec_float4 coeff_06 = spu_shuffle(c06a, c06b, shufflepattern);
+  vec_float4 coeff_05 = spu_shuffle(c05a, c05b, shufflepattern);
+  vec_float4 coeff_04 = spu_shuffle(c04a, c04b, shufflepattern);
+  vec_float4 coeff_03 = spu_shuffle(c03a, c03b, shufflepattern);
+  vec_float4 coeff_02 = spu_shuffle(c02a, c02b, shufflepattern);
+  vec_float4 coeff_01 = spu_shuffle(c01a, c01b, shufflepattern);
+  vec_float4 coeff_00 = spu_shuffle(c00a, c00b, shufflepattern);
+
+  vec_float4 xoffset     = spu_shuffle(xoffseta, xoffsetb, shufflepattern);
+  vec_uint4  tcorrection = spu_shuffle(tcorra,   tcorrb,   shufflepattern);
+
 
   /*
-   * Special cases/errors.
+   * We've completed the coeff. setup. Now we actually do the
+   * approximation below.
    */
 
-  /* x = +/- infinite */
-  result = spu_sel(result, zerof, spu_cmpeq((vec_uint4)xabs, 0x7F800000));
-  result = spu_sel(result, twof,  spu_cmpeq((vec_uint4)xabs, 0xFF800000));
+  /* Adjust x value here (for approximations about a point) */
+  vec_float4 xappr = spu_sub(xabs, xoffset);
+
+
+  /* Now we do the multiplies.
+   * Use Horner's method.
+   */
+  result = coeff_14;
+  result = spu_madd(xappr, result, coeff_13);
+  result = spu_madd(xappr, result, coeff_12);
+  result = spu_madd(xappr, result, coeff_11);
+  result = spu_madd(xappr, result, coeff_10);
+  result = spu_madd(xappr, result, coeff_09);
+  result = spu_madd(xappr, result, coeff_08);
+  result = spu_madd(xappr, result, coeff_07);
+  result = spu_madd(xappr, result, coeff_06);
+  result = spu_madd(xappr, result, coeff_05);
+  result = spu_madd(xappr, result, coeff_04);
+  result = spu_madd(xappr, result, coeff_03);
+  result = spu_madd(xappr, result, coeff_02);
+  result = spu_madd(xappr, result, coeff_01);
+  result = spu_madd(xappr, result, coeff_00);
+
+  /* Adjust due to systematic truncation. */
+  result = (vec_float4)spu_add((vec_uint4)result, tcorrection);
+
+  /* Use the continued fraction approximation for x above approx. 4
+   * and below approx. 10
+   */
+  vec_float4 presult, xsqu;
+  xsqu = spu_mul(x, x);
+  CONTFRAC_ERFCF4(xabs, xsqu, presult);
+
+  /* Select between polynomial and continued fraction */
+  result = spu_sel(presult, result, spu_cmpgt(spu_splats(4.3f), xabs));
+
+  /* Above clamp value, set erfc = 0 */
+  result = spu_sel(result, zerof, spu_cmpgt(xabs, clamp));
+
+  /* Negative x values */
+  vec_uint4 gt0 = spu_cmpgt(x, zerof);
+  result = spu_sel(spu_sub(twof, result), result, gt0);
 
   return result;
 }
Index: src/newlib/libm/machine/spu/headers/erfd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/erfd2.h
+++ src/newlib/libm/machine/spu/headers/erfd2.h
@@ -68,7 +68,6 @@ static __inline vector double _erfd2(vec
 {
   vec_uchar16 dup_even  = ((vec_uchar16) { 0,1,2,3, 0,1,2,3,  8, 9,10,11,  8, 9,10,11 });
   vec_double2 onehalfd  = spu_splats(0.5);
-  vec_double2 zerod     = spu_splats(0.0);
   vec_double2 oned      = spu_splats(1.0);
   vec_double2 sign_mask = spu_splats(-0.0);
 
@@ -76,15 +75,9 @@ static __inline vector double _erfd2(vec
   vec_float4 approx_point = spu_splats(1.77f);
 
   vec_double2 xabs, xsqu, xsign;
-  vec_uint4 xabshigh;
-  vec_uint4 isinf, isnan;
   vec_double2 tresult, presult, result;
 
   xsign = spu_and(x, sign_mask);
-
-  /* Force Denorms to 0 */
-  x = spu_add(x, zerod);
-
   xabs = spu_andc(x, sign_mask);
   xsqu = spu_mul(x, x);
 
@@ -112,15 +105,11 @@ static __inline vector double _erfd2(vec
   /*
    * Special cases/errors.
    */
-  xabshigh = (vec_uint4)spu_shuffle(xabs, xabs, dup_even);
 
   /* x = +/- infinite, return +/-1 */
-  isinf = spu_cmpeq(xabshigh, 0x7FF00000);
-  result = spu_sel(result, oned, (vec_ullong2)isinf);
-
   /* x = nan, return x */
-  isnan = spu_cmpgt(xabshigh, 0x7FF00000);
-  result = spu_sel(result, x, (vec_ullong2)isnan);
+  result = spu_sel(result, oned, spu_testsv(x, SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY));
+  result = spu_sel(result,    x, spu_testsv(x, SPU_SV_NEG_DENORM   | SPU_SV_POS_DENORM));
 
   /*
    * Preserve sign in result, since erf(-x) = -erf(x)
Index: src/newlib/libm/machine/spu/headers/recipd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/recipd2.h
+++ src/newlib/libm/machine/spu/headers/recipd2.h
@@ -1,113 +1,168 @@
-/* --------------------------------------------------------------  */
-/* (C)Copyright 2001,2008,                                         */
-/* International Business Machines Corporation,                    */
-/* Sony Computer Entertainment, Incorporated,                      */
-/* Toshiba Corporation,                                            */
-/*                                                                 */
-/* All Rights Reserved.                                            */
-/*                                                                 */
-/* Redistribution and use in source and binary forms, with or      */
-/* without modification, are permitted provided that the           */
-/* following conditions are met:                                   */
-/*                                                                 */
-/* - Redistributions of source code must retain the above copyright*/
-/*   notice, this list of conditions and the following disclaimer. */
-/*                                                                 */
-/* - Redistributions in binary form must reproduce the above       */
-/*   copyright notice, this list of conditions and the following   */
-/*   disclaimer in the documentation and/or other materials        */
-/*   provided with the distribution.                               */
-/*                                                                 */
-/* - Neither the name of IBM Corporation nor the names of its      */
-/*   contributors may be used to endorse or promote products       */
-/*   derived from this software without specific prior written     */
-/*   permission.                                                   */
-/*                                                                 */
-/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          */
-/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     */
-/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        */
-/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        */
-/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            */
-/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    */
-/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT    */
-/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    */
-/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)        */
-/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN       */
-/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR    */
-/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  */
-/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              */
-/* --------------------------------------------------------------  */
-/* PROLOG END TAG zYx                                              */
-#ifdef __SPU__
-
-#ifndef _RECIPD2_H_
-#define _RECIPD2_H_		1
-
-#include <spu_intrinsics.h>
-
-
-/*
- * FUNCTION
- *	vector double _recipd2(vector double value)
- * 
- * DESCRIPTION
- * 	The _recipd2 function inverts "value" and returns the result. 
- *      Computation is performed using the single precision reciprocal 
- *      estimate and interpolate instructions to produce a 12 accurate 
- *      estimate.
- *
- *	One (1) iteration of a Newton-Raphson is performed to improve 
- *	accuracy to single precision floating point. Two additional double 
- *	precision iterations are  needed to achieve a full double
- *	preicision result.
- *
- *	The Newton-Raphson iteration is of the form:
- *		X[i+1] = X[i] * (2.0 - b*X[i]) 
- * 	where b is the input value to be inverted
- *
- */ 
-static __inline vector double _recipd2(vector double value_d)
-{
-  vector unsigned long long expmask  = (vector unsigned long long) { 0x7FF0000000000000ULL, 0x7FF0000000000000ULL };
-  vector float  x0;
-  vector float  value;
-  vector float  two   = spu_splats(2.0f);
-  vector double two_d = spu_splats(2.0);
-  vector double x1, x2, x3;
-  vector double bias;
-
-  /* Bias the divisor to correct for double precision floating
-   * point exponents that are out of single precision range.
-   */
-  bias = spu_xor(spu_and(value_d, (vector double)expmask), (vector double)expmask);
-
-  value = spu_roundtf(spu_mul(value_d, bias));
-  x0 = spu_re(value);
-  x1 = spu_extend(spu_mul(x0, spu_nmsub(value, x0, two)));
-  x1 = spu_mul(x1, bias);
-  x2 = spu_mul(x1, spu_nmsub(value_d, x1, two_d));
-  x3 = spu_mul(x2, spu_nmsub(value_d, x2, two_d));
-
-  /* Handle input = +/- infinity or +/-0. */
-
-#ifdef __SPU_EDP__
-  vec_ullong2 is0inf = spu_testsv(value_d, SPU_SV_NEG_ZERO     | SPU_SV_POS_ZERO |
-                                           SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY);
-#else
-  vec_double2 nzero = spu_splats(-0.0);
-  vec_double2 xabs = spu_andc(value_d, nzero);
-  vector unsigned char swap  = (vector unsigned char) {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
-  vec_uint4 isinf  = spu_cmpeq((vec_uint4)xabs, (vec_uint4)expmask);
-  vec_uint4 iszero = spu_cmpeq((vec_uint4)xabs, 0);
-  isinf  = spu_and(isinf,  spu_shuffle(isinf, isinf, swap));
-  iszero = spu_and(iszero, spu_shuffle(iszero, iszero, swap));
-  vec_ullong2 is0inf = (vec_ullong2)spu_or(isinf, iszero);
-#endif /* __SPU_EDP__ */
-
-  x3 = spu_sel(x3, spu_xor(value_d, (vector double)expmask), is0inf);
-
-  return (x3);
-}
-
-#endif /* _RECIPD2_H_ */
-#endif /* __SPU__ */
+/* --------------------------------------------------------------  */
+/* (C)Copyright 2001,2008,                                         */
+/* International Business Machines Corporation,                    */
+/* Sony Computer Entertainment, Incorporated,                      */
+/* Toshiba Corporation,                                            */
+/*                                                                 */
+/* All Rights Reserved.                                            */
+/*                                                                 */
+/* Redistribution and use in source and binary forms, with or      */
+/* without modification, are permitted provided that the           */
+/* following conditions are met:                                   */
+/*                                                                 */
+/* - Redistributions of source code must retain the above copyright*/
+/*   notice, this list of conditions and the following disclaimer. */
+/*                                                                 */
+/* - Redistributions in binary form must reproduce the above       */
+/*   copyright notice, this list of conditions and the following   */
+/*   disclaimer in the documentation and/or other materials        */
+/*   provided with the distribution.                               */
+/*                                                                 */
+/* - Neither the name of IBM Corporation nor the names of its      */
+/*   contributors may be used to endorse or promote products       */
+/*   derived from this software without specific prior written     */
+/*   permission.                                                   */
+/*                                                                 */
+/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          */
+/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        */
+/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            */
+/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    */
+/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT    */
+/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    */
+/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)        */
+/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN       */
+/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR    */
+/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  */
+/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              */
+/* --------------------------------------------------------------  */
+/* PROLOG END TAG zYx                                              */
+#ifdef __SPU__
+
+#ifndef _RECIPD2_H_
+#define _RECIPD2_H_		1
+
+#include <spu_intrinsics.h>
+
+
+/*
+ * FUNCTION
+ *	vector double _recipd2(vector double value)
+ * 
+ * DESCRIPTION
+ * 	The _recipd2 function inverts "value" and returns the result. 
+ *      Computation is performed using the single precision reciprocal 
+ *      estimate and interpolate instructions to produce a 12 accurate 
+ *      estimate.
+ *
+ *	One (1) iteration of a Newton-Raphson is performed to improve 
+ *	accuracy to single precision floating point. Two additional double 
+ *	precision iterations are  needed to achieve a full double
+ *	preicision result.
+ *
+ *	The Newton-Raphson iteration is of the form:
+ *	a)	X[i+1] = X[i] * (2.0 - b*X[i]) 
+ *          or
+ *      b)	X[i+1] = X[i] + X[i]*(1.0 - X[i]*b)
+ * 	where b is the input value to be inverted
+ *
+ *      The later (b) form improves the accuracy to 99.95% correctly rounded.
+ */ 
+static __inline vector double _recipd2(vector double value_in)
+{
+  vec_float4  x0;
+  vec_float4  value;
+  vec_float4  one   = spu_splats(1.0f);
+  vec_double2 one_d = spu_splats(1.0);
+  vec_double2 x1, x2, x3;
+  vec_double2 scale;
+  vec_double2 exp, value_d;
+  vec_ullong2 expmask = spu_splats(0x7FF0000000000000ULL);
+  vec_ullong2 is0inf;
+
+#ifdef __SPU_EDP__
+  vec_ullong2 isdenorm;
+  vec_ullong2 expmask_minus1 = spu_splats(0x7FE0000000000000ULL);
+
+  /* Determine special input values. For example, if the input is a denorm, infinity or 0 */
+
+  isdenorm = spu_testsv(value_in, (SPU_SV_POS_DENORM   | SPU_SV_NEG_DENORM));
+  is0inf   = spu_testsv(value_in, (SPU_SV_NEG_ZERO     | SPU_SV_POS_ZERO |
+				   SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY));
+
+  /* Scale the divisor to correct for double precision floating
+   * point exponents that are out of single precision range.
+   */
+  exp = spu_and(value_in, (vec_double2)expmask);
+  scale = spu_xor(exp, (vec_double2)spu_sel(expmask, expmask_minus1, isdenorm));
+  value_d = spu_mul(value_in, scale);
+  value = spu_roundtf(value_d);
+
+  /* Perform reciprocal with 1 single precision and 2 double precision
+   * Newton-Raphson iterations.
+   */
+  x0 = spu_re(value);
+  x1 = spu_extend(spu_madd(spu_nmsub(value, x0, one), x0, x0));
+  x2 = spu_madd(spu_nmsub(value_d, x1, one_d), x1, x1);
+  x3 = spu_madd(spu_nmsub(value_d, x2, one_d), x2, x2);
+  x3 = spu_sel(spu_mul(x3, scale), spu_xor(value_in, (vector double)expmask), is0inf);
+
+#else /* !__SPU_EDP__ */
+
+  vec_uint4 isinf, iszero, isdenorm0;
+  vec_double2 value_abs;
+  vec_double2 sign = spu_splats(-0.0);
+  vec_double2 denorm_scale = (vec_double2)spu_splats(0x4330000000000000ULL);
+  vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);
+  vec_uchar16 splat_hi = (vec_uchar16){0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11};
+  vec_uchar16 swap = (vec_uchar16){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
+
+  value_abs = spu_andc(value_in, sign);
+  exp = spu_and(value_in, (vec_double2)expmask);
+
+  /* Determine if the input is a special value. These include:
+   *  denorm   - then we must coerce it to a normal value. 
+   *  zero     - then we must return an infinity
+   *  infinity - then we must return a zero.
+   */
+  isdenorm0 = spu_cmpeq(spu_shuffle((vec_uint4)exp, (vec_uint4)exp, splat_hi), 0);
+
+  isinf  = spu_cmpeq((vec_uint4)value_abs, (vec_uint4)expmask);
+  iszero = spu_cmpeq((vec_uint4)value_abs, 0);
+  isinf  = spu_and(isinf,  spu_shuffle(isinf, isinf, swap));
+  iszero = spu_and(iszero, spu_shuffle(iszero, iszero, swap));
+  is0inf = (vec_ullong2)spu_or(isinf, iszero);
+
+  /* If the inputs is a denorm, we must first convert it to a normal number since
+   * arithmetic operations on denormals produces 0 on Cell/B.E.
+   */
+  value_d = spu_sub(spu_or(value_abs, exp_53), exp_53);
+  value_d = spu_sel(value_abs, value_d, (vec_ullong2)isdenorm0);
+
+  /* Scale the divisor to correct for double precision floating
+   * point exponents that are out of single precision range.
+   */
+  scale = spu_xor(spu_and(value_d, (vec_double2)expmask), (vec_double2)expmask);
+  value_d = spu_mul(value_d, scale);
+  value = spu_roundtf(value_d);
+
+  /* Perform reciprocal with 1 single precision and 2 double precision
+   * Newton-Raphson iterations. The bias is removed after the single 
+   * precision iteration.
+   */
+  x0 = spu_re(value);
+  x1 = spu_extend(spu_madd(spu_nmsub(value, x0, one), x0, x0));
+  x2 = spu_madd(spu_nmsub(value_d, x1, one_d), x1, x1);
+  x3 = spu_madd(spu_nmsub(value_d, x2, one_d), x2, x2);
+  x3 = spu_mul(x3, spu_sel(scale, value_in, (vec_ullong2)sign));
+  x3 = spu_sel(x3, spu_mul(x3, denorm_scale), (vec_ullong2)isdenorm0);
+  x3 = spu_sel(x3, spu_xor(value_in, (vector double)expmask), is0inf);
+
+#endif /* __SPU_EDP__ */
+
+  return (x3);
+}
+
+#endif /* _RECIPD2_H_ */
+#endif /* __SPU__ */

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]