[patch 1/2] SPU libm

Ken Werner ken@linux.vnet.ibm.com
Tue Sep 2 16:53:00 GMT 2008


Hi,

the SPU implementation of libm incorporates header files of the simdmath 
library. This patch updates these headers (rebase against current version).

Ken

newlib/ChangeLog:

2008-09-01  Ken Werner  <ken.werner@de.ibm.com>

        * libm/machine/spu/headers/acoshf4.h: Rebase against current simdmath.
        * libm/machine/spu/headers/asinhd2.h: Likewise.
        * libm/machine/spu/headers/atanhd2.h: Likewise.
        * libm/machine/spu/headers/atanhf4.h: Likewise.
        * libm/machine/spu/headers/erff4.h: Likewise.
        * libm/machine/spu/headers/expd2.h: Likewise.
        * libm/machine/spu/headers/ldexpd2.h: Likewise.
        * libm/machine/spu/headers/lgammaf4.h: Likewise.
        * libm/machine/spu/headers/logbf4.h: Likewise.
        * libm/machine/spu/headers/nextafterd2.h: Likewise.
        * libm/machine/spu/headers/nextafterf4.h: Likewise.
        * libm/machine/spu/headers/recipd2.h: Likewise.
        * libm/machine/spu/headers/simdmath.h: Likewise.
        * libm/machine/spu/headers/acoshd2.: Likewise.

Index: src/newlib/libm/machine/spu/headers/acoshf4.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/acoshf4.h
+++ src/newlib/libm/machine/spu/headers/acoshf4.h
@@ -89,26 +89,26 @@
  * Taylor Series Coefficients 
  * for x around 1.
  */
-#define ACOSH_TAY01  1.0000000000000000000000000000000000000000000000000000000000000000000000E0  /* 1 / 1                            */
-#define ACOSH_TAY02 -8.3333333333333333333333333333333333333333333333333333333333333333333333E-2 /* 1 / 12                           */
-#define ACOSH_TAY03  1.8750000000000000000000000000000000000000000000000000000000000000000000E-2 /* 3 / 160                          */
-#define ACOSH_TAY04 -5.5803571428571428571428571428571428571428571428571428571428571428571429E-3 /* 5 / 896                          */
-#define ACOSH_TAY05  1.8988715277777777777777777777777777777777777777777777777777777777777778E-3 /* 35 / 18432                       */
-#define ACOSH_TAY06 -6.9912997159090909090909090909090909090909090909090909090909090909090909E-4 /* 63 / 90112                       */
-#define ACOSH_TAY07  2.7113694411057692307692307692307692307692307692307692307692307692307692E-4 /* 231 / 851968                     */
-#define ACOSH_TAY08 -1.0910034179687500000000000000000000000000000000000000000000000000000000E-4 /* 143 / 1310720                    */
-#define ACOSH_TAY09  4.5124222250545726102941176470588235294117647058823529411764705882352941E-5 /* 6435 / 142606336                 */
-#define ACOSH_TAY10 -1.9065643611707185444078947368421052631578947368421052631578947368421053E-5 /* 12155 / 637534208                */
-#define ACOSH_TAY11  8.1936873140789213634672619047619047619047619047619047619047619047619048E-6 /* 46189 / 5637144576               */
-#define ACOSH_TAY12 -3.5705692742181860882302989130434782608695652173913043478260869565217391E-6 /* 88179 / 24696061952              */
-#define ACOSH_TAY13  1.5740259550511837005615234375000000000000000000000000000000000000000000E-6 /* 676039 / 429496729600            */
-#define ACOSH_TAY14 -7.0068819224144573564882631655092592592592592592592592592592592592592593E-7 /* 1300075 / 1855425871872          */
-#define ACOSH_TAY15  3.1453306166503321507881427633351293103448275862068965517241379310344828E-7 /* 5014575 / 15942918602752         */
+#define SDM_ACOSHF4_TAY01  1.00000000000000000000000000000000000E0f  /* 1 / 1                            */
+#define SDM_ACOSHF4_TAY02 -8.33333333333333333333333333333333333E-2f /* 1 / 12                           */
+#define SDM_ACOSHF4_TAY03  1.87500000000000000000000000000000000E-2f /* 3 / 160                          */
+#define SDM_ACOSHF4_TAY04 -5.58035714285714285714285714285714286E-3f /* 5 / 896                          */
+#define SDM_ACOSHF4_TAY05  1.89887152777777777777777777777777778E-3f /* 35 / 18432                       */
+#define SDM_ACOSHF4_TAY06 -6.99129971590909090909090909090909091E-4f /* 63 / 90112                       */
+#define SDM_ACOSHF4_TAY07  2.71136944110576923076923076923076923E-4f /* 231 / 851968                     */
+#define SDM_ACOSHF4_TAY08 -1.09100341796875000000000000000000000E-4f /* 143 / 1310720                    */
+#define SDM_ACOSHF4_TAY09  4.51242222505457261029411764705882353E-5f /* 6435 / 142606336                 */
+#define SDM_ACOSHF4_TAY10 -1.90656436117071854440789473684210526E-5f /* 12155 / 637534208                */
+#define SDM_ACOSHF4_TAY11  8.19368731407892136346726190476190476E-6f /* 46189 / 5637144576               */
+#define SDM_ACOSHF4_TAY12 -3.57056927421818608823029891304347826E-6f /* 88179 / 24696061952              */
+#define SDM_ACOSHF4_TAY13  1.57402595505118370056152343750000000E-6f /* 676039 / 429496729600            */
+#define SDM_ACOSHF4_TAY14 -7.00688192241445735648826316550925926E-7f /* 1300075 / 1855425871872          */
+#define SDM_ACOSHF4_TAY15  3.14533061665033215078814276333512931E-7f /* 5014575 / 15942918602752         */
 #if 0
-#define ACOSH_TAY16 -1.4221629293564136230176494967552923387096774193548387096774193548387097E-7 /* 9694845 / 68169720922112         */
-#define ACOSH_TAY17  6.4711106776113328206437555226412686434659090909090909090909090909090909E-8 /* 100180065 / 1548112371908608     */
-#define ACOSH_TAY18 -2.9609409781171182528071637664522443498883928571428571428571428571428571E-8 /* 116680311 / 3940649673949184     */
-#define ACOSH_TAY19  1.3615438056281793767600509061201198680980785472972972972972972972972973E-8 /* 2268783825 / 166633186212708352  */
+#define SDM_ACOSHF4_TAY16 -1.42216292935641362301764949675529234E-7f /* 9694845 / 68169720922112         */
+#define SDM_ACOSHF4_TAY17  6.47111067761133282064375552264126864E-8f /* 100180065 / 1548112371908608     */
+#define SDM_ACOSHF4_TAY18 -2.96094097811711825280716376645224435E-8f /* 116680311 / 3940649673949184     */
+#define SDM_ACOSHF4_TAY19  1.36154380562817937676005090612011987E-8f /* 2268783825 / 166633186212708352  */
 #endif
 
 
@@ -117,6 +117,7 @@ static __inline vector float _acoshf4(ve
 {
     vec_float4 minus_onef = spu_splats(-1.0f);
     vec_float4 twof       = spu_splats(2.0f);
+    vec_float4 largef     = spu_splats(2.5e19f);
     vec_float4 xminus1;
     /* Where we switch from taylor to formula */
     vec_float4  switch_approx = spu_splats(2.0f);
@@ -129,30 +130,33 @@ static __inline vector float _acoshf4(ve
      *   acosh = ln(x + sqrt(x^2 - 1))
      */
     fresult = _sqrtf4(spu_madd(x, x, minus_onef));
-    fresult = spu_add(x, fresult);
+    fresult = spu_add(x, spu_sel(fresult, x, spu_cmpgt(x, largef)));
     fresult = _logf4(fresult);
+    fresult = (vec_float4)spu_add((vec_uint4)fresult, spu_splats(2u));
 
     /*
      * Taylor Series
      */
     xminus1 = spu_add(x, minus_onef);
 
-    mresult = spu_madd(xminus1, spu_splats((float)ACOSH_TAY15), spu_splats((float)ACOSH_TAY14));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY13));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY12));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY11));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY10));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY09));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY08));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY07));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY06));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY05));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY04));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY03));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY02));
-    mresult = spu_madd(xminus1, mresult, spu_splats((float)ACOSH_TAY01));
+    mresult = spu_splats(SDM_ACOSHF4_TAY15);
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY14));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY13));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY12));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY11));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY10));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY09));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY08));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY07));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY06));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY05));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY04));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY03));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY02));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHF4_TAY01));
     
     mresult = spu_mul(mresult, _sqrtf4(spu_mul(xminus1, twof)));
+    mresult = (vec_float4)spu_add((vec_uint4)mresult, spu_splats(1u));
 
     /*
      * Select series or formula
@@ -160,6 +164,7 @@ static __inline vector float _acoshf4(ve
     use_form = spu_cmpgt(x, switch_approx);
     result = spu_sel(mresult, fresult, use_form);
 
+
     return result;
 }
 
Index: src/newlib/libm/machine/spu/headers/asinhd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/asinhd2.h
+++ src/newlib/libm/machine/spu/headers/asinhd2.h
@@ -93,43 +93,27 @@
  * Maclaurin Series Coefficients 
  * for x near 0.
  */
-#define ASINH_MAC01     1.0000000000000000000000000000000000000000000000000000000000000000000000E0
-#define ASINH_MAC03     -1.6666666666666666666666666666666666666666666666666666666666666666666667E-1
-#define ASINH_MAC05     7.5000000000000000000000000000000000000000000000000000000000000000000000E-2
-#define ASINH_MAC07     -4.4642857142857142857142857142857142857142857142857142857142857142857143E-2
-#define ASINH_MAC09     3.0381944444444444444444444444444444444444444444444444444444444444444444E-2
-#define ASINH_MAC11     -2.2372159090909090909090909090909090909090909090909090909090909090909091E-2
-#define ASINH_MAC13     1.7352764423076923076923076923076923076923076923076923076923076923076923E-2
-#define ASINH_MAC15     -1.3964843750000000000000000000000000000000000000000000000000000000000000E-2
-#define ASINH_MAC17     1.1551800896139705882352941176470588235294117647058823529411764705882353E-2
-#if 0
-#define ASINH_MAC19     -9.7616095291940789473684210526315789473684210526315789473684210526315789E-3
-#define ASINH_MAC21     8.3903358096168154761904761904761904761904761904761904761904761904761905E-3
-#define ASINH_MAC23     -7.3125258735988451086956521739130434782608695652173913043478260869565217E-3
-#define ASINH_MAC25     6.4472103118896484375000000000000000000000000000000000000000000000000000E-3
-#define ASINH_MAC27     -5.7400376708419234664351851851851851851851851851851851851851851851851852E-3
-#define ASINH_MAC29     5.1533096823199041958512931034482758620689655172413793103448275862068966E-3
-#define ASINH_MAC31     -4.6601434869150961599042338709677419354838709677419354838709677419354839E-3
-#define ASINH_MAC33     4.2409070936793630773370916193181818181818181818181818181818181818181818E-3
-#define ASINH_MAC35     -3.8809645588376692363194056919642857142857142857142857142857142857142857E-3
-#define ASINH_MAC37     3.5692053938259345454138678473395270270270270270270270270270270270270270E-3
-#define ASINH_MAC39     -3.2970595034734847453924325796274038461538461538461538461538461538461538E-3
-#define ASINH_MAC41     3.0578216492580306693548109473251714939024390243902439024390243902439024E-3
-#define ASINH_MAC43     -2.8461784011089421678767647854117460029069767441860465116279069767441860E-3
-#endif
+#define SDM_ASINHD2_MAC01     1.000000000000000000000000000000000000000000E0
+#define SDM_ASINHD2_MAC03    -1.666666666666666666666666666666666666666667E-1
+#define SDM_ASINHD2_MAC05     7.500000000000000000000000000000000000000000E-2
+#define SDM_ASINHD2_MAC07    -4.464285714285714285714285714285714285714286E-2
+#define SDM_ASINHD2_MAC09     3.038194444444444444444444444444444444444444E-2
+#define SDM_ASINHD2_MAC11    -2.237215909090909090909090909090909090909091E-2
+#define SDM_ASINHD2_MAC13     1.735276442307692307692307692307692307692308E-2
+#define SDM_ASINHD2_MAC15    -1.396484375000000000000000000000000000000000E-2
+#define SDM_ASINHD2_MAC17     1.155180089613970588235294117647058823529412E-2
 
 
 static __inline vector double _asinhd2(vector double x)
 {
     vec_double2 sign_mask = spu_splats(-0.0);
     vec_double2 oned      = spu_splats(1.0);
-    vec_uchar16 dup_even  = ((vec_uchar16) { 0,1,2,3,  0,1,2,3, 8,9,10,11, 8,9,10,11 });
+    vec_uchar16 dup_even  = ((vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 });
     vec_uint4   infminus1 = spu_splats(0x7FEFFFFFU);
     vec_uint4   isinfnan;
     vec_double2 xabs, xsqu;
     vec_uint4   xabshigh;
-    /* Where we switch from maclaurin to formula */
-    vec_float4  switch_approx = spu_splats(0.165f);
+    vec_float4  switch_approx = spu_splats(0.165f);  /* Where we switch from maclaurin to formula */
     vec_uint4   use_form;
     vec_float4  xf;
     vec_double2 result, fresult, mresult;
@@ -153,14 +137,16 @@ static __inline vector double _asinhd2(v
     /*
      * Maclaurin Series approximation
      */
-    mresult = spu_madd(xsqu, spu_splats(ASINH_MAC17), spu_splats(ASINH_MAC15));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ASINH_MAC13));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ASINH_MAC11));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ASINH_MAC09));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ASINH_MAC07));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ASINH_MAC05));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ASINH_MAC03));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ASINH_MAC01));
+
+    mresult = spu_splats(SDM_ASINHD2_MAC17);
+    mresult = spu_madd(xsqu, mresult, spu_splats(SDM_ASINHD2_MAC15));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SDM_ASINHD2_MAC13));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SDM_ASINHD2_MAC11));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SDM_ASINHD2_MAC09));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SDM_ASINHD2_MAC07));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SDM_ASINHD2_MAC05));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SDM_ASINHD2_MAC03));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SDM_ASINHD2_MAC01));
     mresult = spu_mul(xabs, mresult);
 
 
Index: src/newlib/libm/machine/spu/headers/atanhd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/atanhd2.h
+++ src/newlib/libm/machine/spu/headers/atanhd2.h
@@ -72,7 +72,7 @@
  *  Special Cases:
  *  - atanh(1)  =  Infinity
  *  - atanh(-1) = -Infinity
- *  - atanh(x) for |x| > 1 = NaN
+ *  - atanh(x) for |x| > 1 = Undefined
  *
  */
 
@@ -80,37 +80,27 @@
  * Maclaurin Series Coefficients 
  * for x near 0.
  */
-#define ATANH_MAC01 1.0000000000000000000000000000000000000000000000000000000000000000000000E0
-#define ATANH_MAC03 3.3333333333333333333333333333333333333333333333333333333333333333333333E-1
-#define ATANH_MAC05 2.0000000000000000000000000000000000000000000000000000000000000000000000E-1
-#define ATANH_MAC07 1.4285714285714285714285714285714285714285714285714285714285714285714286E-1
-#define ATANH_MAC09 1.1111111111111111111111111111111111111111111111111111111111111111111111E-1
-#define ATANH_MAC11 9.0909090909090909090909090909090909090909090909090909090909090909090909E-2
-#define ATANH_MAC13 7.6923076923076923076923076923076923076923076923076923076923076923076923E-2
+#define SMD_DP_ATANH_MAC01 1.000000000000000000000000000000E0
+#define SMD_DP_ATANH_MAC03 3.333333333333333333333333333333E-1
+#define SMD_DP_ATANH_MAC05 2.000000000000000000000000000000E-1
+#define SMD_DP_ATANH_MAC07 1.428571428571428571428571428571E-1
+#define SMD_DP_ATANH_MAC09 1.111111111111111111111111111111E-1
+#define SMD_DP_ATANH_MAC11 9.090909090909090909090909090909E-2
+#define SMD_DP_ATANH_MAC13 7.692307692307692307692307692308E-2
+#define SMD_DP_ATANH_MAC15 6.666666666666666666666666666667E-2
+#define SMD_DP_ATANH_MAC17 5.882352941176470588235294117647E-2
 #if 0
-#define ATANH_MAC15 6.6666666666666666666666666666666666666666666666666666666666666666666667E-2
-#define ATANH_MAC17 5.8823529411764705882352941176470588235294117647058823529411764705882353E-2
-#define ATANH_MAC19 5.2631578947368421052631578947368421052631578947368421052631578947368421E-2
-#define ATANH_MAC21 4.7619047619047619047619047619047619047619047619047619047619047619047619E-2
-#define ATANH_MAC23 4.3478260869565217391304347826086956521739130434782608695652173913043478E-2
-#define ATANH_MAC25 4.0000000000000000000000000000000000000000000000000000000000000000000000E-2
-#define ATANH_MAC27 3.7037037037037037037037037037037037037037037037037037037037037037037037E-2
-#define ATANH_MAC29 3.4482758620689655172413793103448275862068965517241379310344827586206897E-2
-#define ATANH_MAC31 3.2258064516129032258064516129032258064516129032258064516129032258064516E-2
-#define ATANH_MAC33 3.0303030303030303030303030303030303030303030303030303030303030303030303E-2
-#define ATANH_MAC35 2.8571428571428571428571428571428571428571428571428571428571428571428571E-2
-#define ATANH_MAC37 2.7027027027027027027027027027027027027027027027027027027027027027027027E-2
-#define ATANH_MAC39 2.5641025641025641025641025641025641025641025641025641025641025641025641E-2
-#define ATANH_MAC41 2.4390243902439024390243902439024390243902439024390243902439024390243902E-2
-#define ATANH_MAC43 2.3255813953488372093023255813953488372093023255813953488372093023255814E-2
-#define ATANH_MAC45 2.2222222222222222222222222222222222222222222222222222222222222222222222E-2
-#define ATANH_MAC47 2.1276595744680851063829787234042553191489361702127659574468085106382979E-2
-#define ATANH_MAC49 2.0408163265306122448979591836734693877551020408163265306122448979591837E-2
-#define ATANH_MAC51 1.9607843137254901960784313725490196078431372549019607843137254901960784E-2
-#define ATANH_MAC53 1.8867924528301886792452830188679245283018867924528301886792452830188679E-2
-#define ATANH_MAC55 1.8181818181818181818181818181818181818181818181818181818181818181818182E-2
-#define ATANH_MAC57 1.7543859649122807017543859649122807017543859649122807017543859649122807E-2
-#define ATANH_MAC59 1.6949152542372881355932203389830508474576271186440677966101694915254237E-2
+#define SMD_DP_ATANH_MAC19 5.263157894736842105263157894737E-2
+#define SMD_DP_ATANH_MAC21 4.761904761904761904761904761905E-2
+#define SMD_DP_ATANH_MAC23 4.347826086956521739130434782609E-2
+#define SMD_DP_ATANH_MAC25 4.000000000000000000000000000000E-2
+#define SMD_DP_ATANH_MAC27 3.703703703703703703703703703704E-2
+#define SMD_DP_ATANH_MAC29 3.448275862068965517241379310345E-2
+#define SMD_DP_ATANH_MAC31 3.225806451612903225806451612903E-2
+#define SMD_DP_ATANH_MAC33 3.030303030303030303030303030303E-2
+#define SMD_DP_ATANH_MAC35 2.857142857142857142857142857143E-2
+#define SMD_DP_ATANH_MAC37 2.702702702702702702702702702703E-2
+#define SMD_DP_ATANH_MAC39 2.564102564102564102564102564103E-2
 #endif
 
 
@@ -120,12 +110,9 @@ static __inline vector double _atanhd2(v
     vec_double2 sign_mask = spu_splats(-0.0);
     vec_double2 oned      = spu_splats(1.0);
     vec_double2 onehalfd  = spu_splats(0.5);
-    vec_uint4   infminus1 = spu_splats(0x7FEFFFFFU);
-    vec_uint4   isinfnan;
-    vec_uint4   xabshigh;
     vec_double2 xabs, xsqu;
     /* Where we switch from maclaurin to formula */
-    vec_float4  switch_approx = spu_splats(0.08f);
+    vec_float4  switch_approx = spu_splats(0.125f);
     vec_uint4   use_form;
     vec_float4  xf;
     vec_double2 result, fresult, mresult;;
@@ -147,12 +134,14 @@ static __inline vector double _atanhd2(v
     /*
      * Taylor Series
      */
-    mresult = spu_madd(xsqu, spu_splats(ATANH_MAC13), spu_splats(ATANH_MAC11));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ATANH_MAC09));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ATANH_MAC07));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ATANH_MAC05));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ATANH_MAC03));
-    mresult = spu_madd(xsqu, mresult, spu_splats(ATANH_MAC01));
+    mresult = spu_madd(xsqu, spu_splats(SMD_DP_ATANH_MAC17), spu_splats(SMD_DP_ATANH_MAC15));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SMD_DP_ATANH_MAC13));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SMD_DP_ATANH_MAC11));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SMD_DP_ATANH_MAC09));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SMD_DP_ATANH_MAC07));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SMD_DP_ATANH_MAC05));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SMD_DP_ATANH_MAC03));
+    mresult = spu_madd(xsqu, mresult, spu_splats(SMD_DP_ATANH_MAC01));
     mresult = spu_mul(xabs, mresult);
 
 
@@ -162,10 +151,10 @@ static __inline vector double _atanhd2(v
     use_form = spu_cmpgt(xf, switch_approx);
     result = spu_sel(mresult, fresult, (vec_ullong2)use_form);
 
-    /* Infinity and NaN */
-    xabshigh = (vec_uint4)spu_shuffle(xabs, xabs, dup_even);
-    isinfnan = spu_cmpgt(xabshigh, infminus1);
-    result = spu_sel(result, x, (vec_ullong2)isinfnan);
+    /*
+     * Spec says results are undefined for |x| > 1, so
+     * no boundary tests needed here.
+     */
 
     /* Restore sign - atanh is an anti-symmetric */
     result = spu_sel(result, x, (vec_ullong2)sign_mask);
Index: src/newlib/libm/machine/spu/headers/atanhf4.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/atanhf4.h
+++ src/newlib/libm/machine/spu/headers/atanhf4.h
@@ -52,6 +52,7 @@
 #define _ATANHF4_H_	1
 
 #include <spu_intrinsics.h>
+#include <math.h>
 #include "logf4.h"
 
 /*
@@ -72,8 +73,7 @@
  *  Special Cases:
  *  - atanh(1)  =  HUGE_VALF
  *  - atanh(-1) = -HUGE_VALF
- *	- The result is undefined for x outside of the domain [-1,1],
- *	  since single-precision NaN is not supported on the SPU.
+ *	- The result is undefined for x outside of the domain [-1,1].
  *
  */
 
@@ -81,34 +81,28 @@
  * Maclaurin Series Coefficients 
  * for x near 0.
  */
-#define ATANH_MAC01 1.0000000000000000000000000000000000000000000000000000000000000000000000E0
-#define ATANH_MAC03 3.3333333333333333333333333333333333333333333333333333333333333333333333E-1
-#define ATANH_MAC05 2.0000000000000000000000000000000000000000000000000000000000000000000000E-1
-#define ATANH_MAC07 1.4285714285714285714285714285714285714285714285714285714285714285714286E-1
+#define SDM_SP_ATANH_MAC01 1.000000000000000000000000000000E0
+#define SDM_SP_ATANH_MAC03 3.333333333333333333333333333333E-1
+#define SDM_SP_ATANH_MAC05 2.000000000000000000000000000000E-1
+#define SDM_SP_ATANH_MAC07 1.428571428571428571428571428571E-1
 #if 0
-#define ATANH_MAC09 1.1111111111111111111111111111111111111111111111111111111111111111111111E-1
-#define ATANH_MAC11 9.0909090909090909090909090909090909090909090909090909090909090909090909E-2
-#define ATANH_MAC13 7.6923076923076923076923076923076923076923076923076923076923076923076923E-2
-#define ATANH_MAC15 6.6666666666666666666666666666666666666666666666666666666666666666666667E-2
-#define ATANH_MAC17 5.8823529411764705882352941176470588235294117647058823529411764705882353E-2
-#define ATANH_MAC19 5.2631578947368421052631578947368421052631578947368421052631578947368421E-2
-#define ATANH_MAC21 4.7619047619047619047619047619047619047619047619047619047619047619047619E-2
-#define ATANH_MAC23 4.3478260869565217391304347826086956521739130434782608695652173913043478E-2
-#define ATANH_MAC25 4.0000000000000000000000000000000000000000000000000000000000000000000000E-2
-#define ATANH_MAC27 3.7037037037037037037037037037037037037037037037037037037037037037037037E-2
-#define ATANH_MAC29 3.4482758620689655172413793103448275862068965517241379310344827586206897E-2
+#define SDM_SP_ATANH_MAC09 1.111111111111111111111111111111E-1
+#define SDM_SP_ATANH_MAC11 9.090909090909090909090909090909E-2
+#define SDM_SP_ATANH_MAC13 7.692307692307692307692307692308E-2
+#define SDM_SP_ATANH_MAC15 6.666666666666666666666666666667E-2
 #endif
 
 
 static __inline vector float _atanhf4(vector float x)
 {
+    vec_uint4  one = spu_splats(1u);
     vec_float4 sign_mask = spu_splats(-0.0f);
     vec_float4 onef      = spu_splats(1.0f);
     vec_float4 onehalff  = spu_splats(0.5f);
+    vec_float4 huge      = spu_splats(HUGE_VALF);
     vec_float4 result, fresult, mresult;;
     vec_float4 xabs, xsqu;
     /* Where we switch from maclaurin to formula */
-    //vec_float4  switch_approx = spu_splats(0.4661f);
     vec_float4  switch_approx = spu_splats(0.165f);
     vec_uint4   use_form;
 
@@ -126,18 +120,33 @@ static __inline vector float _atanhf4(ve
     /*
      * Taylor Series
      */
-    mresult = spu_madd(xsqu, spu_splats((float)ATANH_MAC07), spu_splats((float)ATANH_MAC05));
-    mresult = spu_madd(xsqu, mresult, spu_splats((float)ATANH_MAC03));
-    mresult = spu_madd(xsqu, mresult, spu_splats((float)ATANH_MAC01));
+    mresult = spu_madd(xsqu, spu_splats((float)SDM_SP_ATANH_MAC07), spu_splats((float)SDM_SP_ATANH_MAC05));
+    mresult = spu_madd(xsqu, mresult, spu_splats((float)SDM_SP_ATANH_MAC03));
+    mresult = spu_madd(xsqu, mresult, spu_splats((float)SDM_SP_ATANH_MAC01));
     mresult = spu_mul(xabs, mresult);
 
-
     /*
      * Choose between series and formula
      */
     use_form = spu_cmpgt(xabs, switch_approx);
     result = spu_sel(mresult, fresult, use_form);
 
+    /*
+     * Correct for accumulated truncation error. Currently reduces rms of
+     * absolute error by about 50%
+     */
+    result = (vec_float4)spu_add((vec_uint4)result, spu_and(one, spu_cmpgt(xabs, spu_splats(0.0f))));
+    result = (vec_float4)spu_add((vec_uint4)result, spu_and(one, spu_cmpgt(xabs, spu_splats(0.25f))));
+
+    /*
+     * Check Boundary Conditions
+     */
+    result = spu_sel(result, huge, spu_cmpeq(xabs, onef));
+
+    /*
+     * Spec says |x| > 1, result is undefined, so no additional
+     * boundary checks needed.
+     */
 
     /* Preserve sign - atanh is anti-symmetric */
     result = spu_sel(result, x, (vec_uint4)sign_mask);
Index: src/newlib/libm/machine/spu/headers/erff4.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/erff4.h
+++ src/newlib/libm/machine/spu/headers/erff4.h
@@ -53,11 +53,6 @@
 
 #include <spu_intrinsics.h>
 
-#include "expf4.h"
-#include "recipf4.h"
-#include "divf4.h"
-#include "erf_utils.h"
-
 /*
  * FUNCTION
  *  vector float _erff4(vector float x)
@@ -75,54 +70,346 @@
 
 static __inline vector float _erff4(vector float x)
 {
-  vector float onehalff  = spu_splats(0.5f);
-  vector float zerof     = spu_splats(0.0f);
-  vector float onef      = spu_splats(1.0f);
-  vector float sign_mask = spu_splats(-0.0f);
+  vec_float4 sign_maskf = spu_splats(-0.0f);
+  vec_float4 zerof      = spu_splats(0.0f);
+  vec_float4 onef       = spu_splats(1.0f);
+  vec_float4 clamp      = spu_splats(3.9199876f);
+  vec_float4 xabs       = spu_andc(x, sign_maskf);
+  vec_float4 xsign      = spu_and(x, sign_maskf);
+  vec_float4 result;
 
-  /* This is where we switch from Taylor Series to Continued Fraction approximation */
-  vec_float4 approx_point = spu_splats(0.89f);
 
-  vec_float4 xabs, xsqu, xsign;
-  vec_uint4 isinf;
-  vec_float4 tresult, presult, result;
+  /*
+   * First thing we do is setup the description of each partition.
+   * This consists of:
+   * - Start x of partition
+   * - Offset (used for evaluating power series expanded around a point)
+   * - Truncation adjustment.
+   */
 
-  xsign = spu_and(x, sign_mask);
 
-  /* Force Denorms to 0 */
-  x = spu_add(x, zerof);
+  /***************************************************************
+   * REGION 0: Approximation Near 0 from Above
+   *
+   */
+#define SDM_ERFF4_0_START     0.0f
+#define SDM_ERFF4_0_OFF       0.0f
+#define SDM_ERFF4_0_TRUNC     2u
+
+#define SDM_ERFF4_0_00   0.0f
+#define SDM_ERFF4_0_01   1.12837916709551257389615890312154f
+#define SDM_ERFF4_0_02   0.0f
+#define SDM_ERFF4_0_03  -0.37612638903183752463205296770955f
+#define SDM_ERFF4_0_04   0.0f
+#define SDM_ERFF4_0_05   0.11283791670955125738961589031073f
+#define SDM_ERFF4_0_06   0.0f
+#define SDM_ERFF4_0_07  -0.02686617064513125175943235483588f
+#define SDM_ERFF4_0_08   0.0f
+#define SDM_ERFF4_0_09   0.00522397762544218784211184677371f
+#define SDM_ERFF4_0_10   0.0f
+//#define SDM_ERFF4_0_11  -0.00085483270234508528325466583569f
 
-  xabs = spu_andc(x, sign_mask);
-  xsqu = spu_mul(x, x);
 
-  /*
-   * Taylor Series Expansion near Zero
+
+  /***************************************************************
+   * REGION 1: Above 0 and Below 1
    */
-  TAYLOR_ERFF4(xabs, xsqu, tresult);
+#define SDM_ERFF4_1_START     0.07f
+#define SDM_ERFF4_1_OFF       0.0625f
+#define SDM_ERFF4_1_TRUNC     1u
+
+#define SDM_ERFF4_1_00     0.0704319777223870780505900559232967439190042883f
+#define SDM_ERFF4_1_01     1.1239800336253906104888456836298420746260842545f
+#define SDM_ERFF4_1_02    -0.0702487521015869131555528552268651296641302713f
+#define SDM_ERFF4_1_03    -0.3717329798708974154481338589088279778060226856f
+#define SDM_ERFF4_1_04     0.0350329063214945152846051348331892508611482993f
+#define SDM_ERFF4_1_05     0.1106440713032318617523250293018186620702780982f
+#define SDM_ERFF4_1_06    -0.0116471931712158678624014740659716890227703402f
+#define SDM_ERFF4_1_07    -0.0261358409084263503958678377968739965222786482f
+#define SDM_ERFF4_1_08     0.0029041996223118476954500365511415181291113910f
+#define SDM_ERFF4_1_09     0.0050416329596619035812041623972929782386498567f
+#define SDM_ERFF4_1_10    -0.0005793225670734356072895029723913210064918149f
+//#define SDM_ERFF4_1_11    -0.0008184112733188406359323913130525859730689332f
 
-  /*
-   * Continued Fraction Approximation of Erfc().
-   * erf = 1 - erfc 
+
+
+  /***************************************************************
+   * REGION 2:
    */
-  CONTFRAC_ERFCF4(xabs, xsqu, presult);
-  presult = spu_sub(onef, presult);
+#define SDM_ERFF4_2_START     0.13f
+#define SDM_ERFF4_2_OFF       0.1875f
+#define SDM_ERFF4_2_TRUNC     1u
+
+#define SDM_ERFF4_2_00    0.2091176770593758483008706390019410965937912290f
+#define SDM_ERFF4_2_01    1.0893988034775673230502318110338693557898033315f
+#define SDM_ERFF4_2_02   -0.2042622756520438730719184645688505042105881396f
+#define SDM_ERFF4_2_03   -0.3376001500360169568827541289401834722369442864f
+#define SDM_ERFF4_2_04    0.0997374392832245473983976877777590352590762400f
+#define SDM_ERFF4_2_05    0.0937997370645632460099464120987231140266525679f
+#define SDM_ERFF4_2_06   -0.0324591340420617488485277008302392706957527828f
+#define SDM_ERFF4_2_07   -0.0205943885488331791711970665266474471714543313f
+#define SDM_ERFF4_2_08    0.0079208906865255014554772269570592999495375181f
+#define SDM_ERFF4_2_09    0.0036744273281123333893101007014150883409965011f
+#define SDM_ERFF4_2_10   -0.0015459493690754127608506357908913858038162608f
+//#define SDM_ERFF4_2_11   -0.0005485671070180836650399266219057172124875094f
+
+
+
+  /***************************************************************
+   * REGION 3:
+   */
+#define SDM_ERFF4_3_START     0.25f
+#define SDM_ERFF4_3_OFF       0.5f
+#define SDM_ERFF4_3_TRUNC     2u
+
+#define SDM_ERFF4_3_00    0.5204998778130465376827466538919645287364515699f
+#define SDM_ERFF4_3_01    0.8787825789354447940937239548244578983625218956f
+#define SDM_ERFF4_3_02   -0.4393912894677223970468619774122289491812609947f
+#define SDM_ERFF4_3_03   -0.1464637631559074656822873258040763163937536583f
+#define SDM_ERFF4_3_04    0.1830797039448843321028591572550953954921920811f
+#define SDM_ERFF4_3_05    0.0073231881577953732841143662902038158196876832f
+#define SDM_ERFF4_3_06   -0.0500417857449350507747815029830594081011991688f
+#define SDM_ERFF4_3_07    0.0054052103069442040906558417856266259621504328f
+#define SDM_ERFF4_3_08    0.0100475885141180567975497704160236877764167320f
+#define SDM_ERFF4_3_09   -0.0021674118390300459951330548378744759122422210f
+#define SDM_ERFF4_3_10   -0.0015694967741624277200510981457278746801387524f
+//#define SDM_ERFF4_3_11    0.0004973489167651373192082360776274483020158863f
+
+
+
+  /***************************************************************
+   * REGION 4:
+   */
+#define SDM_ERFF4_4_START     0.77f
+#define SDM_ERFF4_4_OFF       1.0f
+#define SDM_ERFF4_4_TRUNC     1u
+
+#define SDM_ERFF4_4_00     0.8427007929497148693412206350826092590442f
+#define SDM_ERFF4_4_01     0.4151074974205947033402682494413373653605f
+#define SDM_ERFF4_4_02    -0.4151074974205947033402682494413373653605f
+#define SDM_ERFF4_4_03     0.1383691658068649011134227498137791217898f
+#define SDM_ERFF4_4_04     0.0691845829034324505567113749068895608946f
+#define SDM_ERFF4_4_05    -0.0691845829034324505567113749068895608946f
+#define SDM_ERFF4_4_06     0.0046123055268954967037807583271259707263f
+#define SDM_ERFF4_4_07     0.0151547181597994891695653487891281895293f
+#define SDM_ERFF4_4_08    -0.0047770307242846215860586425530947553951f
+#define SDM_ERFF4_4_09    -0.0018851883701199847638468972527538689873f
+#define SDM_ERFF4_4_10     0.0012262875805634852347353603488787303121f
+//#define SDM_ERFF4_4_11     0.0000855239913717274641321540324726821411f
+
+
+
+  /***************************************************************
+   * REGION 5:
+   */
+#define SDM_ERFF4_5_START     1.36f
+#define SDM_ERFF4_5_OFF       1.875f
+#define SDM_ERFF4_5_TRUNC     1u
+
+#define SDM_ERFF4_5_00     0.99199005767011997029646305969122440092668f
+#define SDM_ERFF4_5_01     0.03354582842421607459425032786195496507386f
+#define SDM_ERFF4_5_02    -0.06289842829540513986421936474116555951979f
+#define SDM_ERFF4_5_03     0.06744109256118439996552409663913862770819f
+#define SDM_ERFF4_5_04    -0.04225988151097532834627238568547061029869f
+#define SDM_ERFF4_5_05     0.01146258336487617627004706027236136941544f
+#define SDM_ERFF4_5_06     0.00410518713321247739022655684589964019683f
+#define SDM_ERFF4_5_07    -0.00492839390823910723763257456562751425198f
+#define SDM_ERFF4_5_08     0.00143050168737012207687743571780226012058f
+#define SDM_ERFF4_5_09     0.00036225644575338665306295794978774160986f
+#define SDM_ERFF4_5_10    -0.00039015757824554169745459780322413823624f
+//#define SDM_ERFF4_5_11     0.00007372993782406230817649249567932577159f
+
+
+
+  /***************************************************************
+   * REGION 6:
+   */
+#define SDM_ERFF4_6_START     2.0f
+#define SDM_ERFF4_6_OFF       2.5f
+#define SDM_ERFF4_6_TRUNC     1u
+
+#define SDM_ERFF4_6_00    0.999593047982555041060435784260025087279f
+#define SDM_ERFF4_6_01    0.002178284230352709720386678564097264007f
+#define SDM_ERFF4_6_02   -0.005445710575881774300966696410243160031f
+#define SDM_ERFF4_6_03    0.008350089549685387261482267829039512051f
+#define SDM_ERFF4_6_04   -0.008622375078479475976530602649551670054f
+#define SDM_ERFF4_6_05    0.006117348213573859798085922300839816434f
+#define SDM_ERFF4_6_06   -0.002798490157050356237996774544152735014f
+#define SDM_ERFF4_6_07    0.000542410061327906884739143174194854432f
+#define SDM_ERFF4_6_08    0.000260670173895134533751630061303802055f
+#define SDM_ERFF4_6_09   -0.000250285386311056635227961206817778392f
+#define SDM_ERFF4_6_10    0.000078801328907504400502579703621546608f
+//#define SDM_ERFF4_6_11    5.137004620216358263402877651297096663210e-6f
+
+
+
+  /***************************************************************
+   * REGION 7:
+   */
+#define SDM_ERFF4_7_START     2.75f
+#define SDM_ERFF4_7_OFF       3.5f
+#define SDM_ERFF4_7_TRUNC     1u
+
+#define SDM_ERFF4_7_00    0.999999256901627658587254476316243904363263f
+#define SDM_ERFF4_7_01    5.399426777384782511586818937495781413007869e-6f
+#define SDM_ERFF4_7_02   -0.000018897993720846738790553866281235234945f
+#define SDM_ERFF4_7_03    0.000042295509756180796340763415010383621069f
+#define SDM_ERFF4_7_04   -0.000067717810833034147332818020841092925222f
+#define SDM_ERFF4_7_05    0.000082116282239393567363716204674415008991f
+#define SDM_ERFF4_7_06   -0.000077744246390483389302250766562526063763f
+#define SDM_ERFF4_7_07    0.000058192750619199206596604051163855823527f
+#define SDM_ERFF4_7_08   -0.000034259175422410008064403380504975403351f
+#define SDM_ERFF4_7_09    0.000015330768263696827211862952666453348031f
+#define SDM_ERFF4_7_10   -4.641017709492666503521243665632827470977627e-6f
+//#define SDM_ERFF4_7_11    4.447037356176705948450355327103423490366212e-7f
+
+
+
+
+
+  /***************************************************************
+   * Now we load the description of each partition.
+   */
+
+  /* Start point for each partition */
+  vec_float4 r1start = spu_splats(SDM_ERFF4_1_START);
+  vec_float4 r2start = spu_splats(SDM_ERFF4_2_START);
+  vec_float4 r3start = spu_splats(SDM_ERFF4_3_START);
+  vec_float4 r4start = spu_splats(SDM_ERFF4_4_START);
+  vec_float4 r5start = spu_splats(SDM_ERFF4_5_START);
+  vec_float4 r6start = spu_splats(SDM_ERFF4_6_START);
+  vec_float4 r7start = spu_splats(SDM_ERFF4_7_START);
+
+  /* X Offset for each partition */
+  vec_float4 xoffseta = (vec_float4) {SDM_ERFF4_0_OFF, SDM_ERFF4_1_OFF, SDM_ERFF4_2_OFF, SDM_ERFF4_3_OFF};
+  vec_float4 xoffsetb = (vec_float4) {SDM_ERFF4_4_OFF, SDM_ERFF4_5_OFF, SDM_ERFF4_6_OFF, SDM_ERFF4_7_OFF};
+
+  /* Truncation Correction for each partition */
+  vec_uint4 tcorra = (vec_uint4) {SDM_ERFF4_0_TRUNC, SDM_ERFF4_1_TRUNC, SDM_ERFF4_2_TRUNC, SDM_ERFF4_3_TRUNC};
+  vec_uint4 tcorrb = (vec_uint4) {SDM_ERFF4_4_TRUNC, SDM_ERFF4_5_TRUNC, SDM_ERFF4_6_TRUNC, SDM_ERFF4_7_TRUNC};
+
+  /* The coefficients for each partition */
+  vec_float4 c00a = (vec_float4) {SDM_ERFF4_0_00, SDM_ERFF4_1_00, SDM_ERFF4_2_00, SDM_ERFF4_3_00};
+  vec_float4 c01a = (vec_float4) {SDM_ERFF4_0_01, SDM_ERFF4_1_01, SDM_ERFF4_2_01, SDM_ERFF4_3_01};
+  vec_float4 c02a = (vec_float4) {SDM_ERFF4_0_02, SDM_ERFF4_1_02, SDM_ERFF4_2_02, SDM_ERFF4_3_02};
+  vec_float4 c03a = (vec_float4) {SDM_ERFF4_0_03, SDM_ERFF4_1_03, SDM_ERFF4_2_03, SDM_ERFF4_3_03};
+  vec_float4 c04a = (vec_float4) {SDM_ERFF4_0_04, SDM_ERFF4_1_04, SDM_ERFF4_2_04, SDM_ERFF4_3_04};
+  vec_float4 c05a = (vec_float4) {SDM_ERFF4_0_05, SDM_ERFF4_1_05, SDM_ERFF4_2_05, SDM_ERFF4_3_05};
+  vec_float4 c06a = (vec_float4) {SDM_ERFF4_0_06, SDM_ERFF4_1_06, SDM_ERFF4_2_06, SDM_ERFF4_3_06};
+  vec_float4 c07a = (vec_float4) {SDM_ERFF4_0_07, SDM_ERFF4_1_07, SDM_ERFF4_2_07, SDM_ERFF4_3_07};
+  vec_float4 c08a = (vec_float4) {SDM_ERFF4_0_08, SDM_ERFF4_1_08, SDM_ERFF4_2_08, SDM_ERFF4_3_08};
+  vec_float4 c09a = (vec_float4) {SDM_ERFF4_0_09, SDM_ERFF4_1_09, SDM_ERFF4_2_09, SDM_ERFF4_3_09};
+  vec_float4 c10a = (vec_float4) {SDM_ERFF4_0_10, SDM_ERFF4_1_10, SDM_ERFF4_2_10, SDM_ERFF4_3_10};
+
+  vec_float4 c00b = (vec_float4) {SDM_ERFF4_4_00, SDM_ERFF4_5_00, SDM_ERFF4_6_00, SDM_ERFF4_7_00};
+  vec_float4 c01b = (vec_float4) {SDM_ERFF4_4_01, SDM_ERFF4_5_01, SDM_ERFF4_6_01, SDM_ERFF4_7_01};
+  vec_float4 c02b = (vec_float4) {SDM_ERFF4_4_02, SDM_ERFF4_5_02, SDM_ERFF4_6_02, SDM_ERFF4_7_02};
+  vec_float4 c03b = (vec_float4) {SDM_ERFF4_4_03, SDM_ERFF4_5_03, SDM_ERFF4_6_03, SDM_ERFF4_7_03};
+  vec_float4 c04b = (vec_float4) {SDM_ERFF4_4_04, SDM_ERFF4_5_04, SDM_ERFF4_6_04, SDM_ERFF4_7_04};
+  vec_float4 c05b = (vec_float4) {SDM_ERFF4_4_05, SDM_ERFF4_5_05, SDM_ERFF4_6_05, SDM_ERFF4_7_05};
+  vec_float4 c06b = (vec_float4) {SDM_ERFF4_4_06, SDM_ERFF4_5_06, SDM_ERFF4_6_06, SDM_ERFF4_7_06};
+  vec_float4 c07b = (vec_float4) {SDM_ERFF4_4_07, SDM_ERFF4_5_07, SDM_ERFF4_6_07, SDM_ERFF4_7_07};
+  vec_float4 c08b = (vec_float4) {SDM_ERFF4_4_08, SDM_ERFF4_5_08, SDM_ERFF4_6_08, SDM_ERFF4_7_08};
+  vec_float4 c09b = (vec_float4) {SDM_ERFF4_4_09, SDM_ERFF4_5_09, SDM_ERFF4_6_09, SDM_ERFF4_7_09};
+  vec_float4 c10b = (vec_float4) {SDM_ERFF4_4_10, SDM_ERFF4_5_10, SDM_ERFF4_6_10, SDM_ERFF4_7_10};
+
+
+  vec_uchar16 shuffle0 = (vec_uchar16) spu_splats(0x00010203);
+  vec_uchar16 shuffle1 = (vec_uchar16) spu_splats(0x04050607);
+  vec_uchar16 shuffle2 = (vec_uchar16) spu_splats(0x08090A0B);
+  vec_uchar16 shuffle3 = (vec_uchar16) spu_splats(0x0C0D0E0F);
+  vec_uchar16 shuffle4 = (vec_uchar16) spu_splats(0x10111213);
+  vec_uchar16 shuffle5 = (vec_uchar16) spu_splats(0x14151617);
+  vec_uchar16 shuffle6 = (vec_uchar16) spu_splats(0x18191A1B);
+  vec_uchar16 shuffle7 = (vec_uchar16) spu_splats(0x1C1D1E1F);
+
 
   /*
-   * Select the appropriate approximation.
+   * Determine the shuffle pattern based on which partition
+   * each element of x is in.
    */
-  result = spu_sel(tresult, presult, spu_cmpgt(xabs, approx_point));
+
+  vec_uchar16 gt_r1start = (vec_uchar16)spu_cmpabsgt(x, r1start);
+  vec_uchar16 gt_r2start = (vec_uchar16)spu_cmpabsgt(x, r2start);
+  vec_uchar16 gt_r3start = (vec_uchar16)spu_cmpabsgt(x, r3start);
+  vec_uchar16 gt_r4start = (vec_uchar16)spu_cmpabsgt(x, r4start);
+  vec_uchar16 gt_r5start = (vec_uchar16)spu_cmpabsgt(x, r5start);
+  vec_uchar16 gt_r6start = (vec_uchar16)spu_cmpabsgt(x, r6start);
+  vec_uchar16 gt_r7start = (vec_uchar16)spu_cmpabsgt(x, r7start);
+
+  vec_uchar16 shufflepattern;
+  shufflepattern = spu_sel(shuffle0, shuffle1, gt_r1start);
+  shufflepattern = spu_sel(shufflepattern, shuffle2, gt_r2start);
+  shufflepattern = spu_sel(shufflepattern, shuffle3, gt_r3start);
+  shufflepattern = spu_sel(shufflepattern, shuffle4, gt_r4start);
+  shufflepattern = spu_sel(shufflepattern, shuffle5, gt_r5start);
+  shufflepattern = spu_sel(shufflepattern, shuffle6, gt_r6start);
+  shufflepattern = spu_sel(shufflepattern, shuffle7, gt_r7start);
+
+
+
+  /* Use the shuffle pattern to select the coefficients */
+
+  vec_float4 coeff_10 = spu_shuffle(c10a, c10b, shufflepattern);
+  vec_float4 coeff_09 = spu_shuffle(c09a, c09b, shufflepattern);
+  vec_float4 coeff_08 = spu_shuffle(c08a, c08b, shufflepattern);
+  vec_float4 coeff_07 = spu_shuffle(c07a, c07b, shufflepattern);
+  vec_float4 coeff_06 = spu_shuffle(c06a, c06b, shufflepattern);
+  vec_float4 coeff_05 = spu_shuffle(c05a, c05b, shufflepattern);
+  vec_float4 coeff_04 = spu_shuffle(c04a, c04b, shufflepattern);
+  vec_float4 coeff_03 = spu_shuffle(c03a, c03b, shufflepattern);
+  vec_float4 coeff_02 = spu_shuffle(c02a, c02b, shufflepattern);
+  vec_float4 coeff_01 = spu_shuffle(c01a, c01b, shufflepattern);
+  vec_float4 coeff_00 = spu_shuffle(c00a, c00b, shufflepattern);
+
+  vec_float4 xoffset     = spu_shuffle(xoffseta, xoffsetb, shufflepattern);
+  vec_uint4  tcorrection = spu_shuffle(tcorra,   tcorrb,   shufflepattern);
+
 
   /*
-   * Special cases/errors.
+   * We've completed the coeff. setup. Now we actually do the
+   * approximation below.
    */
 
-  /* x = +/- infinite, return +/-1 */
-  isinf = spu_cmpeq((vec_uint4)xabs, 0x7F800000);
-  result = spu_sel(result, onef, isinf);
+  /* Adjust x value here (for approximations about a point) */
+  vec_float4 xappr = spu_sub(xabs, xoffset);
+
+
+  /* Now we do the multiplies.
+   * Use Horner's method.
+   */
+  result = coeff_10;
+  result = spu_madd(xappr, result, coeff_09);
+  result = spu_madd(xappr, result, coeff_08);
+  result = spu_madd(xappr, result, coeff_07);
+  result = spu_madd(xappr, result, coeff_06);
+  result = spu_madd(xappr, result, coeff_05);
+  result = spu_madd(xappr, result, coeff_04);
+  result = spu_madd(xappr, result, coeff_03);
+  result = spu_madd(xappr, result, coeff_02);
+  result = spu_madd(xappr, result, coeff_01);
+  result = spu_madd(xappr, result, coeff_00);
+
+
+  /* Adjust due to systematic truncation. Note that the correction
+   * value is always non-negative, so the result is cast as uint
+   * to do the adjustment.
+   */
+  result = (vec_float4)spu_add((vec_uint4)result, tcorrection);
+
 
   /*
-   * Preserve sign in result, since erf(-x) = -erf(x)
+   * Special Cases
    */
+
+  /* Erf(0) = 0 */
+  result = spu_sel(result, zerof, spu_cmpeq(xabs, zerof));
+
+  /* Erf(infinity) = 1 */
+  result = spu_sel(result, onef, spu_cmpgt(xabs, clamp));
+
+
+  /* Preserve sign in result, since erf(-x) = -erf(x) */
   result = spu_or(result, xsign);
 
   return result;
Index: src/newlib/libm/machine/spu/headers/expd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/expd2.h
+++ src/newlib/libm/machine/spu/headers/expd2.h
@@ -56,7 +56,6 @@
 
 #include <spu_intrinsics.h>
 #include "floord2.h"
-#include "ldexpd2.h"
 
 #define LOG2E 1.4426950408889634073599     // 1/log(2) 
 
@@ -101,15 +100,12 @@
 
 static __inline vector double _expd2(vector double x)
 {
-  vec_uchar16 even2odd = ((vec_uchar16){0x80, 0x80, 0x80, 0x80, 0, 1, 2, 3,
-                                        0x80, 0x80, 0x80, 0x80, 8, 9, 10, 11});
-
   //  log(2) in extended machine representable precision
   vec_double2 ln2_hi = spu_splats(6.9314575195312500E-1);  // 3FE62E4000000000
   vec_double2 ln2_lo = spu_splats(1.4286068203094172E-6);  // 3EB7F7D1CF79ABCA
 
-
   //  coefficients for the power series
+  // vec_double2 f01 = spu_splats(1.00000000000000000000E0);  // 1/(1!)
   vec_double2 f02 = spu_splats(5.00000000000000000000E-1); // 1/(2!)
   vec_double2 f03 = spu_splats(1.66666666666666666667E-1); // 1/(3!)
   vec_double2 f04 = spu_splats(4.16666666666666666667E-2); // 1/(4!)
@@ -126,19 +122,42 @@ static __inline vector double _expd2(vec
   vec_double2 rx = _floord2(spu_madd(x,spu_splats(LOG2E),spu_splats(0.5)));
 
   // extract the exponent of reduction
-  vec_int4 nint = spu_convts(spu_roundtf(rx),0);
-  vec_llong2 n  = spu_extend(spu_shuffle(nint, nint, even2odd));
+  vec_int4 exp = spu_convts(spu_roundtf(rx),0);
 
   // reduce the input to within [ -ln(2)/2 ... ln(2)/2 ]
   vec_double2 r;
   r = spu_nmsub(rx,ln2_hi,x);
   r = spu_nmsub(rx,ln2_lo,r);
 
-
   vec_double2 result;
   vec_double2 r2 = spu_mul(r,r);
 
   //  Use Horner's method on the power series
+  /*  result = ((((c12*x + c11)*x + c10)*x + c9)*x + c8)*x + c7)*x + c6)*x^6 +
+              ((((((c5*x + c4)*x + c3)*x + c2)*x + c1)*x + c0
+  */
+
+#ifdef __SPU_EDP__
+  vec_double2 p1, p2, r4, r6;
+
+  p1 = spu_madd(f12, r, f11);
+  p2 = spu_madd(f05, r, f04);
+  r4 = spu_mul(r2, r2);
+  p1 = spu_madd(p1, r, f10);
+  p2 = spu_madd(p2, r, f03);
+  p1 = spu_madd(p1, r, f09);
+  p2 = spu_madd(p2, r, f02);
+  p1 = spu_madd(p1, r, f08);
+  r6 = spu_mul(r2, r4);
+  p1 = spu_madd(p1, r, f07);
+  p2 = spu_madd(p2, r2, r);
+  p1 = spu_madd(p1, r, f06);
+
+  result = spu_madd(r6, p1, p2);
+  result = spu_add(result, spu_splats(1.0));
+
+#else
+
   result = spu_madd(r,f12,f11);
   result = spu_madd(result,r,f10);
   result = spu_madd(result,r,f09);
@@ -152,14 +171,38 @@ static __inline vector double _expd2(vec
   result = spu_madd(result,r2,r);
   result = spu_add(result,spu_splats(1.0));
 
-  //  Scale the result
-  result = _ldexpd2(result, n);
+#endif  /* __SPU_EDP__ */
 
-  return result;
 
+  //  Scale the result - basically a call to ldexpd2()
+  vec_int4 e1, e2;
+  vec_int4 min = spu_splats(-2044);
+  vec_int4 max = spu_splats(2046);
+  vec_uint4 cmp_min, cmp_max;
+  vec_uint4 shift = (vec_uint4) { 20, 32, 20, 32 };
+  vec_double2 f1, f2;
+
+  /* Clamp the specified exponent to the range -2044 to 2046.
+   */
+  cmp_min = spu_cmpgt(exp, min);
+  cmp_max = spu_cmpgt(exp, max);
+  exp = spu_sel(min, exp, cmp_min);
+  exp = spu_sel(exp, max, cmp_max);
+
+  /* Generate the factors f1 = 2^e1 and f2 = 2^e2
+   */
+  e1 = spu_rlmaska(exp, -1);
+  e2 = spu_sub(exp, e1);
+
+  f1 = (vec_double2)spu_sl(spu_add(e1, 1023), shift);
+  f2 = (vec_double2)spu_sl(spu_add(e2, 1023), shift);
+
+  /* Compute the product x * 2^e1 * 2^e2
+   */
+  result = spu_mul(spu_mul(result, f1), f2);
 
+  return result;
 }
 
-
 #endif /* _EXPD2_H_ */
 #endif /* __SPU__ */
Index: src/newlib/libm/machine/spu/headers/ldexpd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/ldexpd2.h
+++ src/newlib/libm/machine/spu/headers/ldexpd2.h
@@ -70,10 +70,7 @@ static __inline vector double _ldexpd2(v
 {
   vec_uchar16 odd_to_even = ((vec_uchar16) { 4,5,6,7,     0x80,0x80,0x80,0x80, 
                                              12,13,14,15, 0x80,0x80,0x80,0x80 });
-  vec_uchar16 dup_even = ((vec_uchar16) { 0,1,2,3,    0,1,2,3,
-                                          8,9,10,11,  8,9,10,11});
   vec_int4 exp;
-  vec_uint4 exphi;
   vec_int4 e1, e2;
   vec_int4 min = spu_splats(-2044);
   vec_int4 max = spu_splats(2046);
@@ -84,8 +81,6 @@ static __inline vector double _ldexpd2(v
 
   exp = (vec_int4)spu_shuffle(llexp, llexp, odd_to_even);
 
-  exphi = (vec_uint4)spu_shuffle(llexp, llexp, dup_even);
-
   /* Clamp the specified exponent to the range -2044 to 2046.
    */
 
Index: src/newlib/libm/machine/spu/headers/lgammaf4.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/lgammaf4.h
+++ src/newlib/libm/machine/spu/headers/lgammaf4.h
@@ -53,11 +53,13 @@
 #define _LGAMMAF4_H_	1
 
 #include <spu_intrinsics.h>
-#include "lgammad2.h"
-#include "recipf4.h"
+
 #include "logf4.h"
-#include "sinf4.h"
+#include "divf4.h"
+#include "recipf4.h"
 #include "truncf4.h"
+#include "sinf4.h"
+
 
 /*
  * FUNCTION
@@ -68,129 +70,423 @@
  *	function for the corresponding elements of the input vector.
  *
  * C99 Special Cases:
- *	lgamma(0) returns +infinite 
+ *	lgamma(0) returns +infinity
  *	lgamma(1) returns +0
  *	lgamma(2) returns +0
- *	lgamma(negative integer) returns +infinite 
- *	lgamma(+infinite) returns +infinite
- *	lgamma(-infinite) returns +infinite
+ *	lgamma(negative integer) returns +infinity
+ *	lgamma(+infinity) returns +infinity
+ *	lgamma(-infinity) returns +infinity
  *
  * Other Cases:
  *  lgamma(Nan) returns Nan
- *  lgamma(Denorm) treated as lgamma(0) and returns +infinite
+ *  lgamma(Denorm) treated as lgamma(0) and returns +infinity
  *
  */
 
-
 static __inline vector float _lgammaf4(vector float x) 
 {
+  vec_float4 result;
+  vec_float4 halflog2pi = spu_splats(9.189385332046727417803297364056E-1f);
+  vec_float4 logpi      = spu_splats(1.1447298858494001741434273513530587116472948129153f);
   vec_float4 inff       = (vec_float4)spu_splats(0x7F800000);
   vec_float4 zerof      = spu_splats(0.0f);
-  vec_float4 pi         = spu_splats((float)PI);
+  vec_float4 onef       = spu_splats(1.0f);
+  vec_float4 twof       = spu_splats(2.0f);
   vec_float4 sign_maskf = spu_splats(-0.0f);
+  vec_float4 pi         = spu_splats(3.14159265358979323846264338328f);
 
-  vector unsigned int gt0;
-
-  /* This is where we switch from near zero approx. */
-  vec_float4 mac_switch = spu_splats(0.16f);
-  vec_float4 shift_switch = spu_splats(6.0f);
-
-  vec_float4 inv_x, inv_xsqu;                  
-  vec_float4 xtrunc, xstirling;
-  vec_float4 sum, xabs;
-  vec_uint4  isnaninf, isshifted;
-  vec_float4 result, stresult, shresult, mresult, nresult;
 
+  /*
+   * Unfortunately, some of the approximation methods for lgamma require
+   * other basic math computations. Get those out of the way now. The
+   * compiler seems to good a good job of scheduling this code with
+   * the code that follows.
+   */
+  vec_uint4  gt0      = spu_cmpgt(x, zerof);
+  vec_float4 xabs     = spu_andc(x, sign_maskf);
+  vec_float4 ln_x     = _logf4(xabs);
+  vec_float4 inv_x    = _recipf4(xabs);
+  vec_float4 xtrunc   = _truncf4(x);
+  vec_float4 inv_xsqu = spu_mul(inv_x, inv_x);
+  vec_uint4 isnaninf  = spu_cmpgt((vec_uint4)xabs, 0x7F7FFFFF);
+  vec_uint4 ret_zero  = spu_or(spu_cmpeq(x, onef), spu_cmpeq(x, twof));
 
-  /* Force Denorms to 0 */
-  x = spu_add(x, zerof);
 
-  xabs = spu_andc(x, sign_maskf);
+  /*
+   * First thing we do is setup the description of each partition.
+   * This consists of:
+   * - Start x of partition
+   * - Offset (used for evaluating power series expanded around a point)
+   * - Truncation adjustment.
+   * - Is approx method in region a rational approximation or just a polynomial
+   * - The coefficients used in the poly or rational approximation
+   */
 
-  gt0    = spu_cmpgt(x, zerof);
-  xtrunc = _truncf4(x);
 
-  /*
-   * For 0 < x <= 0.16.
-   * Approximation Near Zero
+  /***************************************************************
+   * REGION 0: Approximation Near 0 from Above
    *
    * Use Maclaurin Expansion of lgamma()
    *
    * lgamma(z) = -ln(z) - z * EulerMascheroni + Sum[(-1)^n * z^n * Zeta(n)/n]
    */
-  mresult = spu_madd(xabs, spu_splats((float)ZETA_06_DIV_06), spu_splats((float)ZETA_05_DIV_05));
-  mresult = spu_madd(xabs, mresult, spu_splats((float)ZETA_04_DIV_04));
-  mresult = spu_madd(xabs, mresult, spu_splats((float)ZETA_03_DIV_03));
-  mresult = spu_madd(xabs, mresult, spu_splats((float)ZETA_02_DIV_02));
-  mresult = spu_mul(xabs, spu_mul(xabs, mresult));
-  mresult = spu_sub(mresult, spu_add(_logf4(xabs), spu_mul(xabs, spu_splats((float)EULER_MASCHERONI))));
 
+#define SDM_LGF4_0_START     0.0f
+#define SDM_LGF4_0_OFF       0.0f
+#define SDM_LGF4_0_TRUNC     2u
+#define SDM_LGF4_0_RATIONAL  0x0u
+
+#define SDM_LGF4_0_00   0.0f
+#define SDM_LGF4_0_01  -0.5772156649015328606065121f
+#define SDM_LGF4_0_02   0.8224670334241132182362076f
+#define SDM_LGF4_0_03  -0.4006856343865314284665794f
+#define SDM_LGF4_0_04   0.2705808084277845478790009f
+#define SDM_LGF4_0_05  -0.2073855510286739852662731f
+#define SDM_LGF4_0_06   1.6955717699740818995241965496515E-1f
+#define SDM_LGF4_0_07  -1.4404989676884611811997107854997E-1f
+#define SDM_LGF4_0_08   1.2550966952474304242233565481358E-1f
+#define SDM_LGF4_0_09  -1.1133426586956469049087252991471E-1f
+#define SDM_LGF4_0_10   1.0009945751278180853371459589003E-1f
+#define SDM_LGF4_0_11  -9.0954017145829042232609298411497E-2f
 
-  /*
-   * For 0.16 < x <= 6.0, we are going to push value
-   * out to an area where Stirling's approximation is
-   * accurate. Let's use a constant of 6.
-   *
-   * Use the recurrence relation:
-   *    lgamma(x + 1) = ln(x) + lgamma(x)
-   * 
-   * Note that we shift x here, before Stirling's calculation,
-   * then after Stirling's, we adjust the result.
+
+
+  /***************************************************************
+   * REGION 1: Above 0 and Below 1
+   */
+#define SDM_LGF4_1_START     0.20f
+#define SDM_LGF4_1_OFF       0.0f
+#define SDM_LGF4_1_TRUNC     0u
+#define SDM_LGF4_1_RATIONAL  0xFFFFFFFFu
+
+/* Numerator */
+#define SDM_LGF4_1_06  5.5247592697706124892083167601451981186889952720891079f
+#define SDM_LGF4_1_07  188.42248906442882644741346270888237140890625699348872f
+#define SDM_LGF4_1_08  730.89115027907050579364152184942040244662318995470771f
+#define SDM_LGF4_1_09  -517.93391251349155395618464682404141737699116911423096f
+#define SDM_LGF4_1_10  -866.81293419754982917624255525168901081630973644141406f
+#define SDM_LGF4_1_11  459.90872804523394478152324135956113729930154636775805f
+
+/* Denominator */
+#define SDM_LGF4_1_00   1.0f
+#define SDM_LGF4_1_01   62.356015559548850893358835861387218304619374633480009f
+#define SDM_LGF4_1_02   553.64875642095755724931612658933597252336243693499682f
+#define SDM_LGF4_1_03   997.28805670393557265195865662557219661414263910835386f
+#define SDM_LGF4_1_04   257.10520661440946455560646958565998121417179154677712f
+#define SDM_LGF4_1_05   -15.398409585547124178878369413880017200739911288666830f
+
+
+
+  /***************************************************************
+   * REGION 2: Above 0 and Below 1
+   */
+#define SDM_LGF4_2_START     0.60f
+#define SDM_LGF4_2_OFF       0.69f
+#define SDM_LGF4_2_TRUNC     1u
+#define SDM_LGF4_2_RATIONAL  0x0u
+
+/* This is a power series expanson of LogGamma around 0.69 */
+#define SDM_LGF4_2_00   0.27321026793030387025442491383648273204234f
+#define SDM_LGF4_2_01  -1.24869016926209356266849815723905575347988f
+#define SDM_LGF4_2_02   1.44985879780363867173410158693003578927407f
+#define SDM_LGF4_2_03  -1.11686573274718166516744313082147691068190f
+#define SDM_LGF4_2_04   1.14079150485439143731395820215710950729505f
+#define SDM_LGF4_2_05  -1.29512166953091144888197173527810141620764f
+#define SDM_LGF4_2_06   1.55206382120790061136858894716459302629069f
+#define SDM_LGF4_2_07  -1.92227237154565289482911310272968704445560f
+#define SDM_LGF4_2_08   2.43478939488445894670349784581009987461638f
+#define SDM_LGF4_2_09  -3.13512449573283650741385084753752461908870f
+#define SDM_LGF4_2_10   4.08851456399492725127969680590409811177590f
+#define SDM_LGF4_2_11   5.38629680478093362448042704719642976375265f
+
+
+
+  /***************************************************************
+   * REGION 3: Around 1
+   */
+#define SDM_LGF4_3_START     0.74f
+#define SDM_LGF4_3_OFF       1.0f
+#define SDM_LGF4_3_TRUNC     2u
+#define SDM_LGF4_3_RATIONAL  0x0u
+
+#define SDM_LGF4_3_11   -0.90954017145829042232609298411497266951691494159836e-1f
+#define SDM_LGF4_3_10    0.10009945751278180853371459589003190170060195315645f
+#define SDM_LGF4_3_09   -0.11133426586956469049087252991471245116506731682165f
+#define SDM_LGF4_3_08    0.12550966952474304242233565481358155815737009883123f
+#define SDM_LGF4_3_07   -0.14404989676884611811997107854997096565712336579503f
+#define SDM_LGF4_3_06    0.16955717699740818995241965496515342131696958167214f
+#define SDM_LGF4_3_05   -0.20738555102867398526627309729140683361141618390038f
+#define SDM_LGF4_3_04    0.27058080842778454787900092413529197569368773797968f
+#define SDM_LGF4_3_03   -0.40068563438653142846657938717048333025499543078016f
+#define SDM_LGF4_3_02    0.82246703342411321823620758332301259460947495060340f
+#define SDM_LGF4_3_01   -0.57721566490153286060651209008240243104215933593992f
+#define SDM_LGF4_3_00    0.0f
+
+
+
+  /***************************************************************
+   * REGION 4:  Above 1 to Below 2
+   */
+
+#define SDM_LGF4_4_START     1.25f
+#define SDM_LGF4_4_OFF       1.4616321449683623412626595423257213284681962040064f
+#define SDM_LGF4_4_TRUNC     1u
+#define SDM_LGF4_4_RATIONAL  0x0u
+
+#define SDM_LGF4_4_00   -0.12148629053584960809551455717769158215135617313000f
+#define SDM_LGF4_4_01    0.0f
+#define SDM_LGF4_4_02    0.48383612272381058521372238085482537020562860838860f
+#define SDM_LGF4_4_03   -0.14758772299453070203095509395083641661852764909458f
+#define SDM_LGF4_4_04    0.064624940238912752656100346425238557063086033931734f
+#define SDM_LGF4_4_05   -0.032788541088481305500850258549331278505894787737970f
+#define SDM_LGF4_4_06    0.017970675115210394292863824811126161810628596070981f
+#define SDM_LGF4_4_07   -0.010314223036636387275160254800730296612070784399082f
+#define SDM_LGF4_4_08    0.0061005360205178884031365656884883648099463048507839f
+#define SDM_LGF4_4_09   -0.0036845696083163732546953776004972425913603137160767f
+#define SDM_LGF4_4_10    0.00225976482322181046596248251178293952686321035f
+#define SDM_LGF4_4_11   -0.00140225144590445083080002880374741201782467331f
+
+
+
+  /***************************************************************
+   * REGION 5: Around 2
+   */
+
+#define SDM_LGF4_5_START     1.50f
+#define SDM_LGF4_5_OFF       2.0f
+#define SDM_LGF4_5_TRUNC     1u
+#define SDM_LGF4_5_RATIONAL  0x0u
+
+#define SDM_LGF4_5_00    0.0f
+#define SDM_LGF4_5_01    0.42278433509846713939348790991759756895784066406008f
+#define SDM_LGF4_5_02    0.32246703342411321823620758332301259460947495060340f
+#define SDM_LGF4_5_03   -0.6735230105319809513324605383714999692166209744683e-1f
+#define SDM_LGF4_5_04    0.2058080842778454787900092413529197569368773797968e-1f
+#define SDM_LGF4_5_05   -0.738555102867398526627309729140683361141618390038e-2f
+#define SDM_LGF4_5_06    0.289051033074152328575298829848675465030291500547e-2f
+#define SDM_LGF4_5_07   -0.119275391170326097711393569282810851426622293789e-2f
+#define SDM_LGF4_5_08    0.50966952474304242233565481358155815737009883123e-3f
+#define SDM_LGF4_5_09   -0.22315475845357937976141880360134005395620571054e-3f
+#define SDM_LGF4_5_10    0.9945751278180853371459589003190170060195315645e-4f
+#define SDM_LGF4_5_11   -0.44926236738133141700207502406357860782403250745e-4f
+
+
+
+  /***************************************************************
+   * REGION 6: Above 2 to Below Stirlings
+   */
+
+#define SDM_LGF4_6_START     2.48f
+#define SDM_LGF4_6_OFF       0.0f
+#define SDM_LGF4_6_TRUNC     2u
+#define SDM_LGF4_6_RATIONAL  0xFFFFFFFFu
+
+/* Numerator */
+#define SDM_LGF4_6_06  2.8952045264375719070927153893062450394256201846894266f
+#define SDM_LGF4_6_07  0.9017557380149600532583460408941390566399250566546766f
+#define SDM_LGF4_6_08 -5.0120743649109868270726470406381462995568837028633266f
+#define SDM_LGF4_6_09  0.5723176665030477945174549923532715487712277062412760f
+#define SDM_LGF4_6_10  0.6107282478237180956153912232438073421489100296366786f
+#define SDM_LGF4_6_11  0.0312308625200519550078820867041868696010490562277303f
+
+/* Denominator */
+#define SDM_LGF4_6_00  1.0f
+#define SDM_LGF4_6_01  4.3592151369378598515798083402849838078885877442021500f
+#define SDM_LGF4_6_02  2.6245676641191702420707093818412405820501009602499853f
+#define SDM_LGF4_6_03  0.3438846837443412565179153619145215759074092780311669f
+#define SDM_LGF4_6_04  0.0078092905528158343621764949220712317164193605131159f
+#define SDM_LGF4_6_05  -0.000015217018272713076443927141674684568030697337620f
+
+
+
+  /***************************************************************
+   * REGION 7: Stirlings - Above 6.0
    *
    */
 
-  isshifted = spu_cmpgt(shift_switch, x);
-  xstirling = spu_sel(xabs, spu_add(xabs, spu_splats(6.0f)), isshifted);
-  inv_x    = _recipf4(xstirling);            
-  inv_xsqu = spu_mul(inv_x, inv_x);            
+#define SDM_LGF4_7_START     7.80f
+#define SDM_LGF4_7_OFF       0.0f
+#define SDM_LGF4_7_TRUNC     5u
+#define SDM_LGF4_7_RATIONAL  0x0u
+
+#define SDM_LGF4_7_00    8.3333333333333333333333333333333333333333333333333333333333333333333333E-2f
+#define SDM_LGF4_7_01   -2.7777777777777777777777777777777777777777777777777777777777777777777778E-3f
+#define SDM_LGF4_7_02    7.9365079365079365079365079365079365079365079365079365079365079365079365E-4f
+#define SDM_LGF4_7_03   -5.9523809523809523809523809523809523809523809523809523809523809523809524E-4f
+#define SDM_LGF4_7_04    8.4175084175084175084175084175084175084175084175084175084175084175084175E-4f
+#define SDM_LGF4_7_05   -1.9175269175269175269175269175269175269175269175269175269175269175269175E-3f
+#define SDM_LGF4_7_06    6.4102564102564102564102564102564102564102564102564102564102564102564103E-3f
+#define SDM_LGF4_7_07   0.0f
+#define SDM_LGF4_7_08   0.0f
+#define SDM_LGF4_7_09   0.0f
+#define SDM_LGF4_7_10   0.0f
+#define SDM_LGF4_7_11   0.0f
+
 
   /*
-   * For 6.0 < x < infinite
-   *
-   * Use Stirling's Series.
-   *
-   *              1                    1                1      1        1
-   * lgamma(x) = --- ln (2*pi) + (z - ---) ln(x) - x + --- - ----- + ------ ...
-   *              2                    2               12x   360x^3  1260x^5
-   *
-   *
+   * Now we load the description of each partition.
    */
-  sum = spu_madd(inv_xsqu, spu_splats((float)STIRLING_10), spu_splats((float)STIRLING_09));
-  sum = spu_madd(sum, inv_xsqu, spu_splats((float)STIRLING_08));
-  sum = spu_madd(sum, inv_xsqu, spu_splats((float)STIRLING_07));
-  sum = spu_madd(sum, inv_xsqu, spu_splats((float)STIRLING_06));
-  sum = spu_madd(sum, inv_xsqu, spu_splats((float)STIRLING_05));
-  sum = spu_madd(sum, inv_xsqu, spu_splats((float)STIRLING_04));
-  sum = spu_madd(sum, inv_xsqu, spu_splats((float)STIRLING_03));
-  sum = spu_madd(sum, inv_xsqu, spu_splats((float)STIRLING_02));
-  sum = spu_madd(sum, inv_xsqu, spu_splats((float)STIRLING_01));
-  sum = spu_mul(sum, inv_x);
 
-  stresult = spu_madd(spu_sub(xstirling, spu_splats(0.5f)), _logf4(xstirling), spu_splats((float)HALFLOG2PI));
-  stresult = spu_sub(stresult, xstirling);
-  stresult = spu_add(stresult, sum);
+  /* Start point for each partition */
+  vec_float4 r1start = spu_splats(SDM_LGF4_1_START);
+  vec_float4 r2start = spu_splats(SDM_LGF4_2_START);
+  vec_float4 r3start = spu_splats(SDM_LGF4_3_START);
+  vec_float4 r4start = spu_splats(SDM_LGF4_4_START);
+  vec_float4 r5start = spu_splats(SDM_LGF4_5_START);
+  vec_float4 r6start = spu_splats(SDM_LGF4_6_START);
+  vec_float4 r7start = spu_splats(SDM_LGF4_7_START);
+
+  /* X Offset for each partition */
+  vec_float4 xoffseta = (vec_float4) {SDM_LGF4_0_OFF, SDM_LGF4_1_OFF, SDM_LGF4_2_OFF, SDM_LGF4_3_OFF};
+  vec_float4 xoffsetb = (vec_float4) {SDM_LGF4_4_OFF, SDM_LGF4_5_OFF, SDM_LGF4_6_OFF, SDM_LGF4_7_OFF};
+
+  /* Truncation Correction for each partition */
+  vec_uint4 tcorra = (vec_uint4) {SDM_LGF4_0_TRUNC, SDM_LGF4_1_TRUNC, SDM_LGF4_2_TRUNC, SDM_LGF4_3_TRUNC};
+  vec_uint4 tcorrb = (vec_uint4) {SDM_LGF4_4_TRUNC, SDM_LGF4_5_TRUNC, SDM_LGF4_6_TRUNC, SDM_LGF4_7_TRUNC};
+
+  /* Is partition a Rational Approximation */
+  vec_uint4 israta = (vec_uint4) {SDM_LGF4_0_RATIONAL, SDM_LGF4_1_RATIONAL, SDM_LGF4_2_RATIONAL, SDM_LGF4_3_RATIONAL};
+  vec_uint4 isratb = (vec_uint4) {SDM_LGF4_4_RATIONAL, SDM_LGF4_5_RATIONAL, SDM_LGF4_6_RATIONAL, SDM_LGF4_7_RATIONAL};
+
+  /* The polynomial coefficients for all partitions */
+  vec_float4 c00a = (vec_float4) {SDM_LGF4_0_00, SDM_LGF4_1_00, SDM_LGF4_2_00, SDM_LGF4_3_00};
+  vec_float4 c01a = (vec_float4) {SDM_LGF4_0_01, SDM_LGF4_1_01, SDM_LGF4_2_01, SDM_LGF4_3_01};
+  vec_float4 c02a = (vec_float4) {SDM_LGF4_0_02, SDM_LGF4_1_02, SDM_LGF4_2_02, SDM_LGF4_3_02};
+  vec_float4 c03a = (vec_float4) {SDM_LGF4_0_03, SDM_LGF4_1_03, SDM_LGF4_2_03, SDM_LGF4_3_03};
+  vec_float4 c04a = (vec_float4) {SDM_LGF4_0_04, SDM_LGF4_1_04, SDM_LGF4_2_04, SDM_LGF4_3_04};
+  vec_float4 c05a = (vec_float4) {SDM_LGF4_0_05, SDM_LGF4_1_05, SDM_LGF4_2_05, SDM_LGF4_3_05};
+  vec_float4 c06a = (vec_float4) {SDM_LGF4_0_06, SDM_LGF4_1_06, SDM_LGF4_2_06, SDM_LGF4_3_06};
+  vec_float4 c07a = (vec_float4) {SDM_LGF4_0_07, SDM_LGF4_1_07, SDM_LGF4_2_07, SDM_LGF4_3_07};
+  vec_float4 c08a = (vec_float4) {SDM_LGF4_0_08, SDM_LGF4_1_08, SDM_LGF4_2_08, SDM_LGF4_3_08};
+  vec_float4 c09a = (vec_float4) {SDM_LGF4_0_09, SDM_LGF4_1_09, SDM_LGF4_2_09, SDM_LGF4_3_09};
+  vec_float4 c10a = (vec_float4) {SDM_LGF4_0_10, SDM_LGF4_1_10, SDM_LGF4_2_10, SDM_LGF4_3_10};
+  vec_float4 c11a = (vec_float4) {SDM_LGF4_0_11, SDM_LGF4_1_11, SDM_LGF4_2_11, SDM_LGF4_3_11};
+
+  vec_float4 c00b = (vec_float4) {SDM_LGF4_4_00, SDM_LGF4_5_00, SDM_LGF4_6_00, SDM_LGF4_7_00};
+  vec_float4 c01b = (vec_float4) {SDM_LGF4_4_01, SDM_LGF4_5_01, SDM_LGF4_6_01, SDM_LGF4_7_01};
+  vec_float4 c02b = (vec_float4) {SDM_LGF4_4_02, SDM_LGF4_5_02, SDM_LGF4_6_02, SDM_LGF4_7_02};
+  vec_float4 c03b = (vec_float4) {SDM_LGF4_4_03, SDM_LGF4_5_03, SDM_LGF4_6_03, SDM_LGF4_7_03};
+  vec_float4 c04b = (vec_float4) {SDM_LGF4_4_04, SDM_LGF4_5_04, SDM_LGF4_6_04, SDM_LGF4_7_04};
+  vec_float4 c05b = (vec_float4) {SDM_LGF4_4_05, SDM_LGF4_5_05, SDM_LGF4_6_05, SDM_LGF4_7_05};
+  vec_float4 c06b = (vec_float4) {SDM_LGF4_4_06, SDM_LGF4_5_06, SDM_LGF4_6_06, SDM_LGF4_7_06};
+  vec_float4 c07b = (vec_float4) {SDM_LGF4_4_07, SDM_LGF4_5_07, SDM_LGF4_6_07, SDM_LGF4_7_07};
+  vec_float4 c08b = (vec_float4) {SDM_LGF4_4_08, SDM_LGF4_5_08, SDM_LGF4_6_08, SDM_LGF4_7_08};
+  vec_float4 c09b = (vec_float4) {SDM_LGF4_4_09, SDM_LGF4_5_09, SDM_LGF4_6_09, SDM_LGF4_7_09};
+  vec_float4 c10b = (vec_float4) {SDM_LGF4_4_10, SDM_LGF4_5_10, SDM_LGF4_6_10, SDM_LGF4_7_10};
+  vec_float4 c11b = (vec_float4) {SDM_LGF4_4_11, SDM_LGF4_5_11, SDM_LGF4_6_11, SDM_LGF4_7_11};
+
+
+  vec_uchar16 shuffle0 = (vec_uchar16) spu_splats(0x00010203);
+  vec_uchar16 shuffle1 = (vec_uchar16) spu_splats(0x04050607);
+  vec_uchar16 shuffle2 = (vec_uchar16) spu_splats(0x08090A0B);
+  vec_uchar16 shuffle3 = (vec_uchar16) spu_splats(0x0C0D0E0F);
+  vec_uchar16 shuffle4 = (vec_uchar16) spu_splats(0x10111213);
+  vec_uchar16 shuffle5 = (vec_uchar16) spu_splats(0x14151617);
+  vec_uchar16 shuffle6 = (vec_uchar16) spu_splats(0x18191A1B);
+  vec_uchar16 shuffle7 = (vec_uchar16) spu_splats(0x1C1D1E1F);
+
 
   /*
-   * Adjust result if we shifted x into Stirling range.
-   *
-   * lgamma(x) = lgamma(x + n) - ln(x(x+1)(x+2)...(x+n-1)
-   *
+   * Determine the shuffle pattern based on which partition
+   * each element of x is in.
    */
-  shresult = spu_mul(xabs, spu_add(xabs, spu_splats(1.0f)));
-  shresult = spu_mul(shresult, spu_add(xabs, spu_splats(2.0f)));
-  shresult = spu_mul(shresult, spu_add(xabs, spu_splats(3.0f)));
-  shresult = spu_mul(shresult, spu_add(xabs, spu_splats(4.0f)));
-  shresult = spu_mul(shresult, spu_add(xabs, spu_splats(5.0f)));
-  shresult = _logf4(shresult);
-  shresult = spu_sub(stresult, shresult);
-  stresult = spu_sel(stresult, shresult, isshifted);
+
+  vec_uchar16 gt_r1start = (vec_uchar16)spu_cmpgt(xabs, r1start);
+  vec_uchar16 gt_r2start = (vec_uchar16)spu_cmpgt(xabs, r2start);
+  vec_uchar16 gt_r3start = (vec_uchar16)spu_cmpgt(xabs, r3start);
+  vec_uchar16 gt_r4start = (vec_uchar16)spu_cmpgt(xabs, r4start);
+  vec_uchar16 gt_r5start = (vec_uchar16)spu_cmpgt(xabs, r5start);
+  vec_uchar16 gt_r6start = (vec_uchar16)spu_cmpgt(xabs, r6start);
+  vec_uchar16 gt_r7start = (vec_uchar16)spu_cmpgt(xabs, r7start);
+
+  vec_uchar16 shufflepattern;
+  shufflepattern = spu_sel(shuffle0, shuffle1, gt_r1start);
+  shufflepattern = spu_sel(shufflepattern, shuffle2, gt_r2start);
+  shufflepattern = spu_sel(shufflepattern, shuffle3, gt_r3start);
+  shufflepattern = spu_sel(shufflepattern, shuffle4, gt_r4start);
+  shufflepattern = spu_sel(shufflepattern, shuffle5, gt_r5start);
+  shufflepattern = spu_sel(shufflepattern, shuffle6, gt_r6start);
+  shufflepattern = spu_sel(shufflepattern, shuffle7, gt_r7start);
+
 
 
+  /* Use the shuffle pattern to select the coefficients */
+
+  vec_float4 coeff_00 = spu_shuffle(c00a, c00b, shufflepattern);
+  vec_float4 coeff_01 = spu_shuffle(c01a, c01b, shufflepattern);
+  vec_float4 coeff_02 = spu_shuffle(c02a, c02b, shufflepattern);
+  vec_float4 coeff_03 = spu_shuffle(c03a, c03b, shufflepattern);
+  vec_float4 coeff_04 = spu_shuffle(c04a, c04b, shufflepattern);
+  vec_float4 coeff_06 = spu_shuffle(c06a, c06b, shufflepattern);
+  vec_float4 coeff_07 = spu_shuffle(c07a, c07b, shufflepattern);
+  vec_float4 coeff_05 = spu_shuffle(c05a, c05b, shufflepattern);
+  vec_float4 coeff_08 = spu_shuffle(c08a, c08b, shufflepattern);
+  vec_float4 coeff_09 = spu_shuffle(c09a, c09b, shufflepattern);
+  vec_float4 coeff_10 = spu_shuffle(c10a, c10b, shufflepattern);
+  vec_float4 coeff_11 = spu_shuffle(c11a, c11b, shufflepattern);
+
+  vec_float4 xoffset     = spu_shuffle(xoffseta, xoffsetb, shufflepattern);
+  vec_uint4  tcorrection = spu_shuffle(tcorra,   tcorrb,   shufflepattern);
+  vec_uint4  isrational  = spu_shuffle(israta,   isratb,   shufflepattern);
+
   /*
-   * Select either Maclaurin or Stirling result before Negative X calc.
+   * We've completed the coeff. setup. Now we actually do the
+   * approximation below.
    */
-  vec_uint4 useStirlings = spu_cmpgt(xabs, mac_switch);
-  result = spu_sel(mresult, stresult, useStirlings);
+
+  /* Adjust x value here (for approximations about a point) */
+  vec_float4 xappr = spu_sub(xabs, xoffset);
+
+  /* If in Stirling partition, do some setup before the madds */
+  xappr = spu_sel(xappr, inv_xsqu, gt_r7start);
+
+
+
+  /* Now we do the multiplies - either a big polynomial or
+   * a rational approximation. Use Horner's method.
+   */
+  result = coeff_11;
+  result = spu_madd(xappr, result, coeff_10);
+  result = spu_madd(xappr, result, coeff_09);
+  result = spu_madd(xappr, result, coeff_08);
+  result = spu_madd(xappr, result, coeff_07);
+  result = spu_madd(xappr, result, coeff_06);
+
+  /* For rational approximations, we save numerator. */
+  vec_float4 resultn = result;
+
+  /* For rational appr,, reset result for calculation of denominator. */
+  result = spu_sel(result, spu_splats(0.0f), isrational);
+
+  result = spu_madd(xappr, result, coeff_05);
+  result = spu_madd(xappr, result, coeff_04);
+  result = spu_madd(xappr, result, coeff_03);
+  result = spu_madd(xappr, result, coeff_02);
+  result = spu_madd(xappr, result, coeff_01);
+  result = spu_madd(xappr, result, coeff_00);
+
+  /* Select either the polynomial or rational result */
+  result = spu_sel(result, _divf4(resultn, result), isrational);
+
+  /*
+   * Now we have to do a bit of additional calculations for
+   * partitions that weren't simple polynomial or rational
+   * approximations.
+   */
+
+  /* Finish the Near 0 formula */
+  result = spu_sel(spu_sub(result, ln_x), result, gt_r1start);
+
+  /* Finish Stirling's Approximation */
+  vec_float4 resultstirling = spu_madd(spu_sub(xabs, spu_splats(0.5f)), ln_x, halflog2pi);
+  resultstirling = spu_sub(resultstirling, xabs);
+  resultstirling = spu_add(spu_mul(result,inv_x), resultstirling);
+  result = spu_sel(result, resultstirling, gt_r7start);
+
+
+  /* Adjust due to systematic truncation */
+  result = (vec_float4)spu_add((vec_uint4)result, tcorrection);
+
 
   /*
    * Approximation for Negative X
@@ -202,29 +498,30 @@ static __inline vector float _lgammaf4(v
    * lgamma(x) = log(pi/(-x sin(pi x))) - lgamma(-x)
    *           
    */
-  nresult = spu_mul(x, _sinf4(spu_mul(x, pi)));
+  vec_float4 nresult = spu_mul(x, _sinf4(spu_mul(x, pi)));
   nresult = spu_andc(nresult, sign_maskf);
-  nresult = spu_sub(_logf4(pi), spu_add(result, _logf4(nresult)));
-
+  nresult = spu_sub(logpi, spu_add(result, _logf4(nresult)));
+  nresult = (vec_float4)spu_add((vec_uint4)nresult, spu_splats(1u));
 
-  /*
-   * Select between the negative or positive x approximations.
-   */
   result = spu_sel(nresult, result, gt0);
 
-  /*
-   * Finally, special cases/errors.
-   */
 
   /*
-   * x = non-positive integer, return infinity.
+   * Special Cases
    */
+
+  /* x = non-positive integer, return infinity */
+  vec_uint4 isnonposint = spu_andc(spu_cmpeq(x, xtrunc), gt0);
+  result = spu_sel(result, inff, spu_or(isnonposint, spu_cmpgt(x, spu_splats(4.2e36f))));
   result = spu_sel(result, inff, spu_andc(spu_cmpeq(x, xtrunc), gt0));
 
-  /* x = +/- infinite or nan, return |x| */
-  isnaninf = spu_cmpgt((vec_uint4)xabs, 0x7FEFFFFF);
+  /* Zeros of function */
+  result = spu_sel(result, zerof, ret_zero);
+
+  /* x = +/- infinity or nan, return |x| */
   result   = spu_sel(result, xabs, isnaninf);
 
+
   return result;
 }
 
Index: src/newlib/libm/machine/spu/headers/logbf4.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/logbf4.h
+++ src/newlib/libm/machine/spu/headers/logbf4.h
@@ -57,19 +57,18 @@
 
 /*
  * FUNCTION
- *	vector float _scalbnf4(vector float x, vector signed int exp)
+ *	vector float _logbf4(vector float x)
  *
  * DESCRIPTION
- *      The _scalbnf4 function returns a vector containing each element of x 
- *      multiplied by 2^n computed efficiently.  This function is computed 
- *      without the assistance of any floating point operations and as such 
- *      does not set any floating point exceptions.
+ *  The _logbf4 function returns a vector float that contains the exponent
+ *  of the corresponding elements of the input vector x. The exponent is
+ *  defined by:
+ *    x = frac * FLT_RADIX^exp, with frac in [1, FLT_RADIX).
  *
- *      Special Cases:
- *	  - if the exponent is 0, then x is either 0 or a subnormal, and the 
- *          result will be returned as 0.
- *        - if the result if underflows, it will be returned as 0.
- *        - if the result overflows, it will be returned as FLT_MAX.
+ *  Special Cases:
+ *    x = 0,  result is undefined.
+ *    x = NaN, result is NaN.
+ *    x = infinity, +infinity is returned.
  *
  */
 static __inline vector float _logbf4(vector float x)
Index: src/newlib/libm/machine/spu/headers/nextafterd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/nextafterd2.h
+++ src/newlib/libm/machine/spu/headers/nextafterd2.h
@@ -74,10 +74,13 @@
 static __inline vector double _nextafterd2(vector double x, vector double y)
 {
     vec_double2 n1ulp = (vec_double2)spu_splats(0x8000000000000001ull);
+    vector unsigned char mov_carry = {0x80,0x80,0x80, 7, 0x80,0x80,0x80,0x80,
+                                      0x80,0x80,0x80,15, 0x80,0x80,0x80,0x80};
     vec_double2 zerod = spu_splats(0.0);
     vec_llong2  one   = spu_splats(1ll);
     vec_ullong2 xlt0, xgty, xeqy, xeq0;
     vec_llong2  xllong;
+    vec_int4    carry;
     vec_llong2  delta, deltap1;
     vec_double2 result;
 
@@ -102,10 +105,18 @@ static __inline vector double _nextafter
 
     /* Determine value to add to x */
     delta = (vec_llong2)spu_xor(xgty, xlt0);
-    deltap1 = delta + one;
+
+    //deltap1 = delta + one;
+    carry = spu_genc((vec_int4)delta, (vec_int4)one);
+    carry = spu_shuffle(carry, carry, mov_carry);
+    deltap1 = (vec_llong2)spu_addx((vec_int4)delta, (vec_int4)one, (vec_int4)carry);
+
     delta = spu_sel(deltap1, delta, (vec_ullong2)delta);
 
-    xllong = xllong + delta;
+    //xllong = xllong + delta;
+    carry = spu_genc((vec_int4)xllong, (vec_int4)delta);
+    carry = spu_shuffle(carry, carry, mov_carry);
+    xllong = (vec_llong2)spu_addx((vec_int4)xllong, (vec_int4)delta, (vec_int4)carry);
 
     /* Fix the case of x = 0, and answer should be -1 ulp */
     result = spu_sel((vec_double2)xllong, n1ulp, spu_and((vec_ullong2)delta, xeq0));
Index: src/newlib/libm/machine/spu/headers/nextafterf4.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/nextafterf4.h
+++ src/newlib/libm/machine/spu/headers/nextafterf4.h
@@ -97,10 +97,10 @@ static __inline vector float _nextafterf
 
     /* Determine value to add to x */
     delta = (vec_int4)spu_xor(xgty, xlt0);
-    deltap1 = delta + one;
+    deltap1 = spu_add(delta,one);
     delta = spu_sel(deltap1, delta, (vec_uint4)delta);
 
-    xint = xint + delta;
+    xint = spu_add(xint, delta);
 
     /* Fix the case of x = 0, and answer should be -1 ulp */
     result = spu_sel((vec_float4)xint, n1ulp, spu_and((vec_uint4)delta, xeq0));
Index: src/newlib/libm/machine/spu/headers/recipd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/recipd2.h
+++ src/newlib/libm/machine/spu/headers/recipd2.h
@@ -80,9 +80,7 @@
  */ 
 static __inline vector double _recipd2(vector double value_d)
 {
-  vector unsigned long long zero     = (vector unsigned long long) { 0x0000000000000000ULL, 0x0000000000000000ULL };
   vector unsigned long long expmask  = (vector unsigned long long) { 0x7FF0000000000000ULL, 0x7FF0000000000000ULL };
-  vector unsigned long long signmask = (vector unsigned long long) { 0x8000000000000000ULL, 0x8000000000000000ULL };
   vector float  x0;
   vector float  value;
   vector float  two   = spu_splats(2.0f);
@@ -94,6 +92,7 @@ static __inline vector double _recipd2(v
    * point exponents that are out of single precision range.
    */
   bias = spu_xor(spu_and(value_d, (vector double)expmask), (vector double)expmask);
+
   value = spu_roundtf(spu_mul(value_d, bias));
   x0 = spu_re(value);
   x1 = spu_extend(spu_mul(x0, spu_nmsub(value, x0, two)));
@@ -102,10 +101,22 @@ static __inline vector double _recipd2(v
   x3 = spu_mul(x2, spu_nmsub(value_d, x2, two_d));
 
   /* Handle input = +/- infinity or +/-0. */
-  vec_double2 xabs = spu_andc(value_d, (vec_double2)signmask);
-  vec_ullong2 zeroinf = spu_or(spu_cmpeq(xabs, (vec_double2)expmask),
-                               spu_cmpeq(xabs, (vec_double2)zero));
-  x3 = spu_sel(x3, spu_xor(value_d, (vector double)expmask), zeroinf);
+
+#ifdef __SPU_EDP__
+  vec_ullong2 is0inf = spu_testsv(value_d, SPU_SV_NEG_ZERO     | SPU_SV_POS_ZERO |
+                                           SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY);
+#else
+  vec_double2 nzero = spu_splats(-0.0);
+  vec_double2 xabs = spu_andc(value_d, nzero);
+  vector unsigned char swap  = (vector unsigned char) {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
+  vec_uint4 isinf  = spu_cmpeq((vec_uint4)xabs, (vec_uint4)expmask);
+  vec_uint4 iszero = spu_cmpeq((vec_uint4)xabs, 0);
+  isinf  = spu_and(isinf,  spu_shuffle(isinf, isinf, swap));
+  iszero = spu_and(iszero, spu_shuffle(iszero, iszero, swap));
+  vec_ullong2 is0inf = (vec_ullong2)spu_or(isinf, iszero);
+#endif /* __SPU_EDP__ */
+
+  x3 = spu_sel(x3, spu_xor(value_d, (vector double)expmask), is0inf);
 
   return (x3);
 }
Index: src/newlib/libm/machine/spu/headers/simdmath.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/simdmath.h
+++ src/newlib/libm/machine/spu/headers/simdmath.h
@@ -212,9 +212,9 @@ vector double fmind2(vector double x, ve
 vector double fmodd2(vector double x, vector double y);
 vector float  fmodf4_fast(vector float x, vector float y);
 vector signed long long fpclassifyd2(vector double x);
-vector double frexpd2(vector double x, vector signed int *pexp);
+vector double frexpd2(vector double x, vector signed long long *pexp);
 vector double hypotd2(vector double x, vector double y);
-vector signed int ilogbd2(vector double x);
+vector signed long long ilogbd2(vector double x);
 vector unsigned long long is0denormd2(vector double x);
 vector unsigned long long isequald2(vector double x, vector double y);
 vector unsigned long long isfinited2(vector double x);
@@ -227,7 +227,7 @@ vector unsigned long long islessgreaterd
 vector unsigned long long isnand2(vector double x);
 vector unsigned long long isnormald2(vector double x);
 vector unsigned long long isunorderedd2(vector double x, vector double y);
-vector double ldexpd2(vector double x, vector signed int exp);
+vector double ldexpd2(vector double x, vector signed long long exp);
 vector signed long long llabsi2(vector signed long long x);
 lldivi2_t lldivi2(vector signed long long x, vector signed long long y);
 lldivu2_t lldivu2(vector unsigned long long x, vector unsigned long long y);
@@ -237,6 +237,7 @@ vector signed long long llroundd2(vector
 vector double log10d2(vector double x);
 vector double log1pd2(vector double x);
 vector double log2d2(vector double x);
+vector double logbd2(vector double x);
 vector double logd2(vector double x);
 vector double modfd2(vector double x, vector double* pint);
 vector double nearbyintd2(vector double x);
@@ -247,7 +248,7 @@ vector double powd2(vector double x, vec
 vector double recipd2(vector double value_d);
 vector float  recipf4_fast(vector float a);
 vector double remainderd2(vector double x, vector double y);
-vector double remquod2(vector double x, vector double y, vector signed int *quo);
+vector double remquod2(vector double x, vector double y, vector signed long long *quo);
 vector double rintd2(vector double x);
 vector double roundd2(vector double x);
 vector double rsqrtd2(vector double x);
Index: src/newlib/libm/machine/spu/headers/acoshd2.h
===================================================================
--- src.orig/newlib/libm/machine/spu/headers/acoshd2.h
+++ src/newlib/libm/machine/spu/headers/acoshd2.h
@@ -91,79 +91,78 @@
  * Taylor Series Coefficients 
  * for x around 1.
  */
-#define ACOSH_TAY01  1.0000000000000000000000000000000000000000000000000000000000000000000000E0  /* 1 / 1                            */
-#define ACOSH_TAY02 -8.3333333333333333333333333333333333333333333333333333333333333333333333E-2 /* 1 / 12                           */
-#define ACOSH_TAY03  1.8750000000000000000000000000000000000000000000000000000000000000000000E-2 /* 3 / 160                          */
-#define ACOSH_TAY04 -5.5803571428571428571428571428571428571428571428571428571428571428571429E-3 /* 5 / 896                          */
-#define ACOSH_TAY05  1.8988715277777777777777777777777777777777777777777777777777777777777778E-3 /* 35 / 18432                       */
-#define ACOSH_TAY06 -6.9912997159090909090909090909090909090909090909090909090909090909090909E-4 /* 63 / 90112                       */
-#define ACOSH_TAY07  2.7113694411057692307692307692307692307692307692307692307692307692307692E-4 /* 231 / 851968                     */
-#define ACOSH_TAY08 -1.0910034179687500000000000000000000000000000000000000000000000000000000E-4 /* 143 / 1310720                    */
-#define ACOSH_TAY09  4.5124222250545726102941176470588235294117647058823529411764705882352941E-5 /* 6435 / 142606336                 */
-#define ACOSH_TAY10 -1.9065643611707185444078947368421052631578947368421052631578947368421053E-5 /* 12155 / 637534208                */
-#define ACOSH_TAY11  8.1936873140789213634672619047619047619047619047619047619047619047619048E-6 /* 46189 / 5637144576               */
-#define ACOSH_TAY12 -3.5705692742181860882302989130434782608695652173913043478260869565217391E-6 /* 88179 / 24696061952              */
-#define ACOSH_TAY13  1.5740259550511837005615234375000000000000000000000000000000000000000000E-6 /* 676039 / 429496729600            */
-#define ACOSH_TAY14 -7.0068819224144573564882631655092592592592592592592592592592592592592593E-7 /* 1300075 / 1855425871872          */
-#define ACOSH_TAY15  3.1453306166503321507881427633351293103448275862068965517241379310344828E-7 /* 5014575 / 15942918602752         */
-#if 0
-#define ACOSH_TAY16 -1.4221629293564136230176494967552923387096774193548387096774193548387097E-7 /* 9694845 / 68169720922112         */
-#define ACOSH_TAY17  6.4711106776113328206437555226412686434659090909090909090909090909090909E-8 /* 100180065 / 1548112371908608     */
-#define ACOSH_TAY18 -2.9609409781171182528071637664522443498883928571428571428571428571428571E-8 /* 116680311 / 3940649673949184     */
-#define ACOSH_TAY19  1.3615438056281793767600509061201198680980785472972972972972972972972973E-8 /* 2268783825 / 166633186212708352  */
-#endif
+#define SDM_ACOSHD2_TAY01  1.000000000000000000000000000000000E0  /* 1 / 1                            */
+#define SDM_ACOSHD2_TAY02 -8.333333333333333333333333333333333E-2 /* 1 / 12                           */
+#define SDM_ACOSHD2_TAY03  1.875000000000000000000000000000000E-2 /* 3 / 160                          */
+#define SDM_ACOSHD2_TAY04 -5.580357142857142857142857142857142E-3 /* 5 / 896                          */
+#define SDM_ACOSHD2_TAY05  1.898871527777777777777777777777777E-3 /* 35 / 18432                       */
+#define SDM_ACOSHD2_TAY06 -6.991299715909090909090909090909090E-4 /* 63 / 90112                       */
+#define SDM_ACOSHD2_TAY07  2.711369441105769230769230769230769E-4 /* 231 / 851968                     */
+#define SDM_ACOSHD2_TAY08 -1.091003417968750000000000000000000E-4 /* 143 / 1310720                    */
+#define SDM_ACOSHD2_TAY09  4.512422225054572610294117647058823E-5 /* 6435 / 142606336                 */
+#define SDM_ACOSHD2_TAY10 -1.906564361170718544407894736842105E-5 /* 12155 / 637534208                */
+#define SDM_ACOSHD2_TAY11  8.193687314078921363467261904761904E-6 /* 46189 / 5637144576               */
+#define SDM_ACOSHD2_TAY12 -3.570569274218186088230298913043478E-6 /* 88179 / 24696061952              */
+#define SDM_ACOSHD2_TAY13  1.574025955051183700561523437500000E-6 /* 676039 / 429496729600            */
+#define SDM_ACOSHD2_TAY14 -7.006881922414457356488263165509259E-7 /* 1300075 / 1855425871872          */
+#define SDM_ACOSHD2_TAY15  3.145330616650332150788142763335129E-7 /* 5014575 / 15942918602752         */
 
 static __inline vector double _acoshd2(vector double x)
 {
     vec_uchar16 dup_even  = ((vec_uchar16) { 0,1,2,3,  0,1,2,3, 8,9,10,11, 8,9,10,11 });
     vec_double2 minus_oned = spu_splats(-1.0);
     vec_double2 twod       = spu_splats(2.0);
-    vec_double2 xminus1;
-    vec_float4  xf;
     /* Where we switch from taylor to formula */
     vec_float4  switch_approx = spu_splats(1.15f);
-    vec_uint4   use_form;
     vec_double2 result, fresult, mresult;;
 
     
-    xf = spu_roundtf(x);
+    vec_double2 xminus1 = spu_add(x, minus_oned);
+    vec_float4  xf = spu_roundtf(x);
     xf = spu_shuffle(xf, xf, dup_even);
 
+    vec_ullong2 use_form = (vec_ullong2)spu_cmpgt(xf, switch_approx);
+
+    vec_double2 sqrtargformula = spu_madd(x, x, minus_oned);
+    vec_double2 sqrtargtaylor  = spu_mul(xminus1, twod);
+    vec_double2 sqrtarg = spu_sel(sqrtargtaylor, sqrtargformula, use_form);
+
+    vec_double2 sqrtresult = _sqrtd2(sqrtarg);
+
     /*
      * Formula:
      *   acosh = ln(x + sqrt(x^2 - 1))
      */
-    fresult = _sqrtd2(spu_madd(x, x, minus_oned));
-    fresult = spu_add(x, fresult);
+    fresult = spu_add(x, sqrtresult);
     fresult = _logd2(fresult);
 
     /*
      * Taylor Series
      */
-    xminus1 = spu_add(x, minus_oned);
+    mresult = spu_splats(SDM_ACOSHD2_TAY15);
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY14));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY13));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY12));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY11));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY10));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY09));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY08));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY07));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY06));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY05));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY04));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY03));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY02));
+    mresult = spu_madd(xminus1, mresult, spu_splats(SDM_ACOSHD2_TAY01));
+
+
+    mresult = spu_mul(mresult, sqrtresult);
 
-    mresult = spu_madd(xminus1, spu_splats(ACOSH_TAY15), spu_splats(ACOSH_TAY14));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY13));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY12));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY11));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY10));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY09));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY08));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY07));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY06));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY05));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY04));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY03));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY02));
-    mresult = spu_madd(xminus1, mresult, spu_splats(ACOSH_TAY01));
-    
-    mresult = spu_mul(mresult, _sqrtd2(spu_mul(xminus1, twod)));
 
     /*
      * Select series or formula
      */
-    use_form = spu_cmpgt(xf, switch_approx);
-    result = spu_sel(mresult, fresult, (vec_ullong2)use_form);
+    result = spu_sel(mresult, fresult, use_form);
 
     return result;
 }



More information about the Newlib mailing list