sysdeps/alpha/fpu/e_sqrt.c

   1 /* Copyright (C) 1996, 1997 Free Software Foundation, Inc.
   2    Contributed by David Mosberger (davidm@cs.arizona.edu).
   3
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Library General Public License as
   8    published by the Free Software Foundation; either version 2 of the
   9    License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Library General Public License for more details.
  15
  16    You should have received a copy of the GNU Library General Public
  17    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  18    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 /*
  22  * We have three versions, depending on how exact we need the results.
  23  */
  24
  25 #if defined(_IEEE_FP) && defined(_IEEE_FP_INEXACT)
  26
  27 /* Most demanding: go to the original source.  */
  28 #include <libm-ieee754/e_sqrt.c>
  29
  30 #else
  31
  32 /* Careful with rearranging this without consulting the assembly below.  */
  33 const static struct sqrt_data_struct {
  34         unsigned long dn, up, half, almost_three_half;
  35         unsigned long one_and_a_half, two_to_minus_30, one, nan;
  36         const int T2[64];
  37 } sqrt_data = {
  38         0x3fefffffffffffff,     /* __dn = nextafter(1,-Inf) */
  39         0x3ff0000000000001,     /* __up = nextafter(1,+Inf) */
  40         0x3fe0000000000000,     /* half */
  41         0x3ff7ffffffc00000,     /* almost_three_half = 1.5-2^-30 */
  42         0x3ff8000000000000,     /* one_and_a_half */
  43         0x3e10000000000000,     /* two_to_minus_30 */
  44         0x3ff0000000000000,     /* one */
  45         0xffffffffffffffff,     /* nan */
  46
  47         { 0x1500, 0x2ef8, 0x4d67, 0x6b02, 0x87be, 0xa395, 0xbe7a, 0xd866,
  48         0xf14a, 0x1091b,0x11fcd,0x13552,0x14999,0x15c98,0x16e34,0x17e5f,
  49         0x18d03,0x19a01,0x1a545,0x1ae8a,0x1b5c4,0x1bb01,0x1bfde,0x1c28d,
  50         0x1c2de,0x1c0db,0x1ba73,0x1b11c,0x1a4b5,0x1953d,0x18266,0x16be0,
  51         0x1683e,0x179d8,0x18a4d,0x19992,0x1a789,0x1b445,0x1bf61,0x1c989,
  52         0x1d16d,0x1d77b,0x1dddf,0x1e2ad,0x1e5bf,0x1e6e8,0x1e654,0x1e3cd,
  53         0x1df2a,0x1d635,0x1cb16,0x1be2c,0x1ae4e,0x19bde,0x1868e,0x16e2e,
  54         0x1527f,0x1334a,0x11051,0xe951, 0xbe01, 0x8e0d, 0x5924, 0x1edd }
  55 };
  56
  57 #ifdef _IEEE_FP
  58 /*
  59  * This version is much faster than the standard one included above,
  60  * but it doesn't maintain the inexact flag.
  61  */
  62
  63 #define lobits(x) (((unsigned int *)&x)[0])
  64 #define hibits(x) (((unsigned int *)&x)[1])
  65
  66 static inline double initial_guess(double x, unsigned int k,
  67         const struct sqrt_data_struct * const ptr)
  68 {
  69         double ret = 0.0;
  70
  71         k = 0x5fe80000 - (k >> 1);
  72         k = k - ptr->T2[63&(k>>14)];
  73         hibits(ret) = k;
  74         return ret;
  75 }
  76
  77 /* up = nextafter(1,+Inf), dn = nextafter(1,-Inf) */
  78
  79 #define __half                  (ptr->half)
  80 #define __one_and_a_half        (ptr->one_and_a_half)
  81 #define __two_to_minus_30       (ptr->two_to_minus_30)
  82 #define __one                   (ptr->one)
  83 #define __up                    (ptr->up)
  84 #define __dn                    (ptr->dn)
  85 #define __Nan                   (ptr->nan)
  86
  87 #define Double(x) (*(double *)&x)
  88
  89 /* Multiply with chopping rounding.. */
  90 #define choppedmul(a,b,c) \
  91   __asm__("multc %1,%2,%0":"=&f" (c):"f" (a), "f" (b))
  92
  93 double
  94 __ieee754_sqrt(double x)
  95 {
  96   const struct sqrt_data_struct * const ptr = &sqrt_data;
  97   unsigned long k, bits;
  98   double y, z, zp, zn;
  99   double dn, up, low, high;
 100   double half, one_and_a_half, one, two_to_minus_30;
 101
 102   *(double *)&bits = x;
 103   k = bits;
 104
 105   /* Negative or NaN or Inf */
 106   if ((k >> 52) >= 0x7ff)
 107     goto special;
 108   y = initial_guess(x, k >> 32, ptr);
 109   half = Double(__half);
 110   one_and_a_half = Double(__one_and_a_half);
 111   y = y*(one_and_a_half - half*x*y*y);
 112   dn = Double(__dn);
 113   two_to_minus_30 = Double(__two_to_minus_30);
 114   y = y*((one_and_a_half - two_to_minus_30) - half*x*y*y);
 115   up = Double(__up);
 116   z = x*y;
 117   one = Double(__one);
 118   z = z + half*z*(one-z*y);
 119
 120   choppedmul(z,dn,zp);
 121   choppedmul(z,up,zn);
 122
 123   choppedmul(z,zp,low);
 124   low = low - x;
 125   choppedmul(z,zn,high);
 126   high = high - x;
 127
 128   /* I can't get gcc to use fcmov's.. */
 129   __asm__("fcmovge %2,%3,%0"
 130           :"=f" (z)
 131           :"0" (z), "f" (low), "f" (zp));
 132   __asm__("fcmovlt %2,%3,%0"
 133           :"=f" (z)
 134           :"0" (z), "f" (high), "f" (zn));
 135   return z;     /* Argh! gcc jumps to end here */
 136
 137 special:
 138   /* throw away sign bit */
 139   k <<= 1;
 140   /* -0 */
 141   if (!k)
 142     return x;
 143   /* special? */
 144   if ((k >> 53) == 0x7ff) {
 145     /* NaN? */
 146     if (k << 11)
 147       return x;
 148     /* sqrt(+Inf) = +Inf */
 149     if (x > 0)
 150       return x;
 151   }
 152
 153   x = Double(__Nan);
 154   return x;
 155 }
 156
 157 #else
/*
 * This version is much faster than generic sqrt implementation, but
 * it doesn't handle exceptional values or the inexact flag.
 *
 * It is written as a single top-level asm that defines the global
 * symbol __ieee754_sqrt directly.  Points worth knowing before touching
 * it:
 *
 *  - The symbols $DN, $UP, $HALF, $ALMOST_THREE_HALF, $NAN and $T2 are
 *    byte offsets into sqrt_data (member index times 8, with T2
 *    following the eight 8-byte members).  They must be kept in step
 *    with the C definition of struct sqrt_data_struct.
 *
 *  - A 16-byte stack frame provides two scratch slots: $K (offset 0)
 *    receives the argument via stt so its bits can be reloaded into an
 *    integer register with ldq, and $Y (offset 8) carries the integer
 *    initial guess back into a floating-point register (stq, then ldt).
 *
 *  - 1.0 and 1.5 are not loaded from the table; they are formed by
 *    adding the loaded 0.5 to itself and to the resulting 1.0.
 *
 *  - An argument that compares less than zero branches to $negative and
 *    returns the `nan' entry of the table.  No other special-case test
 *    is made.
 *
 *  - When PROF is defined, a call to _mcount is assembled in before the
 *    .prologue directive.
 *
 *  - NOTE(review): the annotation on the second `mult/su $f11, $f13, $f1'
 *    names $f11 as the destination, but the instruction writes $f1; $f11
 *    is only written by the instruction after it.
 */

asm ("\
  /* Define offsets into the structure defined in C above.  */
        $DN = 0*8
        $UP = 1*8
        $HALF = 2*8
        $ALMOST_THREE_HALF = 3*8
        $NAN = 7*8
        $T2 = 8*8

  /* Stack variables.  */
        $K = 0
        $Y = 8

        .text
        .align  3
        .globl  __ieee754_sqrt
        .ent    __ieee754_sqrt
__ieee754_sqrt:
        ldgp    $29, 0($27)
        subq    $sp, 16, $sp
        .frame  $sp, 16, $26, 0\n"
#ifdef PROF
"       lda     $28, _mcount
        jsr     $28, ($28), _mcount\n"
#endif
"       .prologue 1

        stt     $f16, $K($sp)
        lda     $4, sqrt_data                   # load base address into t3
        fblt    $f16, $negative

  /* Compute initial guess.  */

        .align 3

        ldah    $2, 0x5fe8                      # e0    :
        ldq     $3, $K($sp)                     # .. e1 :
        ldt     $f12, $HALF($4)                 # e0    :
        ldt     $f18, $ALMOST_THREE_HALF($4)    # .. e1 :
        srl     $3, 33, $1                      # e0    :
        mult    $f16, $f12, $f11                # .. fm : $f11 = x * 0.5
        subl    $2, $1, $2                      # e0    :
        addt    $f12, $f12, $f17                # .. fa : $f17 = 1.0
        srl     $2, 12, $1                      # e0    :
        and     $1, 0xfc, $1                    # .. e1 :
        addq    $1, $4, $1                      # e0    :
        ldl     $1, $T2($1)                     # .. e1 :
        addt    $f12, $f17, $f15                # fa    : $f15 = 1.5
        subl    $2, $1, $2                      # .. e1 :
        sll     $2, 32, $2                      # e0    :
        ldt     $f14, $DN($4)                   # .. e1 :
        stq     $2, $Y($sp)                     # e0    :
        nop                                     # .. e1 : avoid pipe flash
        nop                                     # e0    :
        ldt     $f13, $Y($sp)                   # .. e1 :

        mult/su $f11, $f13, $f10        # fm    : $f10 = (x * 0.5) * y
        mult    $f10, $f13, $f10        # fm    : $f10 = ((x * 0.5) * y) * y
        subt    $f15, $f10, $f1         # fa    : $f1 = (1.5 - 0.5*x*y*y)
        mult    $f13, $f1, $f13         # fm    : yp = y*(1.5 - 0.5*x*y*y)
        mult/su $f11, $f13, $f1         # fm    : $f11 = x * 0.5 * yp
        mult    $f1, $f13, $f11         # fm    : $f11 = (x * 0.5 * yp) * yp
        subt    $f18, $f11, $f1         # fa    : $f1= (1.5-2^-30) - 0.5*x*yp*yp
        mult    $f13, $f1, $f13         # fm    : ypp = $f13 = yp*$f1
        subt    $f15, $f12, $f1         # fa    : $f1 = (1.5 - 0.5)
        ldt     $f15, $UP($4)           # .. e1 :
        mult/su $f16, $f13, $f10        # fm    : z = $f10 = x * ypp
        mult    $f10, $f13, $f11        # fm    : $f11 = z*ypp
        mult    $f10, $f12, $f12        # fm    : $f12 = z*0.5
        subt    $f1, $f11, $f1          # .. fa : $f1 = 1 - z*ypp
        mult    $f12, $f1, $f12         # fm    : $f12 = z*0.5*(1 - z*ypp)
        addt    $f10, $f12, $f0         # fa    : zp=res=$f0= z + z*0.5*(1 - z*ypp)

        mult/c  $f0, $f14, $f12         # fm    : zmi = zp * DN
        mult/c  $f0, $f15, $f11         # fm    : zpl = zp * UP
        mult/c  $f0, $f12, $f1          # fm    : $f1 = zp * zmi
        mult/c  $f0, $f11, $f15         # fm    : $f15 = zp * zpl

        subt/su $f1, $f16, $f13         # fa    : y1 = zp*zmi - x
        subt/su $f15, $f16, $f14        # fa    : y2 = zp*zpl - x

        fcmovge $f13, $f12, $f0         # res = (y1 >= 0) ? zmi : res
        fcmovlt $f14, $f11, $f0         # res = (y2 <  0) ? zpl : res

        addq    $sp, 16, $sp            # e0    :
        ret                             # .. e1 :

$negative:
        ldt     $f0, $NAN($4)
        addq    $sp, 16, $sp
        ret

        .end    __ieee754_sqrt");
 256
 257 #endif /* _IEEE_FP */
 258 #endif /* _IEEE_FP && _IEEE_FP_INEXACT */