[glibc.git] / soft-fp / op-2.h

/* Software floating-point emulation.
   Basic two-word fraction declaration and manipulation.
   Copyright (C) 1997-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Richard Henderson (rth@cygnus.com),
		  Jakub Jelinek (jj@ultra.linux.cz),
		  David S. Miller (davem@redhat.com) and
		  Peter Maydell (pmaydell@chiark.greenend.org.uk).

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   In addition to the permissions in the GNU Lesser General Public
   License, the Free Software Foundation gives you unlimited
   permission to link the compiled version of this file into
   combinations with other programs, and to distribute those
   combinations without any restriction coming from the use of this
   file.  (The Lesser General Public License restrictions do apply in
   other respects; for example, they cover modification of the file,
   and distribution when not linked into a combine executable.)

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#define _FP_FRAC_DECL_2(X)	_FP_W_TYPE X##_f0, X##_f1
#define _FP_FRAC_COPY_2(D,S)	(D##_f0 = S##_f0, D##_f1 = S##_f1)
#define _FP_FRAC_SET_2(X,I)	__FP_FRAC_SET_2(X, I)
#define _FP_FRAC_HIGH_2(X)	(X##_f1)
#define _FP_FRAC_LOW_2(X)	(X##_f0)
#define _FP_FRAC_WORD_2(X,w)	(X##_f##w)

#define _FP_FRAC_SLL_2(X,N)						    \
(void)(((N) < _FP_W_TYPE_SIZE)						    \
       ? ({								    \
	    if (__builtin_constant_p(N) && (N) == 1)			    \
	      {								    \
		X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0);   \
		X##_f0 += X##_f0;					    \
	      }								    \
	    else							    \
	      {								    \
		X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
		X##_f0 <<= (N);						    \
	      }								    \
	    0;								    \
	  })								    \
       : ({								    \
	    X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);			    \
	    X##_f0 = 0;							    \
	  }))


#define _FP_FRAC_SRL_2(X,N)						\
(void)(((N) < _FP_W_TYPE_SIZE)						\
       ? ({								\
	    X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N));	\
	    X##_f1 >>= (N);						\
	  })								\
       : ({								\
	    X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);			\
	    X##_f1 = 0;							\
	  }))

/* Right shift with sticky-lsb.  */
#define _FP_FRAC_SRST_2(X,S, N,sz)					  \
(void)(((N) < _FP_W_TYPE_SIZE)						  \
       ? ({								  \
	    S = (__builtin_constant_p(N) && (N) == 1			  \
		 ? X##_f0 & 1						  \
		 : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0);		  \
	    X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
	    X##_f1 >>= (N);						  \
	  })								  \
       : ({								  \
	    S = ((((N) == _FP_W_TYPE_SIZE				  \
		   ? 0							  \
		   : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))		  \
		  | X##_f0) != 0);					  \
	    X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE));		  \
	    X##_f1 = 0;							  \
	  }))

#define _FP_FRAC_SRS_2(X,N,sz)						  \
(void)(((N) < _FP_W_TYPE_SIZE)						  \
       ? ({								  \
	    X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) | \
		      (__builtin_constant_p(N) && (N) == 1		  \
		       ? X##_f0 & 1					  \
		       : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0));	  \
	    X##_f1 >>= (N);						  \
	  })								  \
       : ({								  \
	    X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) |		  \
		      ((((N) == _FP_W_TYPE_SIZE				  \
			 ? 0						  \
			 : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))	  \
			| X##_f0) != 0));				  \
	    X##_f1 = 0;							  \
	  }))

#define _FP_FRAC_ADDI_2(X,I)	\
  __FP_FRAC_ADDI_2(X##_f1, X##_f0, I)

#define _FP_FRAC_ADD_2(R,X,Y)	\
  __FP_FRAC_ADD_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)

#define _FP_FRAC_SUB_2(R,X,Y)	\
  __FP_FRAC_SUB_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)

#define _FP_FRAC_DEC_2(X,Y)	\
  __FP_FRAC_DEC_2(X##_f1, X##_f0, Y##_f1, Y##_f0)

#define _FP_FRAC_CLZ_2(R,X)	\
  do {				\
    if (X##_f1)			\
      __FP_CLZ(R,X##_f1);	\
    else 			\
    {				\
      __FP_CLZ(R,X##_f0);	\
      R += _FP_W_TYPE_SIZE;	\
    }				\
  } while(0)

/* Predicates */
#define _FP_FRAC_NEGP_2(X)	((_FP_WS_TYPE)X##_f1 < 0)
#define _FP_FRAC_ZEROP_2(X)	((X##_f1 | X##_f0) == 0)
#define _FP_FRAC_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs)
#define _FP_FRAC_CLEAR_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs)
#define _FP_FRAC_HIGHBIT_DW_2(fs,X)	\
  (_FP_FRAC_HIGH_DW_##fs(X) & _FP_HIGHBIT_DW_##fs)
#define _FP_FRAC_EQ_2(X, Y)	(X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
#define _FP_FRAC_GT_2(X, Y)	\
  (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
#define _FP_FRAC_GE_2(X, Y)	\
  (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))

#define _FP_ZEROFRAC_2		0, 0
#define _FP_MINFRAC_2		0, 1
#define _FP_MAXFRAC_2		(~(_FP_WS_TYPE)0), (~(_FP_WS_TYPE)0)

/*
 * Internals
 */

#define __FP_FRAC_SET_2(X,I1,I0)	(X##_f0 = I0, X##_f1 = I1)

#define __FP_CLZ_2(R, xh, xl)	\
  do {				\
    if (xh)			\
      __FP_CLZ(R,xh);		\
    else 			\
    {				\
      __FP_CLZ(R,xl);		\
      R += _FP_W_TYPE_SIZE;	\
    }				\
  } while(0)

#if 0

#ifndef __FP_FRAC_ADDI_2
#define __FP_FRAC_ADDI_2(xh, xl, i)	\
  (xh += ((xl += i) < i))
#endif
#ifndef __FP_FRAC_ADD_2
#define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl)	\
  (rh = xh + yh + ((rl = xl + yl) < xl))
#endif
#ifndef __FP_FRAC_SUB_2
#define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl)	\
  (rh = xh - yh - ((rl = xl - yl) > xl))
#endif
#ifndef __FP_FRAC_DEC_2
#define __FP_FRAC_DEC_2(xh, xl, yh, yl)	\
  do {					\
    UWtype _t = xl;			\
    xh -= yh + ((xl -= yl) > _t);	\
  } while (0)
#endif

#else

#undef __FP_FRAC_ADDI_2
#define __FP_FRAC_ADDI_2(xh, xl, i)	add_ssaaaa(xh, xl, xh, xl, 0, i)
#undef __FP_FRAC_ADD_2
#define __FP_FRAC_ADD_2			add_ssaaaa
#undef __FP_FRAC_SUB_2
#define __FP_FRAC_SUB_2			sub_ddmmss
#undef __FP_FRAC_DEC_2
#define __FP_FRAC_DEC_2(xh, xl, yh, yl)	sub_ddmmss(xh, xl, xh, xl, yh, yl)

#endif

/*
 * Unpack the raw bits of a native fp value.  Do not classify or
 * normalize the data.
 */

#define _FP_UNPACK_RAW_2(fs, X, val)			\
  do {							\
    union _FP_UNION_##fs _flo; _flo.flt = (val);	\
							\
    X##_f0 = _flo.bits.frac0;				\
    X##_f1 = _flo.bits.frac1;				\
    X##_e  = _flo.bits.exp;				\
    X##_s  = _flo.bits.sign;				\
  } while (0)

#define _FP_UNPACK_RAW_2_P(fs, X, val)			\
  do {							\
    union _FP_UNION_##fs *_flo =			\
      (union _FP_UNION_##fs *)(val);			\
							\
    X##_f0 = _flo->bits.frac0;				\
    X##_f1 = _flo->bits.frac1;				\
    X##_e  = _flo->bits.exp;				\
    X##_s  = _flo->bits.sign;				\
  } while (0)


/*
 * Repack the raw bits of a native fp value.
 */

#define _FP_PACK_RAW_2(fs, val, X)			\
  do {							\
    union _FP_UNION_##fs _flo;				\
							\
    _flo.bits.frac0 = X##_f0;				\
    _flo.bits.frac1 = X##_f1;				\
    _flo.bits.exp   = X##_e;				\
    _flo.bits.sign  = X##_s;				\
							\
    (val) = _flo.flt;					\
  } while (0)

#define _FP_PACK_RAW_2_P(fs, val, X)			\
  do {							\
    union _FP_UNION_##fs *_flo =			\
      (union _FP_UNION_##fs *)(val);			\
							\
    _flo->bits.frac0 = X##_f0;				\
    _flo->bits.frac1 = X##_f1;				\
    _flo->bits.exp   = X##_e;				\
    _flo->bits.sign  = X##_s;				\
  } while (0)


/*
 * Multiplication algorithms:
 */

/* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */

#define _FP_MUL_MEAT_DW_2_wide(wfracbits, R, X, Y, doit)		\
  do {									\
    _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);				\
									\
    doit(_FP_FRAC_WORD_4(R,1), _FP_FRAC_WORD_4(R,0), X##_f0, Y##_f0);	\
    doit(_b_f1, _b_f0, X##_f0, Y##_f1);					\
    doit(_c_f1, _c_f0, X##_f1, Y##_f0);					\
    doit(_FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2), X##_f1, Y##_f1);	\
									\
    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
		    _FP_FRAC_WORD_4(R,1), 0, _b_f1, _b_f0,		\
		    _FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
		    _FP_FRAC_WORD_4(R,1));				\
    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
		    _FP_FRAC_WORD_4(R,1), 0, _c_f1, _c_f0,		\
		    _FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
		    _FP_FRAC_WORD_4(R,1));				\
  } while (0)

#define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
  do {									\
    _FP_FRAC_DECL_4(_z);						\
									\
    _FP_MUL_MEAT_DW_2_wide(wfracbits, _z, X, Y, doit);			\
									\
    /* Normalize since we know where the msb of the multiplicands	\
       were (bit B), we know that the msb of the of the product is	\
       at either 2B or 2B-1.  */					\
    _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
    R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
    R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
  } while (0)

/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
   Do only 3 multiplications instead of four. This one is for machines
   where multiplication is much more expensive than subtraction.  */

#define _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, R, X, Y, doit)		\
  do {									\
    _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);				\
    _FP_W_TYPE _d;							\
    int _c1, _c2;							\
									\
    _b_f0 = X##_f0 + X##_f1;						\
    _c1 = _b_f0 < X##_f0;						\
    _b_f1 = Y##_f0 + Y##_f1;						\
    _c2 = _b_f1 < Y##_f0;						\
    doit(_d, _FP_FRAC_WORD_4(R,0), X##_f0, Y##_f0);			\
    doit(_FP_FRAC_WORD_4(R,2), _FP_FRAC_WORD_4(R,1), _b_f0, _b_f1);	\
    doit(_c_f1, _c_f0, X##_f1, Y##_f1);					\
									\
    _b_f0 &= -_c2;							\
    _b_f1 &= -_c1;							\
    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
		    _FP_FRAC_WORD_4(R,1), (_c1 & _c2), 0, _d,		\
		    0, _FP_FRAC_WORD_4(R,2), _FP_FRAC_WORD_4(R,1));	\
    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
		     _b_f0);						\
    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
		     _b_f1);						\
    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
		    _FP_FRAC_WORD_4(R,1),				\
		    0, _d, _FP_FRAC_WORD_4(R,0));			\
    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2),		\
		    _FP_FRAC_WORD_4(R,1), 0, _c_f1, _c_f0);		\
    __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2),		\
		    _c_f1, _c_f0,					\
		    _FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2));	\
  } while (0)

#define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
  do {									\
    _FP_FRAC_DECL_4(_z);						\
									\
    _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, _z, X, Y, doit);		\
									\
    /* Normalize since we know where the msb of the multiplicands	\
       were (bit B), we know that the msb of the of the product is	\
       at either 2B or 2B-1.  */					\
    _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
    R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
    R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
  } while (0)

#define _FP_MUL_MEAT_DW_2_gmp(wfracbits, R, X, Y)			\
  do {									\
    _FP_W_TYPE _x[2], _y[2];						\
    _x[0] = X##_f0; _x[1] = X##_f1;					\
    _y[0] = Y##_f0; _y[1] = Y##_f1;					\
									\
    mpn_mul_n(R##_f, _x, _y, 2);					\
  } while (0)

#define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
  do {									\
    _FP_FRAC_DECL_4(_z);						\
									\
    _FP_MUL_MEAT_DW_2_gmp(wfracbits, _z, X, Y);				\
									\
    /* Normalize since we know where the msb of the multiplicands	\
       were (bit B), we know that the msb of the of the product is	\
       at either 2B or 2B-1.  */					\
    _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
    R##_f0 = _z_f[0];							\
    R##_f1 = _z_f[1];							\
  } while (0)

/* Do at most 120x120=240 bits multiplication using double floating
   point multiplication.  This is useful if floating point
   multiplication has much bigger throughput than integer multiply.
   It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
   between 106 and 120 only.
   Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
   SETFETZ is a macro which will disable all FPU exceptions and set rounding
   towards zero,  RESETFE should optionally reset it back.  */

#define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe)	\
  do {										\
    static const double _const[] = {						\
      /* 2^-24 */ 5.9604644775390625e-08,					\
      /* 2^-48 */ 3.5527136788005009e-15,					\
      /* 2^-72 */ 2.1175823681357508e-22,					\
      /* 2^-96 */ 1.2621774483536189e-29,					\
      /* 2^28 */ 2.68435456e+08,						\
      /* 2^4 */ 1.600000e+01,							\
      /* 2^-20 */ 9.5367431640625e-07,						\
      /* 2^-44 */ 5.6843418860808015e-14,					\
      /* 2^-68 */ 3.3881317890172014e-21,					\
      /* 2^-92 */ 2.0194839173657902e-28,					\
      /* 2^-116 */ 1.2037062152420224e-35};					\
    double _a240, _b240, _c240, _d240, _e240, _f240, 				\
	   _g240, _h240, _i240, _j240, _k240;					\
    union { double d; UDItype i; } _l240, _m240, _n240, _o240,			\
				   _p240, _q240, _r240, _s240;			\
    UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;			\
										\
    if (wfracbits < 106 || wfracbits > 120)					\
      abort();									\
										\
    setfetz;									\
										\
    _e240 = (double)(long)(X##_f0 & 0xffffff);					\
    _j240 = (double)(long)(Y##_f0 & 0xffffff);					\
    _d240 = (double)(long)((X##_f0 >> 24) & 0xffffff);				\
    _i240 = (double)(long)((Y##_f0 >> 24) & 0xffffff);				\
    _c240 = (double)(long)(((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48));	\
    _h240 = (double)(long)(((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48));	\
    _b240 = (double)(long)((X##_f1 >> 8) & 0xffffff);				\
    _g240 = (double)(long)((Y##_f1 >> 8) & 0xffffff);				\
    _a240 = (double)(long)(X##_f1 >> 32);					\
    _f240 = (double)(long)(Y##_f1 >> 32);					\
    _e240 *= _const[3];								\
    _j240 *= _const[3];								\
    _d240 *= _const[2];								\
    _i240 *= _const[2];								\
    _c240 *= _const[1];								\
    _h240 *= _const[1];								\
    _b240 *= _const[0];								\
    _g240 *= _const[0];								\
    _s240.d =							      _e240*_j240;\
    _r240.d =						_d240*_j240 + _e240*_i240;\
    _q240.d =				  _c240*_j240 + _d240*_i240 + _e240*_h240;\
    _p240.d =		    _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240;\
    _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240;\
    _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;		\
    _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;				\
    _l240.d = _a240*_g240 + _b240*_f240;					\
    _k240 =   _a240*_f240;							\
    _r240.d += _s240.d;								\
    _q240.d += _r240.d;								\
    _p240.d += _q240.d;								\
    _o240.d += _p240.d;								\
    _n240.d += _o240.d;								\
    _m240.d += _n240.d;								\
    _l240.d += _m240.d;								\
    _k240 += _l240.d;								\
    _s240.d -= ((_const[10]+_s240.d)-_const[10]);				\
    _r240.d -= ((_const[9]+_r240.d)-_const[9]);					\
    _q240.d -= ((_const[8]+_q240.d)-_const[8]);					\
    _p240.d -= ((_const[7]+_p240.d)-_const[7]);					\
    _o240.d += _const[7];							\
    _n240.d += _const[6];							\
    _m240.d += _const[5];							\
    _l240.d += _const[4];							\
    if (_s240.d != 0.0) _y240 = 1;						\
    if (_r240.d != 0.0) _y240 = 1;						\
    if (_q240.d != 0.0) _y240 = 1;						\
    if (_p240.d != 0.0) _y240 = 1;						\
    _t240 = (DItype)_k240;							\
    _u240 = _l240.i;								\
    _v240 = _m240.i;								\
    _w240 = _n240.i;								\
    _x240 = _o240.i;								\
    R##_f1 = (_t240 << (128 - (wfracbits - 1)))					\
	     | ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104));			\
    R##_f0 = ((_u240 & 0xffffff) << (168 - (wfracbits - 1)))			\
	     | ((_v240 & 0xffffff) << (144 - (wfracbits - 1)))			\
	     | ((_w240 & 0xffffff) << (120 - (wfracbits - 1)))			\
	     | ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96))			\
	     | _y240;								\
    resetfe;									\
  } while (0)

/*
 * Division algorithms:
 */

#define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)				\
  do {									\
    _FP_W_TYPE _n_f2, _n_f1, _n_f0, _r_f1, _r_f0, _m_f1, _m_f0;		\
    if (_FP_FRAC_GT_2(X, Y))						\
      {									\
	_n_f2 = X##_f1 >> 1;						\
	_n_f1 = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;		\
	_n_f0 = X##_f0 << (_FP_W_TYPE_SIZE - 1);			\
      }									\
    else								\
      {									\
	R##_e--;							\
	_n_f2 = X##_f1;							\
	_n_f1 = X##_f0;							\
	_n_f0 = 0;							\
      }									\
									\
    /* Normalize, i.e. make the most significant bit of the 		\
       denominator set. */						\
    _FP_FRAC_SLL_2(Y, _FP_WFRACXBITS_##fs);				\
									\
    udiv_qrnnd(R##_f1, _r_f1, _n_f2, _n_f1, Y##_f1);			\
    umul_ppmm(_m_f1, _m_f0, R##_f1, Y##_f0);				\
    _r_f0 = _n_f0;							\
    if (_FP_FRAC_GT_2(_m, _r))						\
      {									\
	R##_f1--;							\
	_FP_FRAC_ADD_2(_r, Y, _r);					\
	if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
	  {								\
	    R##_f1--;							\
	    _FP_FRAC_ADD_2(_r, Y, _r);					\
	  }								\
      }									\
    _FP_FRAC_DEC_2(_r, _m);						\
									\
    if (_r_f1 == Y##_f1)						\
      {									\
	/* This is a special case, not an optimization			\
	   (_r/Y##_f1 would not fit into UWtype).			\
	   As _r is guaranteed to be < Y,  R##_f0 can be either		\
	   (UWtype)-1 or (UWtype)-2.  But as we know what kind		\
	   of bits it is (sticky, guard, round),  we don't care.	\
	   We also don't care what the reminder is,  because the	\
	   guard bit will be set anyway.  -jj */			\
	R##_f0 = -1;							\
      }									\
    else								\
      {									\
	udiv_qrnnd(R##_f0, _r_f1, _r_f1, _r_f0, Y##_f1);		\
	umul_ppmm(_m_f1, _m_f0, R##_f0, Y##_f0);			\
	_r_f0 = 0;							\
	if (_FP_FRAC_GT_2(_m, _r))					\
	  {								\
	    R##_f0--;							\
	    _FP_FRAC_ADD_2(_r, Y, _r);					\
	    if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
	      {								\
		R##_f0--;						\
		_FP_FRAC_ADD_2(_r, Y, _r);				\
	      }								\
	  }								\
	if (!_FP_FRAC_EQ_2(_r, _m))					\
	  R##_f0 |= _FP_WORK_STICKY;					\
      }									\
  } while (0)


#define _FP_DIV_MEAT_2_gmp(fs, R, X, Y)					\
  do {									\
    _FP_W_TYPE _x[4], _y[2], _z[4];					\
    _y[0] = Y##_f0; _y[1] = Y##_f1;					\
    _x[0] = _x[3] = 0;							\
    if (_FP_FRAC_GT_2(X, Y))						\
      {									\
	R##_e++;							\
	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE) |	\
		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
			    (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE)));	\
	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE);	\
      }									\
    else								\
      {									\
	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE) |	\
		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
			    (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE)));	\
	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE);	\
      }									\
									\
    (void) mpn_divrem (_z, 0, _x, 4, _y, 2);				\
    R##_f1 = _z[1];							\
    R##_f0 = _z[0] | ((_x[0] | _x[1]) != 0);				\
  } while (0)


/*
 * Square root algorithms:
 * We have just one right now, maybe Newton approximation
 * should be added for those machines where division is fast.
 */

#define _FP_SQRT_MEAT_2(R, S, T, X, q)			\
  do {							\
    while (q)						\
      {							\
	T##_f1 = S##_f1 + q;				\
	if (T##_f1 <= X##_f1)				\
	  {						\
	    S##_f1 = T##_f1 + q;			\
	    X##_f1 -= T##_f1;				\
	    R##_f1 += q;				\
	  }						\
	_FP_FRAC_SLL_2(X, 1);				\
	q >>= 1;					\
      }							\
    q = (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE - 1);		\
    while (q != _FP_WORK_ROUND)				\
      {							\
	T##_f0 = S##_f0 + q;				\
	T##_f1 = S##_f1;				\
	if (T##_f1 < X##_f1 || 				\
	    (T##_f1 == X##_f1 && T##_f0 <= X##_f0))	\
	  {						\
	    S##_f0 = T##_f0 + q;			\
	    S##_f1 += (T##_f0 > S##_f0);		\
	    _FP_FRAC_DEC_2(X, T);			\
	    R##_f0 += q;				\
	  }						\
	_FP_FRAC_SLL_2(X, 1);				\
	q >>= 1;					\
      }							\
    if (X##_f0 | X##_f1)				\
      {							\
	if (S##_f1 < X##_f1 || 				\
	    (S##_f1 == X##_f1 && S##_f0 < X##_f0))	\
	  R##_f0 |= _FP_WORK_ROUND;			\
	R##_f0 |= _FP_WORK_STICKY;			\
      }							\
  } while (0)


/*
 * Assembly/disassembly for converting to/from integral types.
 * No shifting or overflow handled here.
 */

#define _FP_FRAC_ASSEMBLE_2(r, X, rsize)	\
(void)((rsize <= _FP_W_TYPE_SIZE)		\
       ? ({ r = X##_f0; })			\
       : ({					\
	    r = X##_f1;				\
	    r <<= _FP_W_TYPE_SIZE;		\
	    r += X##_f0;			\
	  }))

#define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)				\
  do {									\
    X##_f0 = r;								\
    X##_f1 = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE);	\
  } while (0)

/*
 * Convert FP values between word sizes
 */

#define _FP_FRAC_COPY_1_2(D, S)		(D##_f = S##_f0)

#define _FP_FRAC_COPY_2_1(D, S)		((D##_f0 = S##_f), (D##_f1 = 0))

#define _FP_FRAC_COPY_2_2(D,S)		_FP_FRAC_COPY_2(D,S)
Commit	Line	Data
d876f532 UD	1	/* Software floating-point emulation.
d876f532 UD	2	Basic two-word fraction declaration and manipulation.
568035b7	3	Copyright (C) 1997-2013 Free Software Foundation, Inc.
d876f532 UD	4	This file is part of the GNU C Library.
	5	Contributed by Richard Henderson (rth@cygnus.com),
	6	Jakub Jelinek (jj@ultra.linux.cz),
	7	David S. Miller (davem@redhat.com) and
	8	Peter Maydell (pmaydell@chiark.greenend.org.uk).
	9
	10	The GNU C Library is free software; you can redistribute it and/or
41bdb6e2 AJ	11	modify it under the terms of the GNU Lesser General Public
	12	License as published by the Free Software Foundation; either
	13	version 2.1 of the License, or (at your option) any later version.
d876f532	14
638a783c RM	15	In addition to the permissions in the GNU Lesser General Public
	16	License, the Free Software Foundation gives you unlimited
	17	permission to link the compiled version of this file into
	18	combinations with other programs, and to distribute those
	19	combinations without any restriction coming from the use of this
	20	file. (The Lesser General Public License restrictions do apply in
	21	other respects; for example, they cover modification of the file,
	22	and distribution when not linked into a combine executable.)
	23
d876f532 UD	24	The GNU C Library is distributed in the hope that it will be useful,
	25	but WITHOUT ANY WARRANTY; without even the implied warranty of
	26	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
41bdb6e2	27	Lesser General Public License for more details.
d876f532	28
41bdb6e2	29	You should have received a copy of the GNU Lesser General Public
59ba27a6 PE	30	License along with the GNU C Library; if not, see
59ba27a6 PE	31	<http://www.gnu.org/licenses/>. */
d876f532 UD	32
	33	#define _FP_FRAC_DECL_2(X) _FP_W_TYPE X##_f0, X##_f1
	34	#define _FP_FRAC_COPY_2(D,S) (D##_f0 = S##_f0, D##_f1 = S##_f1)
	35	#define _FP_FRAC_SET_2(X,I) __FP_FRAC_SET_2(X, I)
	36	#define _FP_FRAC_HIGH_2(X) (X##_f1)
	37	#define _FP_FRAC_LOW_2(X) (X##_f0)
	38	#define _FP_FRAC_WORD_2(X,w) (X##_f##w)
	39
fe0b1e85 RM	40	#define _FP_FRAC_SLL_2(X,N) \
	41	(void)(((N) < _FP_W_TYPE_SIZE) \
	42	? ({ \
	43	if (__builtin_constant_p(N) && (N) == 1) \
	44	{ \
	45	X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0); \
	46	X##_f0 += X##_f0; \
	47	} \
	48	else \
	49	{ \
	50	X##_f1 = X##_f1 << (N) \| X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
	51	X##_f0 <<= (N); \
	52	} \
	53	0; \
	54	}) \
	55	: ({ \
	56	X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE); \
	57	X##_f0 = 0; \
	58	}))
	59
d876f532 UD	60
d876f532 UD	61	#define _FP_FRAC_SRL_2(X,N) \
fe0b1e85 RM	62	(void)(((N) < _FP_W_TYPE_SIZE) \
	63	? ({ \
	64	X##_f0 = X##_f0 >> (N) \| X##_f1 << (_FP_W_TYPE_SIZE - (N)); \
	65	X##_f1 >>= (N); \
	66	}) \
	67	: ({ \
	68	X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE); \
	69	X##_f1 = 0; \
	70	}))
d876f532 UD	71
d876f532 UD	72	/* Right shift with sticky-lsb. */
fe0b1e85 RM	73	#define _FP_FRAC_SRST_2(X,S, N,sz) \
	74	(void)(((N) < _FP_W_TYPE_SIZE) \
	75	? ({ \
	76	S = (__builtin_constant_p(N) && (N) == 1 \
	77	? X##_f0 & 1 \
	78	: (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0); \
	79	X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) \| X##_f0 >> (N)); \
	80	X##_f1 >>= (N); \
	81	}) \
	82	: ({ \
	83	S = ((((N) == _FP_W_TYPE_SIZE \
	84	? 0 \
	85	: (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \
	86	\| X##_f0) != 0); \
	87	X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE)); \
	88	X##_f1 = 0; \
	89	}))
	90
	91	#define _FP_FRAC_SRS_2(X,N,sz) \
	92	(void)(((N) < _FP_W_TYPE_SIZE) \
	93	? ({ \
	94	X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) \| X##_f0 >> (N) \| \
	95	(__builtin_constant_p(N) && (N) == 1 \
	96	? X##_f0 & 1 \
	97	: (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \
	98	X##_f1 >>= (N); \
	99	}) \
	100	: ({ \
	101	X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) \| \
	102	((((N) == _FP_W_TYPE_SIZE \
	103	? 0 \
	104	: (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \
	105	\| X##_f0) != 0)); \
	106	X##_f1 = 0; \
	107	}))
d876f532 UD	108
	109	#define _FP_FRAC_ADDI_2(X,I) \
	110	__FP_FRAC_ADDI_2(X##_f1, X##_f0, I)
	111
	112	#define _FP_FRAC_ADD_2(R,X,Y) \
	113	__FP_FRAC_ADD_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
	114
	115	#define _FP_FRAC_SUB_2(R,X,Y) \
	116	__FP_FRAC_SUB_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
	117
	118	#define _FP_FRAC_DEC_2(X,Y) \
	119	__FP_FRAC_DEC_2(X##_f1, X##_f0, Y##_f1, Y##_f0)
	120
	121	#define _FP_FRAC_CLZ_2(R,X) \
	122	do { \
	123	if (X##_f1) \
	124	__FP_CLZ(R,X##_f1); \
	125	else \
	126	{ \
	127	__FP_CLZ(R,X##_f0); \
	128	R += _FP_W_TYPE_SIZE; \
	129	} \
	130	} while(0)
	131
	132	/* Predicates */
	133	#define _FP_FRAC_NEGP_2(X) ((_FP_WS_TYPE)X##_f1 < 0)
	134	#define _FP_FRAC_ZEROP_2(X) ((X##_f1 \| X##_f0) == 0)
	135	#define _FP_FRAC_OVERP_2(fs,X) (_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs)
cf299341	136	#define _FP_FRAC_CLEAR_OVERP_2(fs,X) (_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs)
77f01ab5 JM	137	#define _FP_FRAC_HIGHBIT_DW_2(fs,X) \
77f01ab5 JM	138	(_FP_FRAC_HIGH_DW_##fs(X) & _FP_HIGHBIT_DW_##fs)
d876f532 UD	139	#define _FP_FRAC_EQ_2(X, Y) (X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
d876f532 UD	140	#define _FP_FRAC_GT_2(X, Y) \
fe0b1e85	141	(X##_f1 > Y##_f1 \|\| (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
d876f532	142	#define _FP_FRAC_GE_2(X, Y) \
fe0b1e85	143	(X##_f1 > Y##_f1 \|\| (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
d876f532 UD	144
	145	#define _FP_ZEROFRAC_2 0, 0
	146	#define _FP_MINFRAC_2 0, 1
	147	#define _FP_MAXFRAC_2 (~(_FP_WS_TYPE)0), (~(_FP_WS_TYPE)0)
	148
	149	/*
9c84384c	150	* Internals
d876f532 UD	151	*/
	152
	153	#define __FP_FRAC_SET_2(X,I1,I0) (X##_f0 = I0, X##_f1 = I1)
	154
	155	#define __FP_CLZ_2(R, xh, xl) \
	156	do { \
	157	if (xh) \
	158	__FP_CLZ(R,xh); \
	159	else \
	160	{ \
	161	__FP_CLZ(R,xl); \
	162	R += _FP_W_TYPE_SIZE; \
	163	} \
	164	} while(0)
	165
	166	#if 0
	167
	168	#ifndef __FP_FRAC_ADDI_2
	169	#define __FP_FRAC_ADDI_2(xh, xl, i) \
	170	(xh += ((xl += i) < i))
	171	#endif
	172	#ifndef __FP_FRAC_ADD_2
	173	#define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl) \
	174	(rh = xh + yh + ((rl = xl + yl) < xl))
	175	#endif
	176	#ifndef __FP_FRAC_SUB_2
	177	#define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl) \
	178	(rh = xh - yh - ((rl = xl - yl) > xl))
	179	#endif
	180	#ifndef __FP_FRAC_DEC_2
	181	#define __FP_FRAC_DEC_2(xh, xl, yh, yl) \
	182	do { \
	183	UWtype _t = xl; \
	184	xh -= yh + ((xl -= yl) > _t); \
	185	} while (0)
	186	#endif
	187
	188	#else
	189
	190	#undef __FP_FRAC_ADDI_2
	191	#define __FP_FRAC_ADDI_2(xh, xl, i) add_ssaaaa(xh, xl, xh, xl, 0, i)
	192	#undef __FP_FRAC_ADD_2
	193	#define __FP_FRAC_ADD_2 add_ssaaaa
	194	#undef __FP_FRAC_SUB_2
	195	#define __FP_FRAC_SUB_2 sub_ddmmss
	196	#undef __FP_FRAC_DEC_2
	197	#define __FP_FRAC_DEC_2(xh, xl, yh, yl) sub_ddmmss(xh, xl, xh, xl, yh, yl)
	198
	199	#endif
	200
	201	/*
	202	* Unpack the raw bits of a native fp value. Do not classify or
	203	* normalize the data.
	204	*/
	205
	206	#define _FP_UNPACK_RAW_2(fs, X, val) \
	207	do { \
	208	union _FP_UNION_##fs _flo; _flo.flt = (val); \
	209	\
	210	X##_f0 = _flo.bits.frac0; \
	211	X##_f1 = _flo.bits.frac1; \
	212	X##_e = _flo.bits.exp; \
	213	X##_s = _flo.bits.sign; \
	214	} while (0)
215
216	#define _FP_UNPACK_RAW_2_P(fs, X, val) \
217	do { \
218	union _FP_UNION_##fs *_flo = \
219	(union _FP_UNION_##fs *)(val); \
220	\
221	X##_f0 = _flo->bits.frac0; \
222	X##_f1 = _flo->bits.frac1; \
223	X##_e = _flo->bits.exp; \
224	X##_s = _flo->bits.sign; \
225	} while (0)
226
227
228	/*
229	* Repack the raw bits of a native fp value.
230	*/
231
232	#define _FP_PACK_RAW_2(fs, val, X) \
233	do { \
234	union _FP_UNION_##fs _flo; \
235	\
236	_flo.bits.frac0 = X##_f0; \
237	_flo.bits.frac1 = X##_f1; \
238	_flo.bits.exp = X##_e; \
239	_flo.bits.sign = X##_s; \
240	\
241	(val) = _flo.flt; \
242	} while (0)
243
244	#define _FP_PACK_RAW_2_P(fs, val, X) \
245	do { \
246	union _FP_UNION_##fs *_flo = \
247	(union _FP_UNION_##fs *)(val); \
248	\
249	_flo->bits.frac0 = X##_f0; \
250	_flo->bits.frac1 = X##_f1; \
251	_flo->bits.exp = X##_e; \
252	_flo->bits.sign = X##_s; \
253	} while (0)
254
255
256	/*
257	* Multiplication algorithms:
258	*/
259
260	/* Given a 1W * 1W => 2W primitive, do the extended multiplication. */
261
77f01ab5	262	#define _FP_MUL_MEAT_DW_2_wide(wfracbits, R, X, Y, doit) \
d876f532	263	do { \
77f01ab5	264	_FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \
d876f532	265	\
77f01ab5	266	doit(_FP_FRAC_WORD_4(R,1), _FP_FRAC_WORD_4(R,0), X##_f0, Y##_f0); \
d876f532 UD	267	doit(_b_f1, _b_f0, X##_f0, Y##_f1); \
d876f532 UD	268	doit(_c_f1, _c_f0, X##_f1, Y##_f0); \
77f01ab5	269	doit(_FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2), X##_f1, Y##_f1); \
d876f532	270	\
77f01ab5 JM	271	__FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \
	272	_FP_FRAC_WORD_4(R,1), 0, _b_f1, _b_f0, \
	273	_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \
	274	_FP_FRAC_WORD_4(R,1)); \
	275	__FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \
	276	_FP_FRAC_WORD_4(R,1), 0, _c_f1, _c_f0, \
	277	_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \
	278	_FP_FRAC_WORD_4(R,1)); \
	279	} while (0)
	280
	281	#define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit) \
	282	do { \
	283	_FP_FRAC_DECL_4(_z); \
	284	\
	285	_FP_MUL_MEAT_DW_2_wide(wfracbits, _z, X, Y, doit); \
d876f532 UD	286	\
	287	/* Normalize since we know where the msb of the multiplicands \
	288	were (bit B), we know that the msb of the of the product is \
	289	at either 2B or 2B-1. */ \
	290	_FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \
	291	R##_f0 = _FP_FRAC_WORD_4(_z,0); \
	292	R##_f1 = _FP_FRAC_WORD_4(_z,1); \
	293	} while (0)
	294
	295	/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
	296	Do only 3 multiplications instead of four. This one is for machines
	297	where multiplication is much more expensive than subtraction. */
	298
77f01ab5	299	#define _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, R, X, Y, doit) \
d876f532	300	do { \
77f01ab5	301	_FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \
d876f532 UD	302	_FP_W_TYPE _d; \
	303	int _c1, _c2; \
	304	\
	305	_b_f0 = X##_f0 + X##_f1; \
	306	_c1 = _b_f0 < X##_f0; \
	307	_b_f1 = Y##_f0 + Y##_f1; \
	308	_c2 = _b_f1 < Y##_f0; \
77f01ab5 JM	309	doit(_d, _FP_FRAC_WORD_4(R,0), X##_f0, Y##_f0); \
77f01ab5 JM	310	doit(_FP_FRAC_WORD_4(R,2), _FP_FRAC_WORD_4(R,1), _b_f0, _b_f1); \
d876f532 UD	311	doit(_c_f1, _c_f0, X##_f1, Y##_f1); \
	312	\
	313	_b_f0 &= -_c2; \
	314	_b_f1 &= -_c1; \
77f01ab5 JM	315	__FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \
	316	_FP_FRAC_WORD_4(R,1), (_c1 & _c2), 0, _d, \
	317	0, _FP_FRAC_WORD_4(R,2), _FP_FRAC_WORD_4(R,1)); \
	318	__FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \
d876f532	319	_b_f0); \
77f01ab5	320	__FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \
d876f532	321	_b_f1); \
77f01ab5 JM	322	__FP_FRAC_DEC_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \
	323	_FP_FRAC_WORD_4(R,1), \
	324	0, _d, _FP_FRAC_WORD_4(R,0)); \
	325	__FP_FRAC_DEC_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \
	326	_FP_FRAC_WORD_4(R,1), 0, _c_f1, _c_f0); \
	327	__FP_FRAC_ADD_2(_FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2), \
d876f532	328	_c_f1, _c_f0, \
77f01ab5 JM	329	_FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2)); \
	330	} while (0)
	331
	332	#define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit) \
	333	do { \
	334	_FP_FRAC_DECL_4(_z); \
	335	\
	336	_FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, _z, X, Y, doit); \
d876f532 UD	337	\
	338	/* Normalize since we know where the msb of the multiplicands \
	339	were (bit B), we know that the msb of the of the product is \
	340	at either 2B or 2B-1. */ \
	341	_FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \
	342	R##_f0 = _FP_FRAC_WORD_4(_z,0); \
	343	R##_f1 = _FP_FRAC_WORD_4(_z,1); \
	344	} while (0)
	345
77f01ab5	346	#define _FP_MUL_MEAT_DW_2_gmp(wfracbits, R, X, Y) \
d876f532	347	do { \
d876f532 UD	348	_FP_W_TYPE _x[2], _y[2]; \
	349	_x[0] = X##_f0; _x[1] = X##_f1; \
	350	_y[0] = Y##_f0; _y[1] = Y##_f1; \
	351	\
77f01ab5 JM	352	mpn_mul_n(R##_f, _x, _y, 2); \
	353	} while (0)
	354
	355	#define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y) \
	356	do { \
	357	_FP_FRAC_DECL_4(_z); \
	358	\
	359	_FP_MUL_MEAT_DW_2_gmp(wfracbits, _z, X, Y); \
d876f532 UD	360	\
	361	/* Normalize since we know where the msb of the multiplicands \
	362	were (bit B), we know that the msb of the of the product is \
	363	at either 2B or 2B-1. */ \
	364	_FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \
	365	R##_f0 = _z_f[0]; \
	366	R##_f1 = _z_f[1]; \
	367	} while (0)
	368
	369	/* Do at most 120x120=240 bits multiplication using double floating
	370	point multiplication. This is useful if floating point
	371	multiplication has much bigger throughput than integer multiply.
	372	It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
9c84384c	373	between 106 and 120 only.
d876f532 UD	374	Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
	375	SETFETZ is a macro which will disable all FPU exceptions and set rounding
	376	towards zero, RESETFE should optionally reset it back. */
	377
	378	#define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe) \
	379	do { \
	380	static const double _const[] = { \
	381	/* 2^-24 */ 5.9604644775390625e-08, \
	382	/* 2^-48 */ 3.5527136788005009e-15, \
	383	/* 2^-72 */ 2.1175823681357508e-22, \
	384	/* 2^-96 */ 1.2621774483536189e-29, \
	385	/* 2^28 */ 2.68435456e+08, \
	386	/* 2^4 */ 1.600000e+01, \
	387	/* 2^-20 */ 9.5367431640625e-07, \
	388	/* 2^-44 */ 5.6843418860808015e-14, \
	389	/* 2^-68 */ 3.3881317890172014e-21, \
	390	/* 2^-92 */ 2.0194839173657902e-28, \
	391	/* 2^-116 */ 1.2037062152420224e-35}; \
	392	double _a240, _b240, _c240, _d240, _e240, _f240, \
	393	_g240, _h240, _i240, _j240, _k240; \
	394	union { double d; UDItype i; } _l240, _m240, _n240, _o240, \
	395	_p240, _q240, _r240, _s240; \
	396	UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0; \
	397	\
	398	if (wfracbits < 106 \|\| wfracbits > 120) \
	399	abort(); \
	400	\
	401	setfetz; \
	402	\
	403	_e240 = (double)(long)(X##_f0 & 0xffffff); \
	404	_j240 = (double)(long)(Y##_f0 & 0xffffff); \
	405	_d240 = (double)(long)((X##_f0 >> 24) & 0xffffff); \
	406	_i240 = (double)(long)((Y##_f0 >> 24) & 0xffffff); \
	407	_c240 = (double)(long)(((X##_f1 << 16) & 0xffffff) \| (X##_f0 >> 48)); \
	408	_h240 = (double)(long)(((Y##_f1 << 16) & 0xffffff) \| (Y##_f0 >> 48)); \
	409	_b240 = (double)(long)((X##_f1 >> 8) & 0xffffff); \
	410	_g240 = (double)(long)((Y##_f1 >> 8) & 0xffffff); \
	411	_a240 = (double)(long)(X##_f1 >> 32); \
	412	_f240 = (double)(long)(Y##_f1 >> 32); \
	413	_e240 *= _const[3]; \
	414	_j240 *= _const[3]; \
	415	_d240 *= _const[2]; \
	416	_i240 *= _const[2]; \
	417	_c240 *= _const[1]; \
	418	_h240 *= _const[1]; \
	419	_b240 *= _const[0]; \
	420	_g240 *= _const[0]; \
	421	_s240.d = _e240*_j240;\
	422	_r240.d = _d240_j240 + _e240_i240;\
	423	_q240.d = _c240_j240 + _d240_i240 + _e240*_h240;\
	424	_p240.d = _b240_j240 + _c240_i240 + _d240_h240 + _e240_g240;\
	425	_o240.d = _a240_j240 + _b240_i240 + _c240_h240 + _d240_g240 + _e240*_f240;\
	426	_n240.d = _a240_i240 + _b240_h240 + _c240_g240 + _d240_f240; \
	427	_m240.d = _a240_h240 + _b240_g240 + _c240*_f240; \
	428	_l240.d = _a240_g240 + _b240_f240; \
	429	_k240 = _a240*_f240; \
	430	_r240.d += _s240.d; \
	431	_q240.d += _r240.d; \
	432	_p240.d += _q240.d; \
	433	_o240.d += _p240.d; \
	434	_n240.d += _o240.d; \
	435	_m240.d += _n240.d; \
	436	_l240.d += _m240.d; \
	437	_k240 += _l240.d; \
438	_s240.d -= ((_const[10]+_s240.d)-_const[10]); \
439	_r240.d -= ((_const[9]+_r240.d)-_const[9]); \
440	_q240.d -= ((_const[8]+_q240.d)-_const[8]); \
441	_p240.d -= ((_const[7]+_p240.d)-_const[7]); \
442	_o240.d += _const[7]; \
443	_n240.d += _const[6]; \
444	_m240.d += _const[5]; \
445	_l240.d += _const[4]; \
446	if (_s240.d != 0.0) _y240 = 1; \
447	if (_r240.d != 0.0) _y240 = 1; \
448	if (_q240.d != 0.0) _y240 = 1; \
449	if (_p240.d != 0.0) _y240 = 1; \
450	_t240 = (DItype)_k240; \
451	_u240 = _l240.i; \
452	_v240 = _m240.i; \
453	_w240 = _n240.i; \
454	_x240 = _o240.i; \
455	R##_f1 = (_t240 << (128 - (wfracbits - 1))) \
456	\| ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104)); \
457	R##_f0 = ((_u240 & 0xffffff) << (168 - (wfracbits - 1))) \
350635a5 OB	458	\| ((_v240 & 0xffffff) << (144 - (wfracbits - 1))) \
	459	\| ((_w240 & 0xffffff) << (120 - (wfracbits - 1))) \
	460	\| ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96)) \
	461	\| _y240; \
d876f532 UD	462	resetfe; \
	463	} while (0)
	464
	465	/*
	466	* Division algorithms:
	467	*/
	468
	469	#define _FP_DIV_MEAT_2_udiv(fs, R, X, Y) \
	470	do { \
	471	_FP_W_TYPE _n_f2, _n_f1, _n_f0, _r_f1, _r_f0, _m_f1, _m_f0; \
	472	if (_FP_FRAC_GT_2(X, Y)) \
	473	{ \
	474	_n_f2 = X##_f1 >> 1; \
	475	_n_f1 = X##_f1 << (_FP_W_TYPE_SIZE - 1) \| X##_f0 >> 1; \
	476	_n_f0 = X##_f0 << (_FP_W_TYPE_SIZE - 1); \
	477	} \
	478	else \
	479	{ \
	480	R##_e--; \
	481	_n_f2 = X##_f1; \
	482	_n_f1 = X##_f0; \
	483	_n_f0 = 0; \
	484	} \
	485	\
	486	/* Normalize, i.e. make the most significant bit of the \
	487	denominator set. */ \
	488	_FP_FRAC_SLL_2(Y, _FP_WFRACXBITS_##fs); \
	489	\
	490	udiv_qrnnd(R##_f1, _r_f1, _n_f2, _n_f1, Y##_f1); \
	491	umul_ppmm(_m_f1, _m_f0, R##_f1, Y##_f0); \
	492	_r_f0 = _n_f0; \
	493	if (_FP_FRAC_GT_2(_m, _r)) \
	494	{ \
	495	R##_f1--; \
	496	_FP_FRAC_ADD_2(_r, Y, _r); \
	497	if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r)) \
	498	{ \
	499	R##_f1--; \
	500	_FP_FRAC_ADD_2(_r, Y, _r); \
	501	} \
	502	} \
	503	_FP_FRAC_DEC_2(_r, _m); \
	504	\
	505	if (_r_f1 == Y##_f1) \
	506	{ \
	507	/* This is a special case, not an optimization \
	508	(_r/Y##_f1 would not fit into UWtype). \
	509	As _r is guaranteed to be < Y, R##_f0 can be either \
	510	(UWtype)-1 or (UWtype)-2. But as we know what kind \
	511	of bits it is (sticky, guard, round), we don't care. \
	512	We also don't care what the reminder is, because the \
	513	guard bit will be set anyway. -jj */ \
	514	R##_f0 = -1; \
	515	} \
	516	else \
	517	{ \
	518	udiv_qrnnd(R##_f0, _r_f1, _r_f1, _r_f0, Y##_f1); \
	519	umul_ppmm(_m_f1, _m_f0, R##_f0, Y##_f0); \
	520	_r_f0 = 0; \
	521	if (_FP_FRAC_GT_2(_m, _r)) \
	522	{ \
	523	R##_f0--; \
	524	_FP_FRAC_ADD_2(_r, Y, _r); \
	525	if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r)) \
526	{ \
527	R##_f0--; \
528	_FP_FRAC_ADD_2(_r, Y, _r); \
529	} \
530	} \
531	if (!_FP_FRAC_EQ_2(_r, _m)) \
532	R##_f0 \|= _FP_WORK_STICKY; \
533	} \
534	} while (0)
535
536
537	#define _FP_DIV_MEAT_2_gmp(fs, R, X, Y) \
538	do { \
539	_FP_W_TYPE _x[4], _y[2], _z[4]; \
540	_y[0] = Y##_f0; _y[1] = Y##_f1; \
541	_x[0] = _x[3] = 0; \
542	if (_FP_FRAC_GT_2(X, Y)) \
543	{ \
544	R##_e++; \
545	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE) \| \
546	X##_f1 >> (_FP_W_TYPE_SIZE - \
547	(_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE))); \
548	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE); \
549	} \
550	else \
551	{ \
552	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE) \| \
553	X##_f1 >> (_FP_W_TYPE_SIZE - \
554	(_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE))); \
555	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE); \
556	} \
557	\
558	(void) mpn_divrem (_z, 0, _x, 4, _y, 2); \
559	R##_f1 = _z[1]; \
560	R##_f0 = _z[0] \| ((_x[0] \| _x[1]) != 0); \
561	} while (0)
562
563
564	/*
565	* Square root algorithms:
566	* We have just one right now, maybe Newton approximation
567	* should be added for those machines where division is fast.
568	*/
9c84384c	569
d876f532 UD	570	#define _FP_SQRT_MEAT_2(R, S, T, X, q) \
	571	do { \
	572	while (q) \
	573	{ \
	574	T##_f1 = S##_f1 + q; \
	575	if (T##_f1 <= X##_f1) \
	576	{ \
	577	S##_f1 = T##_f1 + q; \
	578	X##_f1 -= T##_f1; \
	579	R##_f1 += q; \
	580	} \
	581	_FP_FRAC_SLL_2(X, 1); \
	582	q >>= 1; \
	583	} \
	584	q = (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE - 1); \
	585	while (q != _FP_WORK_ROUND) \
	586	{ \
	587	T##_f0 = S##_f0 + q; \
	588	T##_f1 = S##_f1; \
	589	if (T##_f1 < X##_f1 \|\| \
	590	(T##_f1 == X##_f1 && T##_f0 <= X##_f0)) \
	591	{ \
	592	S##_f0 = T##_f0 + q; \
	593	S##_f1 += (T##_f0 > S##_f0); \
	594	_FP_FRAC_DEC_2(X, T); \
	595	R##_f0 += q; \
	596	} \
	597	_FP_FRAC_SLL_2(X, 1); \
	598	q >>= 1; \
	599	} \
	600	if (X##_f0 \| X##_f1) \
	601	{ \
	602	if (S##_f1 < X##_f1 \|\| \
	603	(S##_f1 == X##_f1 && S##_f0 < X##_f0)) \
	604	R##_f0 \|= _FP_WORK_ROUND; \
	605	R##_f0 \|= _FP_WORK_STICKY; \
	606	} \
	607	} while (0)
	608
	609
	610	/*
9c84384c	611	* Assembly/disassembly for converting to/from integral types.
d876f532 UD	612	* No shifting or overflow handled here.
	613	*/
	614
	615	#define _FP_FRAC_ASSEMBLE_2(r, X, rsize) \
fe0b1e85 RM	616	(void)((rsize <= _FP_W_TYPE_SIZE) \
	617	? ({ r = X##_f0; }) \
	618	: ({ \
	619	r = X##_f1; \
	620	r <<= _FP_W_TYPE_SIZE; \
	621	r += X##_f0; \
	622	}))
d876f532 UD	623
	624	#define _FP_FRAC_DISASSEMBLE_2(X, r, rsize) \
	625	do { \
	626	X##_f0 = r; \
	627	X##_f1 = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE); \
	628	} while (0)
	629
	630	/*
	631	* Convert FP values between word sizes
	632	*/
	633
fe0b1e85	634	#define _FP_FRAC_COPY_1_2(D, S) (D##_f = S##_f0)
d876f532	635
fe0b1e85	636	#define _FP_FRAC_COPY_2_1(D, S) ((D##_f0 = S##_f), (D##_f1 = 0))
37002cbc JJ	637
37002cbc JJ	638	#define _FP_FRAC_COPY_2_2(D,S) _FP_FRAC_COPY_2(D,S)