2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
31 #include <asm/div64.h>
35 //#include "softfloat.h"
38 -------------------------------------------------------------------------------
39 Primitive arithmetic functions, including multi-word arithmetic, and
40 division and square root approximations. (Can be specialized to target if
42 -------------------------------------------------------------------------------
44 #include "softfloat-macros"
47 -------------------------------------------------------------------------------
48 Functions and definitions to determine: (1) whether tininess for underflow
49 is detected before or after rounding by default, (2) what (if anything)
50 happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 are propagated from function inputs to output. These details are target-
54 -------------------------------------------------------------------------------
56 #include "softfloat-specialize"
59 -------------------------------------------------------------------------------
60 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
61 and 7, and returns the properly rounded 32-bit integer corresponding to the
62 input. If `zSign' is nonzero, the input is negated before being converted
63 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
64 input is simply rounded to an integer, with the inexact exception raised if
65 the input cannot be represented exactly as an integer. If the fixed-point
66 input is too large, however, the invalid exception is raised and the largest
67 positive or negative integer is returned.
68 -------------------------------------------------------------------------------
70 static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
73 flag roundNearestEven;
74 int8 roundIncrement, roundBits;
77 roundingMode = roundData->mode;
78 roundNearestEven = ( roundingMode == float_round_nearest_even );
79 roundIncrement = 0x40;
80 if ( ! roundNearestEven ) {
81 if ( roundingMode == float_round_to_zero ) {
85 roundIncrement = 0x7F;
87 if ( roundingMode == float_round_up ) roundIncrement = 0;
90 if ( roundingMode == float_round_down ) roundIncrement = 0;
94 roundBits = absZ & 0x7F;
95 absZ = ( absZ + roundIncrement )>>7;
96 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
99 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
100 roundData->exception |= float_flag_invalid;
101 return zSign ? 0x80000000 : 0x7FFFFFFF;
103 if ( roundBits ) roundData->exception |= float_flag_inexact;
109 -------------------------------------------------------------------------------
110 Returns the fraction bits of the single-precision floating-point value `a'.
111 -------------------------------------------------------------------------------
113 INLINE bits32 extractFloat32Frac( float32 a )
116 return a & 0x007FFFFF;
121 -------------------------------------------------------------------------------
122 Returns the exponent bits of the single-precision floating-point value `a'.
123 -------------------------------------------------------------------------------
125 INLINE int16 extractFloat32Exp( float32 a )
128 return ( a>>23 ) & 0xFF;
133 -------------------------------------------------------------------------------
134 Returns the sign bit of the single-precision floating-point value `a'.
135 -------------------------------------------------------------------------------
137 #if 0 /* in softfloat.h */
138 INLINE flag extractFloat32Sign( float32 a )
147 -------------------------------------------------------------------------------
148 Normalizes the subnormal single-precision floating-point value represented
149 by the denormalized significand `aSig'. The normalized exponent and
150 significand are stored at the locations pointed to by `zExpPtr' and
151 `zSigPtr', respectively.
152 -------------------------------------------------------------------------------
155 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
159 shiftCount = countLeadingZeros32( aSig ) - 8;
160 *zSigPtr = aSig<<shiftCount;
161 *zExpPtr = 1 - shiftCount;
166 -------------------------------------------------------------------------------
167 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
168 single-precision floating-point value, returning the result. After being
169 shifted into the proper positions, the three fields are simply added
170 together to form the result. This means that any integer portion of `zSig'
171 will be added into the exponent. Since a properly normalized significand
172 will have an integer portion equal to 1, the `zExp' input should be 1 less
173 than the desired result exponent whenever `zSig' is a complete, normalized
175 -------------------------------------------------------------------------------
177 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
181 __asm__("@ packFloat32 \n\
182 mov %0, %1, asl #31 \n\
183 orr %0, %2, asl #23 \n\
186 : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
190 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
195 -------------------------------------------------------------------------------
196 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
197 and significand `zSig', and returns the proper single-precision floating-
198 point value corresponding to the abstract input. Ordinarily, the abstract
199 value is simply rounded and packed into the single-precision format, with
200 the inexact exception raised if the abstract input cannot be represented
201 exactly. If the abstract value is too large, however, the overflow and
202 inexact exceptions are raised and an infinity or maximal finite value is
203 returned. If the abstract value is too small, the input value is rounded to
204 a subnormal number, and the underflow and inexact exceptions are raised if
205 the abstract input cannot be represented exactly as a subnormal single-
206 precision floating-point number.
207 The input significand `zSig' has its binary point between bits 30
208 and 29, which is 7 bits to the left of the usual location. This shifted
209 significand must be normalized or smaller. If `zSig' is not normalized,
210 `zExp' must be 0; in that case, the result returned is a subnormal number,
211 and it must not require rounding. In the usual case that `zSig' is
212 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
213 The handling of underflow and overflow follows the IEC/IEEE Standard for
214 Binary Floating-point Arithmetic.
215 -------------------------------------------------------------------------------
217 static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
220 flag roundNearestEven;
221 int8 roundIncrement, roundBits;
224 roundingMode = roundData->mode;
225 roundNearestEven = ( roundingMode == float_round_nearest_even );
226 roundIncrement = 0x40;
227 if ( ! roundNearestEven ) {
228 if ( roundingMode == float_round_to_zero ) {
232 roundIncrement = 0x7F;
234 if ( roundingMode == float_round_up ) roundIncrement = 0;
237 if ( roundingMode == float_round_down ) roundIncrement = 0;
241 roundBits = zSig & 0x7F;
242 if ( 0xFD <= (bits16) zExp ) {
244 || ( ( zExp == 0xFD )
245 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
247 roundData->exception |= float_flag_overflow | float_flag_inexact;
248 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
252 ( float_detect_tininess == float_tininess_before_rounding )
254 || ( zSig + roundIncrement < 0x80000000 );
255 shift32RightJamming( zSig, - zExp, &zSig );
257 roundBits = zSig & 0x7F;
258 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
261 if ( roundBits ) roundData->exception |= float_flag_inexact;
262 zSig = ( zSig + roundIncrement )>>7;
263 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
264 if ( zSig == 0 ) zExp = 0;
265 return packFloat32( zSign, zExp, zSig );
270 -------------------------------------------------------------------------------
271 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
272 and significand `zSig', and returns the proper single-precision floating-
273 point value corresponding to the abstract input. This routine is just like
274 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
275 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
277 -------------------------------------------------------------------------------
280 normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
284 shiftCount = countLeadingZeros32( zSig ) - 1;
285 return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
290 -------------------------------------------------------------------------------
291 Returns the fraction bits of the double-precision floating-point value `a'.
292 -------------------------------------------------------------------------------
294 INLINE bits64 extractFloat64Frac( float64 a )
297 return a & LIT64( 0x000FFFFFFFFFFFFF );
302 -------------------------------------------------------------------------------
303 Returns the exponent bits of the double-precision floating-point value `a'.
304 -------------------------------------------------------------------------------
306 INLINE int16 extractFloat64Exp( float64 a )
309 return ( a>>52 ) & 0x7FF;
314 -------------------------------------------------------------------------------
315 Returns the sign bit of the double-precision floating-point value `a'.
316 -------------------------------------------------------------------------------
318 #if 0 /* in softfloat.h */
319 INLINE flag extractFloat64Sign( float64 a )
328 -------------------------------------------------------------------------------
329 Normalizes the subnormal double-precision floating-point value represented
330 by the denormalized significand `aSig'. The normalized exponent and
331 significand are stored at the locations pointed to by `zExpPtr' and
332 `zSigPtr', respectively.
333 -------------------------------------------------------------------------------
336 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
340 shiftCount = countLeadingZeros64( aSig ) - 11;
341 *zSigPtr = aSig<<shiftCount;
342 *zExpPtr = 1 - shiftCount;
347 -------------------------------------------------------------------------------
348 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
349 double-precision floating-point value, returning the result. After being
350 shifted into the proper positions, the three fields are simply added
351 together to form the result. This means that any integer portion of `zSig'
352 will be added into the exponent. Since a properly normalized significand
353 will have an integer portion equal to 1, the `zExp' input should be 1 less
354 than the desired result exponent whenever `zSig' is a complete, normalized
356 -------------------------------------------------------------------------------
358 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
361 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
366 -------------------------------------------------------------------------------
367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 and significand `zSig', and returns the proper double-precision floating-
369 point value corresponding to the abstract input. Ordinarily, the abstract
370 value is simply rounded and packed into the double-precision format, with
371 the inexact exception raised if the abstract input cannot be represented
372 exactly. If the abstract value is too large, however, the overflow and
373 inexact exceptions are raised and an infinity or maximal finite value is
374 returned. If the abstract value is too small, the input value is rounded to
375 a subnormal number, and the underflow and inexact exceptions are raised if
376 the abstract input cannot be represented exactly as a subnormal double-
377 precision floating-point number.
378 The input significand `zSig' has its binary point between bits 62
379 and 61, which is 10 bits to the left of the usual location. This shifted
380 significand must be normalized or smaller. If `zSig' is not normalized,
381 `zExp' must be 0; in that case, the result returned is a subnormal number,
382 and it must not require rounding. In the usual case that `zSig' is
383 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
384 The handling of underflow and overflow follows the IEC/IEEE Standard for
385 Binary Floating-point Arithmetic.
386 -------------------------------------------------------------------------------
388 static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
391 flag roundNearestEven;
392 int16 roundIncrement, roundBits;
395 roundingMode = roundData->mode;
396 roundNearestEven = ( roundingMode == float_round_nearest_even );
397 roundIncrement = 0x200;
398 if ( ! roundNearestEven ) {
399 if ( roundingMode == float_round_to_zero ) {
403 roundIncrement = 0x3FF;
405 if ( roundingMode == float_round_up ) roundIncrement = 0;
408 if ( roundingMode == float_round_down ) roundIncrement = 0;
412 roundBits = zSig & 0x3FF;
413 if ( 0x7FD <= (bits16) zExp ) {
414 if ( ( 0x7FD < zExp )
415 || ( ( zExp == 0x7FD )
416 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
418 //register int lr = __builtin_return_address(0);
419 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
420 roundData->exception |= float_flag_overflow | float_flag_inexact;
421 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
425 ( float_detect_tininess == float_tininess_before_rounding )
427 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
428 shift64RightJamming( zSig, - zExp, &zSig );
430 roundBits = zSig & 0x3FF;
431 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
434 if ( roundBits ) roundData->exception |= float_flag_inexact;
435 zSig = ( zSig + roundIncrement )>>10;
436 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
437 if ( zSig == 0 ) zExp = 0;
438 return packFloat64( zSign, zExp, zSig );
443 -------------------------------------------------------------------------------
444 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
445 and significand `zSig', and returns the proper double-precision floating-
446 point value corresponding to the abstract input. This routine is just like
447 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
448 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
450 -------------------------------------------------------------------------------
453 normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
457 shiftCount = countLeadingZeros64( zSig ) - 1;
458 return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
465 -------------------------------------------------------------------------------
466 Returns the fraction bits of the extended double-precision floating-point
468 -------------------------------------------------------------------------------
470 INLINE bits64 extractFloatx80Frac( floatx80 a )
478 -------------------------------------------------------------------------------
479 Returns the exponent bits of the extended double-precision floating-point
481 -------------------------------------------------------------------------------
483 INLINE int32 extractFloatx80Exp( floatx80 a )
486 return a.high & 0x7FFF;
491 -------------------------------------------------------------------------------
492 Returns the sign bit of the extended double-precision floating-point value
494 -------------------------------------------------------------------------------
496 INLINE flag extractFloatx80Sign( floatx80 a )
504 -------------------------------------------------------------------------------
505 Normalizes the subnormal extended double-precision floating-point value
506 represented by the denormalized significand `aSig'. The normalized exponent
507 and significand are stored at the locations pointed to by `zExpPtr' and
508 `zSigPtr', respectively.
509 -------------------------------------------------------------------------------
512 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
516 shiftCount = countLeadingZeros64( aSig );
517 *zSigPtr = aSig<<shiftCount;
518 *zExpPtr = 1 - shiftCount;
523 -------------------------------------------------------------------------------
524 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
525 extended double-precision floating-point value, returning the result.
526 -------------------------------------------------------------------------------
528 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
533 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
539 -------------------------------------------------------------------------------
540 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
541 and extended significand formed by the concatenation of `zSig0' and `zSig1',
542 and returns the proper extended double-precision floating-point value
543 corresponding to the abstract input. Ordinarily, the abstract value is
544 rounded and packed into the extended double-precision format, with the
545 inexact exception raised if the abstract input cannot be represented
546 exactly. If the abstract value is too large, however, the overflow and
547 inexact exceptions are raised and an infinity or maximal finite value is
548 returned. If the abstract value is too small, the input value is rounded to
549 a subnormal number, and the underflow and inexact exceptions are raised if
550 the abstract input cannot be represented exactly as a subnormal extended
551 double-precision floating-point number.
552 If `roundingPrecision' is 32 or 64, the result is rounded to the same
553 number of bits as single or double precision, respectively. Otherwise, the
554 result is rounded to the full precision of the extended double-precision
556 The input significand must be normalized or smaller. If the input
557 significand is not normalized, `zExp' must be 0; in that case, the result
558 returned is a subnormal number, and it must not require rounding. The
559 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
560 Floating-point Arithmetic.
561 -------------------------------------------------------------------------------
564 roundAndPackFloatx80(
565 struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
568 int8 roundingMode, roundingPrecision;
569 flag roundNearestEven, increment, isTiny;
570 int64 roundIncrement, roundMask, roundBits;
572 roundingMode = roundData->mode;
573 roundingPrecision = roundData->precision;
574 roundNearestEven = ( roundingMode == float_round_nearest_even );
575 if ( roundingPrecision == 80 ) goto precision80;
576 if ( roundingPrecision == 64 ) {
577 roundIncrement = LIT64( 0x0000000000000400 );
578 roundMask = LIT64( 0x00000000000007FF );
580 else if ( roundingPrecision == 32 ) {
581 roundIncrement = LIT64( 0x0000008000000000 );
582 roundMask = LIT64( 0x000000FFFFFFFFFF );
587 zSig0 |= ( zSig1 != 0 );
588 if ( ! roundNearestEven ) {
589 if ( roundingMode == float_round_to_zero ) {
593 roundIncrement = roundMask;
595 if ( roundingMode == float_round_up ) roundIncrement = 0;
598 if ( roundingMode == float_round_down ) roundIncrement = 0;
602 roundBits = zSig0 & roundMask;
603 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
604 if ( ( 0x7FFE < zExp )
605 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
611 ( float_detect_tininess == float_tininess_before_rounding )
613 || ( zSig0 <= zSig0 + roundIncrement );
614 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
616 roundBits = zSig0 & roundMask;
617 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
618 if ( roundBits ) roundData->exception |= float_flag_inexact;
619 zSig0 += roundIncrement;
620 if ( (sbits64) zSig0 < 0 ) zExp = 1;
621 roundIncrement = roundMask + 1;
622 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
623 roundMask |= roundIncrement;
625 zSig0 &= ~ roundMask;
626 return packFloatx80( zSign, zExp, zSig0 );
629 if ( roundBits ) roundData->exception |= float_flag_inexact;
630 zSig0 += roundIncrement;
631 if ( zSig0 < roundIncrement ) {
633 zSig0 = LIT64( 0x8000000000000000 );
635 roundIncrement = roundMask + 1;
636 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
637 roundMask |= roundIncrement;
639 zSig0 &= ~ roundMask;
640 if ( zSig0 == 0 ) zExp = 0;
641 return packFloatx80( zSign, zExp, zSig0 );
643 increment = ( (sbits64) zSig1 < 0 );
644 if ( ! roundNearestEven ) {
645 if ( roundingMode == float_round_to_zero ) {
650 increment = ( roundingMode == float_round_down ) && zSig1;
653 increment = ( roundingMode == float_round_up ) && zSig1;
657 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
658 if ( ( 0x7FFE < zExp )
659 || ( ( zExp == 0x7FFE )
660 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
666 roundData->exception |= float_flag_overflow | float_flag_inexact;
667 if ( ( roundingMode == float_round_to_zero )
668 || ( zSign && ( roundingMode == float_round_up ) )
669 || ( ! zSign && ( roundingMode == float_round_down ) )
671 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
673 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
677 ( float_detect_tininess == float_tininess_before_rounding )
680 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
681 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
683 if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
684 if ( zSig1 ) roundData->exception |= float_flag_inexact;
685 if ( roundNearestEven ) {
686 increment = ( (sbits64) zSig1 < 0 );
690 increment = ( roundingMode == float_round_down ) && zSig1;
693 increment = ( roundingMode == float_round_up ) && zSig1;
698 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
699 if ( (sbits64) zSig0 < 0 ) zExp = 1;
701 return packFloatx80( zSign, zExp, zSig0 );
704 if ( zSig1 ) roundData->exception |= float_flag_inexact;
709 zSig0 = LIT64( 0x8000000000000000 );
712 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
716 if ( zSig0 == 0 ) zExp = 0;
719 return packFloatx80( zSign, zExp, zSig0 );
723 -------------------------------------------------------------------------------
724 Takes an abstract floating-point value having sign `zSign', exponent
725 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
726 and returns the proper extended double-precision floating-point value
727 corresponding to the abstract input. This routine is just like
728 `roundAndPackFloatx80' except that the input significand does not have to be
730 -------------------------------------------------------------------------------
733 normalizeRoundAndPackFloatx80(
734 struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
744 shiftCount = countLeadingZeros64( zSig0 );
745 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
748 roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
755 -------------------------------------------------------------------------------
756 Returns the result of converting the 32-bit two's complement integer `a' to
757 the single-precision floating-point format. The conversion is performed
758 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
759 -------------------------------------------------------------------------------
761 float32 int32_to_float32(struct roundingData *roundData, int32 a)
765 if ( a == 0 ) return 0;
766 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
768 return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
773 -------------------------------------------------------------------------------
774 Returns the result of converting the 32-bit two's complement integer `a' to
775 the double-precision floating-point format. The conversion is performed
776 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
777 -------------------------------------------------------------------------------
779 float64 int32_to_float64( int32 a )
786 if ( a == 0 ) return 0;
788 absA = aSign ? - a : a;
789 shiftCount = countLeadingZeros32( absA ) + 21;
791 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
798 -------------------------------------------------------------------------------
799 Returns the result of converting the 32-bit two's complement integer `a'
800 to the extended double-precision floating-point format. The conversion
801 is performed according to the IEC/IEEE Standard for Binary Floating-point
803 -------------------------------------------------------------------------------
805 floatx80 int32_to_floatx80( int32 a )
812 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
814 absA = zSign ? - a : a;
815 shiftCount = countLeadingZeros32( absA ) + 32;
817 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
824 -------------------------------------------------------------------------------
825 Returns the result of converting the single-precision floating-point value
826 `a' to the 32-bit two's complement integer format. The conversion is
827 performed according to the IEC/IEEE Standard for Binary Floating-point
828 Arithmetic---which means in particular that the conversion is rounded
829 according to the current rounding mode. If `a' is a NaN, the largest
830 positive integer is returned. Otherwise, if the conversion overflows, the
831 largest integer with the same sign as `a' is returned.
832 -------------------------------------------------------------------------------
834 int32 float32_to_int32( struct roundingData *roundData, float32 a )
837 int16 aExp, shiftCount;
841 aSig = extractFloat32Frac( a );
842 aExp = extractFloat32Exp( a );
843 aSign = extractFloat32Sign( a );
844 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
845 if ( aExp ) aSig |= 0x00800000;
846 shiftCount = 0xAF - aExp;
849 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
850 return roundAndPackInt32( roundData, aSign, zSig );
855 -------------------------------------------------------------------------------
856 Returns the result of converting the single-precision floating-point value
857 `a' to the 32-bit two's complement integer format. The conversion is
858 performed according to the IEC/IEEE Standard for Binary Floating-point
859 Arithmetic, except that the conversion is always rounded toward zero. If
860 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
861 conversion overflows, the largest integer with the same sign as `a' is
863 -------------------------------------------------------------------------------
865 int32 float32_to_int32_round_to_zero( float32 a )
868 int16 aExp, shiftCount;
872 aSig = extractFloat32Frac( a );
873 aExp = extractFloat32Exp( a );
874 aSign = extractFloat32Sign( a );
875 shiftCount = aExp - 0x9E;
876 if ( 0 <= shiftCount ) {
877 if ( a == 0xCF000000 ) return 0x80000000;
878 float_raise( float_flag_invalid );
879 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
882 else if ( aExp <= 0x7E ) {
883 if ( aExp | aSig ) float_raise( float_flag_inexact );
886 aSig = ( aSig | 0x00800000 )<<8;
887 z = aSig>>( - shiftCount );
888 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
889 float_raise( float_flag_inexact );
891 return aSign ? - z : z;
896 -------------------------------------------------------------------------------
897 Returns the result of converting the single-precision floating-point value
898 `a' to the double-precision floating-point format. The conversion is
899 performed according to the IEC/IEEE Standard for Binary Floating-point
901 -------------------------------------------------------------------------------
903 float64 float32_to_float64( float32 a )
909 aSig = extractFloat32Frac( a );
910 aExp = extractFloat32Exp( a );
911 aSign = extractFloat32Sign( a );
912 if ( aExp == 0xFF ) {
913 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
914 return packFloat64( aSign, 0x7FF, 0 );
917 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
918 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
921 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
928 -------------------------------------------------------------------------------
929 Returns the result of converting the single-precision floating-point value
930 `a' to the extended double-precision floating-point format. The conversion
931 is performed according to the IEC/IEEE Standard for Binary Floating-point
933 -------------------------------------------------------------------------------
935 floatx80 float32_to_floatx80( float32 a )
941 aSig = extractFloat32Frac( a );
942 aExp = extractFloat32Exp( a );
943 aSign = extractFloat32Sign( a );
944 if ( aExp == 0xFF ) {
945 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
946 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
949 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
950 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
953 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
960 -------------------------------------------------------------------------------
961 Rounds the single-precision floating-point value `a' to an integer, and
962 returns the result as a single-precision floating-point value. The
963 operation is performed according to the IEC/IEEE Standard for Binary
964 Floating-point Arithmetic.
965 -------------------------------------------------------------------------------
967 float32 float32_round_to_int( struct roundingData *roundData, float32 a )
971 bits32 lastBitMask, roundBitsMask;
975 aExp = extractFloat32Exp( a );
976 if ( 0x96 <= aExp ) {
977 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
978 return propagateFloat32NaN( a, a );
982 roundingMode = roundData->mode;
983 if ( aExp <= 0x7E ) {
984 if ( (bits32) ( a<<1 ) == 0 ) return a;
985 roundData->exception |= float_flag_inexact;
986 aSign = extractFloat32Sign( a );
987 switch ( roundingMode ) {
988 case float_round_nearest_even:
989 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
990 return packFloat32( aSign, 0x7F, 0 );
993 case float_round_down:
994 return aSign ? 0xBF800000 : 0;
996 return aSign ? 0x80000000 : 0x3F800000;
998 return packFloat32( aSign, 0, 0 );
1001 lastBitMask <<= 0x96 - aExp;
1002 roundBitsMask = lastBitMask - 1;
1004 if ( roundingMode == float_round_nearest_even ) {
1005 z += lastBitMask>>1;
1006 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1008 else if ( roundingMode != float_round_to_zero ) {
1009 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1013 z &= ~ roundBitsMask;
1014 if ( z != a ) roundData->exception |= float_flag_inexact;
1020 -------------------------------------------------------------------------------
1021 Returns the result of adding the absolute values of the single-precision
1022 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1023 before being returned. `zSign' is ignored if the result is a NaN. The
1024 addition is performed according to the IEC/IEEE Standard for Binary
1025 Floating-point Arithmetic.
1026 -------------------------------------------------------------------------------
1028 static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1030 int16 aExp, bExp, zExp;
1031 bits32 aSig, bSig, zSig;
1034 aSig = extractFloat32Frac( a );
1035 aExp = extractFloat32Exp( a );
1036 bSig = extractFloat32Frac( b );
1037 bExp = extractFloat32Exp( b );
1038 expDiff = aExp - bExp;
1041 if ( 0 < expDiff ) {
1042 if ( aExp == 0xFF ) {
1043 if ( aSig ) return propagateFloat32NaN( a, b );
1052 shift32RightJamming( bSig, expDiff, &bSig );
1055 else if ( expDiff < 0 ) {
1056 if ( bExp == 0xFF ) {
1057 if ( bSig ) return propagateFloat32NaN( a, b );
1058 return packFloat32( zSign, 0xFF, 0 );
1066 shift32RightJamming( aSig, - expDiff, &aSig );
1070 if ( aExp == 0xFF ) {
1071 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1074 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1075 zSig = 0x40000000 + aSig + bSig;
1080 zSig = ( aSig + bSig )<<1;
1082 if ( (sbits32) zSig < 0 ) {
1087 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1092 -------------------------------------------------------------------------------
1093 Returns the result of subtracting the absolute values of the single-
1094 precision floating-point values `a' and `b'. If `zSign' is true, the
1095 difference is negated before being returned. `zSign' is ignored if the
1096 result is a NaN. The subtraction is performed according to the IEC/IEEE
1097 Standard for Binary Floating-point Arithmetic.
1098 -------------------------------------------------------------------------------
1100 static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1102 int16 aExp, bExp, zExp;
1103 bits32 aSig, bSig, zSig;
1106 aSig = extractFloat32Frac( a );
1107 aExp = extractFloat32Exp( a );
1108 bSig = extractFloat32Frac( b );
1109 bExp = extractFloat32Exp( b );
1110 expDiff = aExp - bExp;
1113 if ( 0 < expDiff ) goto aExpBigger;
1114 if ( expDiff < 0 ) goto bExpBigger;
1115 if ( aExp == 0xFF ) {
1116 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1117 roundData->exception |= float_flag_invalid;
1118 return float32_default_nan;
1124 if ( bSig < aSig ) goto aBigger;
1125 if ( aSig < bSig ) goto bBigger;
1126 return packFloat32( roundData->mode == float_round_down, 0, 0 );
1128 if ( bExp == 0xFF ) {
1129 if ( bSig ) return propagateFloat32NaN( a, b );
1130 return packFloat32( zSign ^ 1, 0xFF, 0 );
1138 shift32RightJamming( aSig, - expDiff, &aSig );
1144 goto normalizeRoundAndPack;
1146 if ( aExp == 0xFF ) {
1147 if ( aSig ) return propagateFloat32NaN( a, b );
1156 shift32RightJamming( bSig, expDiff, &bSig );
1161 normalizeRoundAndPack:
1163 return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );
1168 -------------------------------------------------------------------------------
1169 Returns the result of adding the single-precision floating-point values `a'
1170 and `b'. The operation is performed according to the IEC/IEEE Standard for
1171 Binary Floating-point Arithmetic.
1172 -------------------------------------------------------------------------------
1174 float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
1178 aSign = extractFloat32Sign( a );
1179 bSign = extractFloat32Sign( b );
1180 if ( aSign == bSign ) {
1181 return addFloat32Sigs( roundData, a, b, aSign );
1184 return subFloat32Sigs( roundData, a, b, aSign );
1190 -------------------------------------------------------------------------------
1191 Returns the result of subtracting the single-precision floating-point values
1192 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1193 for Binary Floating-point Arithmetic.
1194 -------------------------------------------------------------------------------
1196 float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
1200 aSign = extractFloat32Sign( a );
1201 bSign = extractFloat32Sign( b );
1202 if ( aSign == bSign ) {
1203 return subFloat32Sigs( roundData, a, b, aSign );
1206 return addFloat32Sigs( roundData, a, b, aSign );
1212 -------------------------------------------------------------------------------
1213 Returns the result of multiplying the single-precision floating-point values
1214 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1215 for Binary Floating-point Arithmetic.
1216 -------------------------------------------------------------------------------
1218 float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
1220 flag aSign, bSign, zSign;
1221 int16 aExp, bExp, zExp;
1226 aSig = extractFloat32Frac( a );
1227 aExp = extractFloat32Exp( a );
1228 aSign = extractFloat32Sign( a );
1229 bSig = extractFloat32Frac( b );
1230 bExp = extractFloat32Exp( b );
1231 bSign = extractFloat32Sign( b );
1232 zSign = aSign ^ bSign;
1233 if ( aExp == 0xFF ) {
1234 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1235 return propagateFloat32NaN( a, b );
1237 if ( ( bExp | bSig ) == 0 ) {
1238 roundData->exception |= float_flag_invalid;
1239 return float32_default_nan;
1241 return packFloat32( zSign, 0xFF, 0 );
1243 if ( bExp == 0xFF ) {
1244 if ( bSig ) return propagateFloat32NaN( a, b );
1245 if ( ( aExp | aSig ) == 0 ) {
1246 roundData->exception |= float_flag_invalid;
1247 return float32_default_nan;
1249 return packFloat32( zSign, 0xFF, 0 );
1252 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1253 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1256 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1257 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1259 zExp = aExp + bExp - 0x7F;
1260 aSig = ( aSig | 0x00800000 )<<7;
1261 bSig = ( bSig | 0x00800000 )<<8;
1262 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1264 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1268 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1273 -------------------------------------------------------------------------------
1274 Returns the result of dividing the single-precision floating-point value `a'
1275 by the corresponding value `b'. The operation is performed according to the
1276 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1277 -------------------------------------------------------------------------------
1279 float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
1281 flag aSign, bSign, zSign;
1282 int16 aExp, bExp, zExp;
1283 bits32 aSig, bSig, zSig;
1285 aSig = extractFloat32Frac( a );
1286 aExp = extractFloat32Exp( a );
1287 aSign = extractFloat32Sign( a );
1288 bSig = extractFloat32Frac( b );
1289 bExp = extractFloat32Exp( b );
1290 bSign = extractFloat32Sign( b );
1291 zSign = aSign ^ bSign;
1292 if ( aExp == 0xFF ) {
1293 if ( aSig ) return propagateFloat32NaN( a, b );
1294 if ( bExp == 0xFF ) {
1295 if ( bSig ) return propagateFloat32NaN( a, b );
1296 roundData->exception |= float_flag_invalid;
1297 return float32_default_nan;
1299 return packFloat32( zSign, 0xFF, 0 );
1301 if ( bExp == 0xFF ) {
1302 if ( bSig ) return propagateFloat32NaN( a, b );
1303 return packFloat32( zSign, 0, 0 );
1307 if ( ( aExp | aSig ) == 0 ) {
1308 roundData->exception |= float_flag_invalid;
1309 return float32_default_nan;
1311 roundData->exception |= float_flag_divbyzero;
1312 return packFloat32( zSign, 0xFF, 0 );
1314 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1317 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1318 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1320 zExp = aExp - bExp + 0x7D;
1321 aSig = ( aSig | 0x00800000 )<<7;
1322 bSig = ( bSig | 0x00800000 )<<8;
1323 if ( bSig <= ( aSig + aSig ) ) {
1328 bits64 tmp = ( (bits64) aSig )<<32;
1329 do_div( tmp, bSig );
1332 if ( ( zSig & 0x3F ) == 0 ) {
1333 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1335 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1340 -------------------------------------------------------------------------------
1341 Returns the remainder of the single-precision floating-point value `a'
1342 with respect to the corresponding value `b'. The operation is performed
1343 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1344 -------------------------------------------------------------------------------
1346 float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
1348 flag aSign, bSign, zSign;
1349 int16 aExp, bExp, expDiff;
1352 bits64 aSig64, bSig64, q64;
1353 bits32 alternateASig;
1356 aSig = extractFloat32Frac( a );
1357 aExp = extractFloat32Exp( a );
1358 aSign = extractFloat32Sign( a );
1359 bSig = extractFloat32Frac( b );
1360 bExp = extractFloat32Exp( b );
1361 bSign = extractFloat32Sign( b );
1362 if ( aExp == 0xFF ) {
1363 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1364 return propagateFloat32NaN( a, b );
1366 roundData->exception |= float_flag_invalid;
1367 return float32_default_nan;
1369 if ( bExp == 0xFF ) {
1370 if ( bSig ) return propagateFloat32NaN( a, b );
1375 roundData->exception |= float_flag_invalid;
1376 return float32_default_nan;
1378 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1381 if ( aSig == 0 ) return a;
1382 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1384 expDiff = aExp - bExp;
1387 if ( expDiff < 32 ) {
1390 if ( expDiff < 0 ) {
1391 if ( expDiff < -1 ) return a;
1394 q = ( bSig <= aSig );
1395 if ( q ) aSig -= bSig;
1396 if ( 0 < expDiff ) {
1397 bits64 tmp = ( (bits64) aSig )<<32;
1398 do_div( tmp, bSig );
1402 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1410 if ( bSig <= aSig ) aSig -= bSig;
1411 aSig64 = ( (bits64) aSig )<<40;
1412 bSig64 = ( (bits64) bSig )<<40;
1414 while ( 0 < expDiff ) {
1415 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1416 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1417 aSig64 = - ( ( bSig * q64 )<<38 );
1421 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1422 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1423 q = q64>>( 64 - expDiff );
1425 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1428 alternateASig = aSig;
1431 } while ( 0 <= (sbits32) aSig );
1432 sigMean = aSig + alternateASig;
1433 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1434 aSig = alternateASig;
1436 zSign = ( (sbits32) aSig < 0 );
1437 if ( zSign ) aSig = - aSig;
1438 return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );
1443 -------------------------------------------------------------------------------
1444 Returns the square root of the single-precision floating-point value `a'.
1445 The operation is performed according to the IEC/IEEE Standard for Binary
1446 Floating-point Arithmetic.
1447 -------------------------------------------------------------------------------
1449 float32 float32_sqrt( struct roundingData *roundData, float32 a )
1456 aSig = extractFloat32Frac( a );
1457 aExp = extractFloat32Exp( a );
1458 aSign = extractFloat32Sign( a );
1459 if ( aExp == 0xFF ) {
1460 if ( aSig ) return propagateFloat32NaN( a, 0 );
1461 if ( ! aSign ) return a;
1462 roundData->exception |= float_flag_invalid;
1463 return float32_default_nan;
1466 if ( ( aExp | aSig ) == 0 ) return a;
1467 roundData->exception |= float_flag_invalid;
1468 return float32_default_nan;
1471 if ( aSig == 0 ) return 0;
1472 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1474 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1475 aSig = ( aSig | 0x00800000 )<<8;
1476 zSig = estimateSqrt32( aExp, aSig ) + 2;
1477 if ( ( zSig & 0x7F ) <= 5 ) {
1483 term = ( (bits64) zSig ) * zSig;
1484 rem = ( ( (bits64) aSig )<<32 ) - term;
1485 while ( (sbits64) rem < 0 ) {
1487 rem += ( ( (bits64) zSig )<<1 ) | 1;
1489 zSig |= ( rem != 0 );
1492 shift32RightJamming( zSig, 1, &zSig );
1493 return roundAndPackFloat32( roundData, 0, zExp, zSig );
1498 -------------------------------------------------------------------------------
1499 Returns 1 if the single-precision floating-point value `a' is equal to the
1500 corresponding value `b', and 0 otherwise. The comparison is performed
1501 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1502 -------------------------------------------------------------------------------
1504 flag float32_eq( float32 a, float32 b )
1507 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1508 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1510 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1511 float_raise( float_flag_invalid );
1515 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1520 -------------------------------------------------------------------------------
1521 Returns 1 if the single-precision floating-point value `a' is less than or
1522 equal to the corresponding value `b', and 0 otherwise. The comparison is
1523 performed according to the IEC/IEEE Standard for Binary Floating-point
1525 -------------------------------------------------------------------------------
1527 flag float32_le( float32 a, float32 b )
1531 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1532 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1534 float_raise( float_flag_invalid );
1537 aSign = extractFloat32Sign( a );
1538 bSign = extractFloat32Sign( b );
1539 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1540 return ( a == b ) || ( aSign ^ ( a < b ) );
1545 -------------------------------------------------------------------------------
1546 Returns 1 if the single-precision floating-point value `a' is less than
1547 the corresponding value `b', and 0 otherwise. The comparison is performed
1548 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1549 -------------------------------------------------------------------------------
1551 flag float32_lt( float32 a, float32 b )
1555 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1556 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1558 float_raise( float_flag_invalid );
1561 aSign = extractFloat32Sign( a );
1562 bSign = extractFloat32Sign( b );
1563 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1564 return ( a != b ) && ( aSign ^ ( a < b ) );
1569 -------------------------------------------------------------------------------
1570 Returns 1 if the single-precision floating-point value `a' is equal to the
1571 corresponding value `b', and 0 otherwise. The invalid exception is raised
1572 if either operand is a NaN. Otherwise, the comparison is performed
1573 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1574 -------------------------------------------------------------------------------
1576 flag float32_eq_signaling( float32 a, float32 b )
1579 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1580 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1582 float_raise( float_flag_invalid );
1585 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1590 -------------------------------------------------------------------------------
1591 Returns 1 if the single-precision floating-point value `a' is less than or
1592 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1593 cause an exception. Otherwise, the comparison is performed according to the
1594 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1595 -------------------------------------------------------------------------------
1597 flag float32_le_quiet( float32 a, float32 b )
1602 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1603 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1605 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1606 float_raise( float_flag_invalid );
1610 aSign = extractFloat32Sign( a );
1611 bSign = extractFloat32Sign( b );
1612 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1613 return ( a == b ) || ( aSign ^ ( a < b ) );
1618 -------------------------------------------------------------------------------
1619 Returns 1 if the single-precision floating-point value `a' is less than
1620 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1621 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1622 Standard for Binary Floating-point Arithmetic.
1623 -------------------------------------------------------------------------------
1625 flag float32_lt_quiet( float32 a, float32 b )
1629 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1630 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1632 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1633 float_raise( float_flag_invalid );
1637 aSign = extractFloat32Sign( a );
1638 bSign = extractFloat32Sign( b );
1639 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1640 return ( a != b ) && ( aSign ^ ( a < b ) );
1645 -------------------------------------------------------------------------------
1646 Returns the result of converting the double-precision floating-point value
1647 `a' to the 32-bit two's complement integer format. The conversion is
1648 performed according to the IEC/IEEE Standard for Binary Floating-point
1649 Arithmetic---which means in particular that the conversion is rounded
1650 according to the current rounding mode. If `a' is a NaN, the largest
1651 positive integer is returned. Otherwise, if the conversion overflows, the
1652 largest integer with the same sign as `a' is returned.
1653 -------------------------------------------------------------------------------
1655 int32 float64_to_int32( struct roundingData *roundData, float64 a )
1658 int16 aExp, shiftCount;
1661 aSig = extractFloat64Frac( a );
1662 aExp = extractFloat64Exp( a );
1663 aSign = extractFloat64Sign( a );
1664 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1665 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1666 shiftCount = 0x42C - aExp;
1667 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1668 return roundAndPackInt32( roundData, aSign, aSig );
1673 -------------------------------------------------------------------------------
1674 Returns the result of converting the double-precision floating-point value
1675 `a' to the 32-bit two's complement integer format. The conversion is
1676 performed according to the IEC/IEEE Standard for Binary Floating-point
1677 Arithmetic, except that the conversion is always rounded toward zero. If
1678 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1679 conversion overflows, the largest integer with the same sign as `a' is
1681 -------------------------------------------------------------------------------
1683 int32 float64_to_int32_round_to_zero( float64 a )
1686 int16 aExp, shiftCount;
1687 bits64 aSig, savedASig;
1690 aSig = extractFloat64Frac( a );
1691 aExp = extractFloat64Exp( a );
1692 aSign = extractFloat64Sign( a );
1693 shiftCount = 0x433 - aExp;
1694 if ( shiftCount < 21 ) {
1695 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1698 else if ( 52 < shiftCount ) {
1699 if ( aExp || aSig ) float_raise( float_flag_inexact );
1702 aSig |= LIT64( 0x0010000000000000 );
1704 aSig >>= shiftCount;
1706 if ( aSign ) z = - z;
1707 if ( ( z < 0 ) ^ aSign ) {
1709 float_raise( float_flag_invalid );
1710 return aSign ? 0x80000000 : 0x7FFFFFFF;
1712 if ( ( aSig<<shiftCount ) != savedASig ) {
1713 float_raise( float_flag_inexact );
1720 -------------------------------------------------------------------------------
1721 Returns the result of converting the double-precision floating-point value
1722 `a' to the 32-bit two's complement unsigned integer format. The conversion
1723 is performed according to the IEC/IEEE Standard for Binary Floating-point
1724 Arithmetic---which means in particular that the conversion is rounded
1725 according to the current rounding mode. If `a' is a NaN, the largest
1726 positive integer is returned. Otherwise, if the conversion overflows, the
1727 largest positive integer is returned.
1728 -------------------------------------------------------------------------------
1730 int32 float64_to_uint32( struct roundingData *roundData, float64 a )
1733 int16 aExp, shiftCount;
1736 aSig = extractFloat64Frac( a );
1737 aExp = extractFloat64Exp( a );
1738 aSign = 0; //extractFloat64Sign( a );
1739 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1740 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1741 shiftCount = 0x42C - aExp;
1742 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1743 return roundAndPackInt32( roundData, aSign, aSig );
1747 -------------------------------------------------------------------------------
1748 Returns the result of converting the double-precision floating-point value
1749 `a' to the 32-bit two's complement integer format. The conversion is
1750 performed according to the IEC/IEEE Standard for Binary Floating-point
1751 Arithmetic, except that the conversion is always rounded toward zero. If
1752 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1753 conversion overflows, the largest positive integer is returned.
1754 -------------------------------------------------------------------------------
1756 int32 float64_to_uint32_round_to_zero( float64 a )
1759 int16 aExp, shiftCount;
1760 bits64 aSig, savedASig;
1763 aSig = extractFloat64Frac( a );
1764 aExp = extractFloat64Exp( a );
1765 aSign = extractFloat64Sign( a );
1766 shiftCount = 0x433 - aExp;
1767 if ( shiftCount < 21 ) {
1768 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1771 else if ( 52 < shiftCount ) {
1772 if ( aExp || aSig ) float_raise( float_flag_inexact );
1775 aSig |= LIT64( 0x0010000000000000 );
1777 aSig >>= shiftCount;
1779 if ( aSign ) z = - z;
1780 if ( ( z < 0 ) ^ aSign ) {
1782 float_raise( float_flag_invalid );
1783 return aSign ? 0x80000000 : 0x7FFFFFFF;
1785 if ( ( aSig<<shiftCount ) != savedASig ) {
1786 float_raise( float_flag_inexact );
1792 -------------------------------------------------------------------------------
1793 Returns the result of converting the double-precision floating-point value
1794 `a' to the single-precision floating-point format. The conversion is
1795 performed according to the IEC/IEEE Standard for Binary Floating-point
1797 -------------------------------------------------------------------------------
1799 float32 float64_to_float32( struct roundingData *roundData, float64 a )
1806 aSig = extractFloat64Frac( a );
1807 aExp = extractFloat64Exp( a );
1808 aSign = extractFloat64Sign( a );
1809 if ( aExp == 0x7FF ) {
1810 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1811 return packFloat32( aSign, 0xFF, 0 );
1813 shift64RightJamming( aSig, 22, &aSig );
1815 if ( aExp || zSig ) {
1819 return roundAndPackFloat32( roundData, aSign, aExp, zSig );
1826 -------------------------------------------------------------------------------
1827 Returns the result of converting the double-precision floating-point value
1828 `a' to the extended double-precision floating-point format. The conversion
1829 is performed according to the IEC/IEEE Standard for Binary Floating-point
1831 -------------------------------------------------------------------------------
1833 floatx80 float64_to_floatx80( float64 a )
1839 aSig = extractFloat64Frac( a );
1840 aExp = extractFloat64Exp( a );
1841 aSign = extractFloat64Sign( a );
1842 if ( aExp == 0x7FF ) {
1843 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1844 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1847 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1848 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1852 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1859 -------------------------------------------------------------------------------
1860 Rounds the double-precision floating-point value `a' to an integer, and
1861 returns the result as a double-precision floating-point value. The
1862 operation is performed according to the IEC/IEEE Standard for Binary
1863 Floating-point Arithmetic.
1864 -------------------------------------------------------------------------------
1866 float64 float64_round_to_int( struct roundingData *roundData, float64 a )
1870 bits64 lastBitMask, roundBitsMask;
1874 aExp = extractFloat64Exp( a );
1875 if ( 0x433 <= aExp ) {
1876 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1877 return propagateFloat64NaN( a, a );
1881 if ( aExp <= 0x3FE ) {
1882 if ( (bits64) ( a<<1 ) == 0 ) return a;
1883 roundData->exception |= float_flag_inexact;
1884 aSign = extractFloat64Sign( a );
1885 switch ( roundData->mode ) {
1886 case float_round_nearest_even:
1887 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1888 return packFloat64( aSign, 0x3FF, 0 );
1891 case float_round_down:
1892 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1893 case float_round_up:
1895 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1897 return packFloat64( aSign, 0, 0 );
1900 lastBitMask <<= 0x433 - aExp;
1901 roundBitsMask = lastBitMask - 1;
1903 roundingMode = roundData->mode;
1904 if ( roundingMode == float_round_nearest_even ) {
1905 z += lastBitMask>>1;
1906 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1908 else if ( roundingMode != float_round_to_zero ) {
1909 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1913 z &= ~ roundBitsMask;
1914 if ( z != a ) roundData->exception |= float_flag_inexact;
1920 -------------------------------------------------------------------------------
1921 Returns the result of adding the absolute values of the double-precision
1922 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1923 before being returned. `zSign' is ignored if the result is a NaN. The
1924 addition is performed according to the IEC/IEEE Standard for Binary
1925 Floating-point Arithmetic.
1926 -------------------------------------------------------------------------------
1928 static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1930 int16 aExp, bExp, zExp;
1931 bits64 aSig, bSig, zSig;
1934 aSig = extractFloat64Frac( a );
1935 aExp = extractFloat64Exp( a );
1936 bSig = extractFloat64Frac( b );
1937 bExp = extractFloat64Exp( b );
1938 expDiff = aExp - bExp;
1941 if ( 0 < expDiff ) {
1942 if ( aExp == 0x7FF ) {
1943 if ( aSig ) return propagateFloat64NaN( a, b );
1950 bSig |= LIT64( 0x2000000000000000 );
1952 shift64RightJamming( bSig, expDiff, &bSig );
1955 else if ( expDiff < 0 ) {
1956 if ( bExp == 0x7FF ) {
1957 if ( bSig ) return propagateFloat64NaN( a, b );
1958 return packFloat64( zSign, 0x7FF, 0 );
1964 aSig |= LIT64( 0x2000000000000000 );
1966 shift64RightJamming( aSig, - expDiff, &aSig );
1970 if ( aExp == 0x7FF ) {
1971 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1974 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1975 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1979 aSig |= LIT64( 0x2000000000000000 );
1980 zSig = ( aSig + bSig )<<1;
1982 if ( (sbits64) zSig < 0 ) {
1987 return roundAndPackFloat64( roundData, zSign, zExp, zSig );
1992 -------------------------------------------------------------------------------
1993 Returns the result of subtracting the absolute values of the double-
1994 precision floating-point values `a' and `b'. If `zSign' is true, the
1995 difference is negated before being returned. `zSign' is ignored if the
1996 result is a NaN. The subtraction is performed according to the IEC/IEEE
1997 Standard for Binary Floating-point Arithmetic.
1998 -------------------------------------------------------------------------------
2000 static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
2002 int16 aExp, bExp, zExp;
2003 bits64 aSig, bSig, zSig;
2006 aSig = extractFloat64Frac( a );
2007 aExp = extractFloat64Exp( a );
2008 bSig = extractFloat64Frac( b );
2009 bExp = extractFloat64Exp( b );
2010 expDiff = aExp - bExp;
2013 if ( 0 < expDiff ) goto aExpBigger;
2014 if ( expDiff < 0 ) goto bExpBigger;
2015 if ( aExp == 0x7FF ) {
2016 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2017 roundData->exception |= float_flag_invalid;
2018 return float64_default_nan;
2024 if ( bSig < aSig ) goto aBigger;
2025 if ( aSig < bSig ) goto bBigger;
2026 return packFloat64( roundData->mode == float_round_down, 0, 0 );
2028 if ( bExp == 0x7FF ) {
2029 if ( bSig ) return propagateFloat64NaN( a, b );
2030 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2036 aSig |= LIT64( 0x4000000000000000 );
2038 shift64RightJamming( aSig, - expDiff, &aSig );
2039 bSig |= LIT64( 0x4000000000000000 );
2044 goto normalizeRoundAndPack;
2046 if ( aExp == 0x7FF ) {
2047 if ( aSig ) return propagateFloat64NaN( a, b );
2054 bSig |= LIT64( 0x4000000000000000 );
2056 shift64RightJamming( bSig, expDiff, &bSig );
2057 aSig |= LIT64( 0x4000000000000000 );
2061 normalizeRoundAndPack:
2063 return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );
2068 -------------------------------------------------------------------------------
2069 Returns the result of adding the double-precision floating-point values `a'
2070 and `b'. The operation is performed according to the IEC/IEEE Standard for
2071 Binary Floating-point Arithmetic.
2072 -------------------------------------------------------------------------------
2074 float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
2078 aSign = extractFloat64Sign( a );
2079 bSign = extractFloat64Sign( b );
2080 if ( aSign == bSign ) {
2081 return addFloat64Sigs( roundData, a, b, aSign );
2084 return subFloat64Sigs( roundData, a, b, aSign );
2090 -------------------------------------------------------------------------------
2091 Returns the result of subtracting the double-precision floating-point values
2092 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2093 for Binary Floating-point Arithmetic.
2094 -------------------------------------------------------------------------------
2096 float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
2100 aSign = extractFloat64Sign( a );
2101 bSign = extractFloat64Sign( b );
2102 if ( aSign == bSign ) {
2103 return subFloat64Sigs( roundData, a, b, aSign );
2106 return addFloat64Sigs( roundData, a, b, aSign );
2112 -------------------------------------------------------------------------------
2113 Returns the result of multiplying the double-precision floating-point values
2114 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2115 for Binary Floating-point Arithmetic.
2116 -------------------------------------------------------------------------------
2118 float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
2120 flag aSign, bSign, zSign;
2121 int16 aExp, bExp, zExp;
2122 bits64 aSig, bSig, zSig0, zSig1;
2124 aSig = extractFloat64Frac( a );
2125 aExp = extractFloat64Exp( a );
2126 aSign = extractFloat64Sign( a );
2127 bSig = extractFloat64Frac( b );
2128 bExp = extractFloat64Exp( b );
2129 bSign = extractFloat64Sign( b );
2130 zSign = aSign ^ bSign;
2131 if ( aExp == 0x7FF ) {
2132 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2133 return propagateFloat64NaN( a, b );
2135 if ( ( bExp | bSig ) == 0 ) {
2136 roundData->exception |= float_flag_invalid;
2137 return float64_default_nan;
2139 return packFloat64( zSign, 0x7FF, 0 );
2141 if ( bExp == 0x7FF ) {
2142 if ( bSig ) return propagateFloat64NaN( a, b );
2143 if ( ( aExp | aSig ) == 0 ) {
2144 roundData->exception |= float_flag_invalid;
2145 return float64_default_nan;
2147 return packFloat64( zSign, 0x7FF, 0 );
2150 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2151 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2154 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2155 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2157 zExp = aExp + bExp - 0x3FF;
2158 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2159 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2160 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2161 zSig0 |= ( zSig1 != 0 );
2162 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2166 return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );
2171 -------------------------------------------------------------------------------
2172 Returns the result of dividing the double-precision floating-point value `a'
2173 by the corresponding value `b'. The operation is performed according to
2174 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2175 -------------------------------------------------------------------------------
2177 float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
2179 flag aSign, bSign, zSign;
2180 int16 aExp, bExp, zExp;
2181 bits64 aSig, bSig, zSig;
2183 bits64 term0, term1;
2185 aSig = extractFloat64Frac( a );
2186 aExp = extractFloat64Exp( a );
2187 aSign = extractFloat64Sign( a );
2188 bSig = extractFloat64Frac( b );
2189 bExp = extractFloat64Exp( b );
2190 bSign = extractFloat64Sign( b );
2191 zSign = aSign ^ bSign;
2192 if ( aExp == 0x7FF ) {
2193 if ( aSig ) return propagateFloat64NaN( a, b );
2194 if ( bExp == 0x7FF ) {
2195 if ( bSig ) return propagateFloat64NaN( a, b );
2196 roundData->exception |= float_flag_invalid;
2197 return float64_default_nan;
2199 return packFloat64( zSign, 0x7FF, 0 );
2201 if ( bExp == 0x7FF ) {
2202 if ( bSig ) return propagateFloat64NaN( a, b );
2203 return packFloat64( zSign, 0, 0 );
2207 if ( ( aExp | aSig ) == 0 ) {
2208 roundData->exception |= float_flag_invalid;
2209 return float64_default_nan;
2211 roundData->exception |= float_flag_divbyzero;
2212 return packFloat64( zSign, 0x7FF, 0 );
2214 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2217 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2218 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2220 zExp = aExp - bExp + 0x3FD;
2221 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2222 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2223 if ( bSig <= ( aSig + aSig ) ) {
2227 zSig = estimateDiv128To64( aSig, 0, bSig );
2228 if ( ( zSig & 0x1FF ) <= 2 ) {
2229 mul64To128( bSig, zSig, &term0, &term1 );
2230 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2231 while ( (sbits64) rem0 < 0 ) {
2233 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2235 zSig |= ( rem1 != 0 );
2237 return roundAndPackFloat64( roundData, zSign, zExp, zSig );
2242 -------------------------------------------------------------------------------
2243 Returns the remainder of the double-precision floating-point value `a'
2244 with respect to the corresponding value `b'. The operation is performed
2245 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2246 -------------------------------------------------------------------------------
2248 float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
2250 flag aSign, bSign, zSign;
2251 int16 aExp, bExp, expDiff;
2253 bits64 q, alternateASig;
2256 aSig = extractFloat64Frac( a );
2257 aExp = extractFloat64Exp( a );
2258 aSign = extractFloat64Sign( a );
2259 bSig = extractFloat64Frac( b );
2260 bExp = extractFloat64Exp( b );
2261 bSign = extractFloat64Sign( b );
2262 if ( aExp == 0x7FF ) {
2263 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2264 return propagateFloat64NaN( a, b );
2266 roundData->exception |= float_flag_invalid;
2267 return float64_default_nan;
2269 if ( bExp == 0x7FF ) {
2270 if ( bSig ) return propagateFloat64NaN( a, b );
2275 roundData->exception |= float_flag_invalid;
2276 return float64_default_nan;
2278 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2281 if ( aSig == 0 ) return a;
2282 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2284 expDiff = aExp - bExp;
2285 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2286 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2287 if ( expDiff < 0 ) {
2288 if ( expDiff < -1 ) return a;
2291 q = ( bSig <= aSig );
2292 if ( q ) aSig -= bSig;
2294 while ( 0 < expDiff ) {
2295 q = estimateDiv128To64( aSig, 0, bSig );
2296 q = ( 2 < q ) ? q - 2 : 0;
2297 aSig = - ( ( bSig>>2 ) * q );
2301 if ( 0 < expDiff ) {
2302 q = estimateDiv128To64( aSig, 0, bSig );
2303 q = ( 2 < q ) ? q - 2 : 0;
2306 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2313 alternateASig = aSig;
2316 } while ( 0 <= (sbits64) aSig );
2317 sigMean = aSig + alternateASig;
2318 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2319 aSig = alternateASig;
2321 zSign = ( (sbits64) aSig < 0 );
2322 if ( zSign ) aSig = - aSig;
2323 return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );
2328 -------------------------------------------------------------------------------
2329 Returns the square root of the double-precision floating-point value `a'.
2330 The operation is performed according to the IEC/IEEE Standard for Binary
2331 Floating-point Arithmetic.
2332 -------------------------------------------------------------------------------
2334 float64 float64_sqrt( struct roundingData *roundData, float64 a )
2339 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2342 aSig = extractFloat64Frac( a );
2343 aExp = extractFloat64Exp( a );
2344 aSign = extractFloat64Sign( a );
2345 if ( aExp == 0x7FF ) {
2346 if ( aSig ) return propagateFloat64NaN( a, a );
2347 if ( ! aSign ) return a;
2348 roundData->exception |= float_flag_invalid;
2349 return float64_default_nan;
2352 if ( ( aExp | aSig ) == 0 ) return a;
2353 roundData->exception |= float_flag_invalid;
2354 return float64_default_nan;
2357 if ( aSig == 0 ) return 0;
2358 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2360 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2361 aSig |= LIT64( 0x0010000000000000 );
2362 zSig = estimateSqrt32( aExp, aSig>>21 );
2364 aSig <<= 9 - ( aExp & 1 );
2365 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2366 if ( ( zSig & 0x3FF ) <= 5 ) {
2368 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2372 mul64To128( zSig, zSig, &term0, &term1 );
2373 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2374 while ( (sbits64) rem0 < 0 ) {
2376 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2378 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2380 zSig |= ( ( rem0 | rem1 ) != 0 );
2383 shift64RightJamming( zSig, 1, &zSig );
2384 return roundAndPackFloat64( roundData, 0, zExp, zSig );
2389 -------------------------------------------------------------------------------
2390 Returns 1 if the double-precision floating-point value `a' is equal to the
2391 corresponding value `b', and 0 otherwise. The comparison is performed
2392 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2393 -------------------------------------------------------------------------------
2395 flag float64_eq( float64 a, float64 b )
2398 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2399 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2401 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2402 float_raise( float_flag_invalid );
2406 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2411 -------------------------------------------------------------------------------
2412 Returns 1 if the double-precision floating-point value `a' is less than or
2413 equal to the corresponding value `b', and 0 otherwise. The comparison is
2414 performed according to the IEC/IEEE Standard for Binary Floating-point
2416 -------------------------------------------------------------------------------
2418 flag float64_le( float64 a, float64 b )
2422 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2423 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2425 float_raise( float_flag_invalid );
2428 aSign = extractFloat64Sign( a );
2429 bSign = extractFloat64Sign( b );
2430 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2431 return ( a == b ) || ( aSign ^ ( a < b ) );
2436 -------------------------------------------------------------------------------
2437 Returns 1 if the double-precision floating-point value `a' is less than
2438 the corresponding value `b', and 0 otherwise. The comparison is performed
2439 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2440 -------------------------------------------------------------------------------
2442 flag float64_lt( float64 a, float64 b )
2446 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2447 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2449 float_raise( float_flag_invalid );
2452 aSign = extractFloat64Sign( a );
2453 bSign = extractFloat64Sign( b );
2454 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2455 return ( a != b ) && ( aSign ^ ( a < b ) );
2460 -------------------------------------------------------------------------------
2461 Returns 1 if the double-precision floating-point value `a' is equal to the
2462 corresponding value `b', and 0 otherwise. The invalid exception is raised
2463 if either operand is a NaN. Otherwise, the comparison is performed
2464 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2465 -------------------------------------------------------------------------------
2467 flag float64_eq_signaling( float64 a, float64 b )
2470 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2471 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2473 float_raise( float_flag_invalid );
2476 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2481 -------------------------------------------------------------------------------
2482 Returns 1 if the double-precision floating-point value `a' is less than or
2483 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2484 cause an exception. Otherwise, the comparison is performed according to the
2485 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2486 -------------------------------------------------------------------------------
2488 flag float64_le_quiet( float64 a, float64 b )
2493 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2494 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2496 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2497 float_raise( float_flag_invalid );
2501 aSign = extractFloat64Sign( a );
2502 bSign = extractFloat64Sign( b );
2503 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2504 return ( a == b ) || ( aSign ^ ( a < b ) );
2509 -------------------------------------------------------------------------------
2510 Returns 1 if the double-precision floating-point value `a' is less than
2511 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2512 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2513 Standard for Binary Floating-point Arithmetic.
2514 -------------------------------------------------------------------------------
2516 flag float64_lt_quiet( float64 a, float64 b )
2520 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2521 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2523 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2524 float_raise( float_flag_invalid );
2528 aSign = extractFloat64Sign( a );
2529 bSign = extractFloat64Sign( b );
2530 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2531 return ( a != b ) && ( aSign ^ ( a < b ) );
2538 -------------------------------------------------------------------------------
2539 Returns the result of converting the extended double-precision floating-
2540 point value `a' to the 32-bit two's complement integer format. The
2541 conversion is performed according to the IEC/IEEE Standard for Binary
2542 Floating-point Arithmetic---which means in particular that the conversion
2543 is rounded according to the current rounding mode. If `a' is a NaN, the
2544 largest positive integer is returned. Otherwise, if the conversion
2545 overflows, the largest integer with the same sign as `a' is returned.
2546 -------------------------------------------------------------------------------
2548 int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
2551 int32 aExp, shiftCount;
2554 aSig = extractFloatx80Frac( a );
2555 aExp = extractFloatx80Exp( a );
2556 aSign = extractFloatx80Sign( a );
2557 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2558 shiftCount = 0x4037 - aExp;
2559 if ( shiftCount <= 0 ) shiftCount = 1;
2560 shift64RightJamming( aSig, shiftCount, &aSig );
2561 return roundAndPackInt32( roundData, aSign, aSig );
2566 -------------------------------------------------------------------------------
2567 Returns the result of converting the extended double-precision floating-
2568 point value `a' to the 32-bit two's complement integer format. The
2569 conversion is performed according to the IEC/IEEE Standard for Binary
2570 Floating-point Arithmetic, except that the conversion is always rounded
2571 toward zero. If `a' is a NaN, the largest positive integer is returned.
2572 Otherwise, if the conversion overflows, the largest integer with the same
2573 sign as `a' is returned.
2574 -------------------------------------------------------------------------------
2576 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2579 int32 aExp, shiftCount;
2580 bits64 aSig, savedASig;
2583 aSig = extractFloatx80Frac( a );
2584 aExp = extractFloatx80Exp( a );
2585 aSign = extractFloatx80Sign( a );
2586 shiftCount = 0x403E - aExp;
2587 if ( shiftCount < 32 ) {
2588 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2591 else if ( 63 < shiftCount ) {
2592 if ( aExp || aSig ) float_raise( float_flag_inexact );
2596 aSig >>= shiftCount;
2598 if ( aSign ) z = - z;
2599 if ( ( z < 0 ) ^ aSign ) {
2601 float_raise( float_flag_invalid );
2602 return aSign ? 0x80000000 : 0x7FFFFFFF;
2604 if ( ( aSig<<shiftCount ) != savedASig ) {
2605 float_raise( float_flag_inexact );
2612 -------------------------------------------------------------------------------
2613 Returns the result of converting the extended double-precision floating-
2614 point value `a' to the single-precision floating-point format. The
2615 conversion is performed according to the IEC/IEEE Standard for Binary
2616 Floating-point Arithmetic.
2617 -------------------------------------------------------------------------------
2619 float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
2625 aSig = extractFloatx80Frac( a );
2626 aExp = extractFloatx80Exp( a );
2627 aSign = extractFloatx80Sign( a );
2628 if ( aExp == 0x7FFF ) {
2629 if ( (bits64) ( aSig<<1 ) ) {
2630 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2632 return packFloat32( aSign, 0xFF, 0 );
2634 shift64RightJamming( aSig, 33, &aSig );
2635 if ( aExp || aSig ) aExp -= 0x3F81;
2636 return roundAndPackFloat32( roundData, aSign, aExp, aSig );
2641 -------------------------------------------------------------------------------
2642 Returns the result of converting the extended double-precision floating-
2643 point value `a' to the double-precision floating-point format. The
2644 conversion is performed according to the IEC/IEEE Standard for Binary
2645 Floating-point Arithmetic.
2646 -------------------------------------------------------------------------------
2648 float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
2654 aSig = extractFloatx80Frac( a );
2655 aExp = extractFloatx80Exp( a );
2656 aSign = extractFloatx80Sign( a );
2657 if ( aExp == 0x7FFF ) {
2658 if ( (bits64) ( aSig<<1 ) ) {
2659 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2661 return packFloat64( aSign, 0x7FF, 0 );
2663 shift64RightJamming( aSig, 1, &zSig );
2664 if ( aExp || aSig ) aExp -= 0x3C01;
2665 return roundAndPackFloat64( roundData, aSign, aExp, zSig );
2670 -------------------------------------------------------------------------------
2671 Rounds the extended double-precision floating-point value `a' to an integer,
2672 and returns the result as an extended quadruple-precision floating-point
2673 value. The operation is performed according to the IEC/IEEE Standard for
2674 Binary Floating-point Arithmetic.
2675 -------------------------------------------------------------------------------
2677 floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
2681 bits64 lastBitMask, roundBitsMask;
2685 aExp = extractFloatx80Exp( a );
2686 if ( 0x403E <= aExp ) {
2687 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2688 return propagateFloatx80NaN( a, a );
2692 if ( aExp <= 0x3FFE ) {
2694 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2697 roundData->exception |= float_flag_inexact;
2698 aSign = extractFloatx80Sign( a );
2699 switch ( roundData->mode ) {
2700 case float_round_nearest_even:
2701 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2704 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2707 case float_round_down:
2710 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2711 : packFloatx80( 0, 0, 0 );
2712 case float_round_up:
2714 aSign ? packFloatx80( 1, 0, 0 )
2715 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2717 return packFloatx80( aSign, 0, 0 );
2720 lastBitMask <<= 0x403E - aExp;
2721 roundBitsMask = lastBitMask - 1;
2723 roundingMode = roundData->mode;
2724 if ( roundingMode == float_round_nearest_even ) {
2725 z.low += lastBitMask>>1;
2726 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2728 else if ( roundingMode != float_round_to_zero ) {
2729 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2730 z.low += roundBitsMask;
2733 z.low &= ~ roundBitsMask;
2736 z.low = LIT64( 0x8000000000000000 );
2738 if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
2744 -------------------------------------------------------------------------------
2745 Returns the result of adding the absolute values of the extended double-
2746 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2747 negated before being returned. `zSign' is ignored if the result is a NaN.
2748 The addition is performed according to the IEC/IEEE Standard for Binary
2749 Floating-point Arithmetic.
2750 -------------------------------------------------------------------------------
2752 static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2754 int32 aExp, bExp, zExp;
2755 bits64 aSig, bSig, zSig0, zSig1;
2758 aSig = extractFloatx80Frac( a );
2759 aExp = extractFloatx80Exp( a );
2760 bSig = extractFloatx80Frac( b );
2761 bExp = extractFloatx80Exp( b );
2762 expDiff = aExp - bExp;
2763 if ( 0 < expDiff ) {
2764 if ( aExp == 0x7FFF ) {
2765 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2768 if ( bExp == 0 ) --expDiff;
2769 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2772 else if ( expDiff < 0 ) {
2773 if ( bExp == 0x7FFF ) {
2774 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2775 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2777 if ( aExp == 0 ) ++expDiff;
2778 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2782 if ( aExp == 0x7FFF ) {
2783 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2784 return propagateFloatx80NaN( a, b );
2789 zSig0 = aSig + bSig;
2791 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2798 zSig0 = aSig + bSig;
2800 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2802 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2803 zSig0 |= LIT64( 0x8000000000000000 );
2807 roundAndPackFloatx80(
2808 roundData, zSign, zExp, zSig0, zSig1 );
2813 -------------------------------------------------------------------------------
2814 Returns the result of subtracting the absolute values of the extended
2815 double-precision floating-point values `a' and `b'. If `zSign' is true,
2816 the difference is negated before being returned. `zSign' is ignored if the
2817 result is a NaN. The subtraction is performed according to the IEC/IEEE
2818 Standard for Binary Floating-point Arithmetic.
2819 -------------------------------------------------------------------------------
2821 static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2823 int32 aExp, bExp, zExp;
2824 bits64 aSig, bSig, zSig0, zSig1;
2828 aSig = extractFloatx80Frac( a );
2829 aExp = extractFloatx80Exp( a );
2830 bSig = extractFloatx80Frac( b );
2831 bExp = extractFloatx80Exp( b );
2832 expDiff = aExp - bExp;
2833 if ( 0 < expDiff ) goto aExpBigger;
2834 if ( expDiff < 0 ) goto bExpBigger;
2835 if ( aExp == 0x7FFF ) {
2836 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2837 return propagateFloatx80NaN( a, b );
2839 roundData->exception |= float_flag_invalid;
2840 z.low = floatx80_default_nan_low;
2841 z.high = floatx80_default_nan_high;
2849 if ( bSig < aSig ) goto aBigger;
2850 if ( aSig < bSig ) goto bBigger;
2851 return packFloatx80( roundData->mode == float_round_down, 0, 0 );
2853 if ( bExp == 0x7FFF ) {
2854 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2855 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2857 if ( aExp == 0 ) ++expDiff;
2858 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2860 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2863 goto normalizeRoundAndPack;
2865 if ( aExp == 0x7FFF ) {
2866 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2869 if ( bExp == 0 ) --expDiff;
2870 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2872 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2874 normalizeRoundAndPack:
2876 normalizeRoundAndPackFloatx80(
2877 roundData, zSign, zExp, zSig0, zSig1 );
2882 -------------------------------------------------------------------------------
2883 Returns the result of adding the extended double-precision floating-point
2884 values `a' and `b'. The operation is performed according to the IEC/IEEE
2885 Standard for Binary Floating-point Arithmetic.
2886 -------------------------------------------------------------------------------
2888 floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
2892 aSign = extractFloatx80Sign( a );
2893 bSign = extractFloatx80Sign( b );
2894 if ( aSign == bSign ) {
2895 return addFloatx80Sigs( roundData, a, b, aSign );
2898 return subFloatx80Sigs( roundData, a, b, aSign );
2904 -------------------------------------------------------------------------------
2905 Returns the result of subtracting the extended double-precision floating-
2906 point values `a' and `b'. The operation is performed according to the
2907 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2908 -------------------------------------------------------------------------------
2910 floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
2914 aSign = extractFloatx80Sign( a );
2915 bSign = extractFloatx80Sign( b );
2916 if ( aSign == bSign ) {
2917 return subFloatx80Sigs( roundData, a, b, aSign );
2920 return addFloatx80Sigs( roundData, a, b, aSign );
2926 -------------------------------------------------------------------------------
2927 Returns the result of multiplying the extended double-precision floating-
2928 point values `a' and `b'. The operation is performed according to the
2929 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2930 -------------------------------------------------------------------------------
2932 floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
2934 flag aSign, bSign, zSign;
2935 int32 aExp, bExp, zExp;
2936 bits64 aSig, bSig, zSig0, zSig1;
2939 aSig = extractFloatx80Frac( a );
2940 aExp = extractFloatx80Exp( a );
2941 aSign = extractFloatx80Sign( a );
2942 bSig = extractFloatx80Frac( b );
2943 bExp = extractFloatx80Exp( b );
2944 bSign = extractFloatx80Sign( b );
2945 zSign = aSign ^ bSign;
2946 if ( aExp == 0x7FFF ) {
2947 if ( (bits64) ( aSig<<1 )
2948 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2949 return propagateFloatx80NaN( a, b );
2951 if ( ( bExp | bSig ) == 0 ) goto invalid;
2952 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2954 if ( bExp == 0x7FFF ) {
2955 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2956 if ( ( aExp | aSig ) == 0 ) {
2958 roundData->exception |= float_flag_invalid;
2959 z.low = floatx80_default_nan_low;
2960 z.high = floatx80_default_nan_high;
2963 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2966 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2967 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2970 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2971 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2973 zExp = aExp + bExp - 0x3FFE;
2974 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2975 if ( 0 < (sbits64) zSig0 ) {
2976 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2980 roundAndPackFloatx80(
2981 roundData, zSign, zExp, zSig0, zSig1 );
2986 -------------------------------------------------------------------------------
2987 Returns the result of dividing the extended double-precision floating-point
2988 value `a' by the corresponding value `b'. The operation is performed
2989 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2990 -------------------------------------------------------------------------------
2992 floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
2994 flag aSign, bSign, zSign;
2995 int32 aExp, bExp, zExp;
2996 bits64 aSig, bSig, zSig0, zSig1;
2997 bits64 rem0, rem1, rem2, term0, term1, term2;
3000 aSig = extractFloatx80Frac( a );
3001 aExp = extractFloatx80Exp( a );
3002 aSign = extractFloatx80Sign( a );
3003 bSig = extractFloatx80Frac( b );
3004 bExp = extractFloatx80Exp( b );
3005 bSign = extractFloatx80Sign( b );
3006 zSign = aSign ^ bSign;
3007 if ( aExp == 0x7FFF ) {
3008 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3009 if ( bExp == 0x7FFF ) {
3010 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3013 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3015 if ( bExp == 0x7FFF ) {
3016 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3017 return packFloatx80( zSign, 0, 0 );
3021 if ( ( aExp | aSig ) == 0 ) {
3023 roundData->exception |= float_flag_invalid;
3024 z.low = floatx80_default_nan_low;
3025 z.high = floatx80_default_nan_high;
3028 roundData->exception |= float_flag_divbyzero;
3029 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3031 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3034 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3035 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3037 zExp = aExp - bExp + 0x3FFE;
3039 if ( bSig <= aSig ) {
3040 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3043 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3044 mul64To128( bSig, zSig0, &term0, &term1 );
3045 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3046 while ( (sbits64) rem0 < 0 ) {
3048 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3050 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3051 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3052 mul64To128( bSig, zSig1, &term1, &term2 );
3053 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3054 while ( (sbits64) rem1 < 0 ) {
3056 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3058 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3061 roundAndPackFloatx80(
3062 roundData, zSign, zExp, zSig0, zSig1 );
3067 -------------------------------------------------------------------------------
3068 Returns the remainder of the extended double-precision floating-point value
3069 `a' with respect to the corresponding value `b'. The operation is performed
3070 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3071 -------------------------------------------------------------------------------
3073 floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
3075 flag aSign, bSign, zSign;
3076 int32 aExp, bExp, expDiff;
3077 bits64 aSig0, aSig1, bSig;
3078 bits64 q, term0, term1, alternateASig0, alternateASig1;
3081 aSig0 = extractFloatx80Frac( a );
3082 aExp = extractFloatx80Exp( a );
3083 aSign = extractFloatx80Sign( a );
3084 bSig = extractFloatx80Frac( b );
3085 bExp = extractFloatx80Exp( b );
3086 bSign = extractFloatx80Sign( b );
3087 if ( aExp == 0x7FFF ) {
3088 if ( (bits64) ( aSig0<<1 )
3089 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3090 return propagateFloatx80NaN( a, b );
3094 if ( bExp == 0x7FFF ) {
3095 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3101 roundData->exception |= float_flag_invalid;
3102 z.low = floatx80_default_nan_low;
3103 z.high = floatx80_default_nan_high;
3106 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3109 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3110 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3112 bSig |= LIT64( 0x8000000000000000 );
3114 expDiff = aExp - bExp;
3116 if ( expDiff < 0 ) {
3117 if ( expDiff < -1 ) return a;
3118 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3121 q = ( bSig <= aSig0 );
3122 if ( q ) aSig0 -= bSig;
3124 while ( 0 < expDiff ) {
3125 q = estimateDiv128To64( aSig0, aSig1, bSig );
3126 q = ( 2 < q ) ? q - 2 : 0;
3127 mul64To128( bSig, q, &term0, &term1 );
3128 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3129 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3133 if ( 0 < expDiff ) {
3134 q = estimateDiv128To64( aSig0, aSig1, bSig );
3135 q = ( 2 < q ) ? q - 2 : 0;
3137 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3138 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3139 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3140 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3142 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3149 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3150 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3151 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3154 aSig0 = alternateASig0;
3155 aSig1 = alternateASig1;
3160 normalizeRoundAndPackFloatx80(
3161 roundData, zSign, bExp + expDiff, aSig0, aSig1 );
3166 -------------------------------------------------------------------------------
3167 Returns the square root of the extended double-precision floating-point
3168 value `a'. The operation is performed according to the IEC/IEEE Standard
3169 for Binary Floating-point Arithmetic.
3170 -------------------------------------------------------------------------------
3172 floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
3176 bits64 aSig0, aSig1, zSig0, zSig1;
3177 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3178 bits64 shiftedRem0, shiftedRem1;
3181 aSig0 = extractFloatx80Frac( a );
3182 aExp = extractFloatx80Exp( a );
3183 aSign = extractFloatx80Sign( a );
3184 if ( aExp == 0x7FFF ) {
3185 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3186 if ( ! aSign ) return a;
3190 if ( ( aExp | aSig0 ) == 0 ) return a;
3192 roundData->exception |= float_flag_invalid;
3193 z.low = floatx80_default_nan_low;
3194 z.high = floatx80_default_nan_high;
3198 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3199 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3201 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3202 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3205 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3206 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3207 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3208 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3209 mul64To128( zSig0, zSig0, &term0, &term1 );
3210 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3211 while ( (sbits64) rem0 < 0 ) {
3213 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3215 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3217 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3218 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3219 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3220 if ( zSig1 == 0 ) zSig1 = 1;
3221 mul64To128( zSig0, zSig1, &term1, &term2 );
3222 shortShift128Left( term1, term2, 1, &term1, &term2 );
3223 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3224 mul64To128( zSig1, zSig1, &term2, &term3 );
3225 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3226 while ( (sbits64) rem1 < 0 ) {
3228 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3231 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3233 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3236 roundAndPackFloatx80(
3237 roundData, 0, zExp, zSig0, zSig1 );
3242 -------------------------------------------------------------------------------
3243 Returns 1 if the extended double-precision floating-point value `a' is
3244 equal to the corresponding value `b', and 0 otherwise. The comparison is
3245 performed according to the IEC/IEEE Standard for Binary Floating-point
3247 -------------------------------------------------------------------------------
3249 flag floatx80_eq( floatx80 a, floatx80 b )
3252 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3253 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3254 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3255 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3257 if ( floatx80_is_signaling_nan( a )
3258 || floatx80_is_signaling_nan( b ) ) {
3259 roundData->exception |= float_flag_invalid;
3265 && ( ( a.high == b.high )
3267 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3273 -------------------------------------------------------------------------------
3274 Returns 1 if the extended double-precision floating-point value `a' is
3275 less than or equal to the corresponding value `b', and 0 otherwise. The
3276 comparison is performed according to the IEC/IEEE Standard for Binary
3277 Floating-point Arithmetic.
3278 -------------------------------------------------------------------------------
3280 flag floatx80_le( floatx80 a, floatx80 b )
3284 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3285 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3286 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3287 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3289 roundData->exception |= float_flag_invalid;
3292 aSign = extractFloatx80Sign( a );
3293 bSign = extractFloatx80Sign( b );
3294 if ( aSign != bSign ) {
3297 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3301 aSign ? le128( b.high, b.low, a.high, a.low )
3302 : le128( a.high, a.low, b.high, b.low );
3307 -------------------------------------------------------------------------------
3308 Returns 1 if the extended double-precision floating-point value `a' is
3309 less than the corresponding value `b', and 0 otherwise. The comparison
3310 is performed according to the IEC/IEEE Standard for Binary Floating-point
3312 -------------------------------------------------------------------------------
3314 flag floatx80_lt( floatx80 a, floatx80 b )
3318 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3319 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3320 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3321 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3323 roundData->exception |= float_flag_invalid;
3326 aSign = extractFloatx80Sign( a );
3327 bSign = extractFloatx80Sign( b );
3328 if ( aSign != bSign ) {
3331 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3335 aSign ? lt128( b.high, b.low, a.high, a.low )
3336 : lt128( a.high, a.low, b.high, b.low );
3341 -------------------------------------------------------------------------------
3342 Returns 1 if the extended double-precision floating-point value `a' is equal
3343 to the corresponding value `b', and 0 otherwise. The invalid exception is
3344 raised if either operand is a NaN. Otherwise, the comparison is performed
3345 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3346 -------------------------------------------------------------------------------
3348 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3351 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3352 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3353 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3354 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3356 roundData->exception |= float_flag_invalid;
3361 && ( ( a.high == b.high )
3363 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3369 -------------------------------------------------------------------------------
3370 Returns 1 if the extended double-precision floating-point value `a' is less
3371 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3372 do not cause an exception. Otherwise, the comparison is performed according
3373 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3374 -------------------------------------------------------------------------------
3376 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3380 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3381 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3382 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3383 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3385 if ( floatx80_is_signaling_nan( a )
3386 || floatx80_is_signaling_nan( b ) ) {
3387 roundData->exception |= float_flag_invalid;
3391 aSign = extractFloatx80Sign( a );
3392 bSign = extractFloatx80Sign( b );
3393 if ( aSign != bSign ) {
3396 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3400 aSign ? le128( b.high, b.low, a.high, a.low )
3401 : le128( a.high, a.low, b.high, b.low );
3406 -------------------------------------------------------------------------------
3407 Returns 1 if the extended double-precision floating-point value `a' is less
3408 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3409 an exception. Otherwise, the comparison is performed according to the
3410 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3411 -------------------------------------------------------------------------------
3413 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3417 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3418 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3419 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3420 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3422 if ( floatx80_is_signaling_nan( a )
3423 || floatx80_is_signaling_nan( b ) ) {
3424 roundData->exception |= float_flag_invalid;
3428 aSign = extractFloatx80Sign( a );
3429 bSign = extractFloatx80Sign( b );
3430 if ( aSign != bSign ) {
3433 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3437 aSign ? lt128( b.high, b.low, a.high, a.low )
3438 : lt128( a.high, a.low, b.high, b.low );