2 // Floating point to IEEE-754 conversion routines
5 // (C) 2018 Underground Software
7 // Since there are no guarantees vis-a-vis floating point numbers in C, we have
8 // to utilize routines like the following in order to guarantee that the thing
9 // we get out of the C compiler is an honest-to-God IEEE-754 style floating
10 // point number (since that's what the Motorola processors that we target expect).
19 // Check for IEEE-754 conformance (C99 compilers should be OK here)
21 // The reason we do this is mainly to ensure consistency across all platforms,
22 // even those that still haven't implemented C99 compliance after other
23 // compilers have had them for decades. The long and the short of it is, there
24 // are no guarantees for floating point implementations across platforms the
25 // way there are for ints (in <stdint.h>, for example) and so we have to be
26 // careful that bad assumptions vis-a-vis floating point numbers don't creep
27 // into the codebase and cause problems similar to the ones we had when adding
28 // proper 64-bit support. Hence, the following ugliness...
30 // IEEE-754 expects the following for floats and doubles:
31 // float: exponent is 8 bits, mantissa is 24 bits
32 // double: exponent is 11 bits, mantissa is 53 bits
33 // FLT_RADIX should be 2
// Compile-time sanity checks on the host's floating point parameters (all
// macros come from <float.h>). Each check aborts the build when the host
// float/double does not have the radix, mantissa width or exponent range that
// the conversion routines in this file assume. Every #error is guarded by its
// own #if/#endif pair so that it only fires when its condition fails.
#if FLT_RADIX != 2
#error "FLT_RADIX: Your compiler sucks. Get a real one."
#endif

#if FLT_MANT_DIG != 24
#error "FLT_MANT_DIG: Your compiler sucks. Get a real one."
#endif

#if DBL_MANT_DIG != 53
#error "DBL_MANT_DIG: Your compiler sucks. Get a real one."
#endif

#if FLT_MAX_EXP != 128
#error "FLT_MAX_EXP: Your compiler sucks. Get a real one."
#endif

#if DBL_MAX_EXP != 1024
#error "DBL_MAX_EXP: Your compiler sucks. Get a real one."
#endif
60 // So if we get here, we can be pretty sure that a float is 4 bytes and a
61 // double is 8. IEEE-754? Maaaaaaaaybe. But we don't have to worry about that
62 // so much, as long as the token stream is OK (floats are 4 bytes, doubles are 8 bytes).
//
// Convert a host float into the bit pattern of an IEEE-754 single precision
// number (1 sign bit, 8 exponent bits biased by 127, 23 stored mantissa bits).
//
// The value is taken apart arithmetically with frexpf()/ldexpf() rather than
// by reinterpreting the host's bytes, so the result does not depend on how the
// host stores a float.
//
// f: the value to convert
// Returns the 32-bit IEEE-754 encoding of f. NaNs are returned as a quiet NaN
// carrying the sign of the input.
//
uint32_t FloatToIEEE754(float f)
{
	const uint32_t sign = (signbit(f) ? 0x80000000u : 0);

	// NaN and infinity have no finite mantissa to extract, and converting
	// them to an integer is undefined, so encode them directly: all-ones
	// exponent, with a non-zero mantissa for NaN and a zero one for infinity.
	if (isnan(f))
		return sign | 0x7FC00000u;

	if (isinf(f))
		return sign | 0x7F800000u;

	// Zero (of either sign) has an all-zero exponent and mantissa; a zero
	// exponent means there is no implied leading one.
	if (f == 0)
		return sign;

	// Split the float into normalized mantissa (range: [+0.5, +1)) and base-2
	// exponent, so that |f| = mantissa * (2 ^ exponent) *exactly* for
	// FLT_RADIX=2. We pass in a positive number because floats are stored as
	// sign + magnitude, and the sign bit was already captured above.
	int exponent;
	float mantissa = frexpf((f < 0 ? -f : f), &exponent);

	// Set the exponent bias for IEEE-754 floats. It is 0x7E rather than 0x7F
	// because frexpf() normalizes to [0.5, 1) instead of [1, 2).
	exponent += 0x7E;

	// A normal number contributes its 24 most significant mantissa bits
	// (including the leading one).
	int shift = 24;

	if (exponent <= 0) {
		// Subnormal: the exponent field is zero, there is no implied leading
		// one, and the mantissa is the value expressed in units of 2^-149.
		shift = 23 + exponent;
		exponent = 0;
	}

	// Conversion to an unsigned integer discards any fractional part, so no
	// separate truncation step is needed.
	uint32_t ieeeVal = (uint32_t)ldexpf(mantissa, shift);

	// For normal numbers ieeeVal includes the leading 1 bit; strip it out,
	// since in IEEE-754 it's implied. (For subnormals bit 23 is already 0.)
	ieeeVal &= 0x007FFFFF;

	// Finally, add in the other parts to make a proper IEEE-754 float
	ieeeVal |= sign | (((uint32_t)exponent & 0xFF) << 23);

	return ieeeVal;
}
//
// Convert a host double into the bit pattern of an IEEE-754 double precision
// number (1 sign bit, 11 exponent bits biased by 1023, 52 stored mantissa
// bits).
//
// The value is taken apart arithmetically with frexp()/ldexp() rather than by
// reinterpreting the host's bytes, so the result does not depend on how the
// host stores a double.
//
// d: the value to convert
// Returns the 64-bit IEEE-754 encoding of d. NaNs are returned as a quiet NaN
// carrying the sign of the input.
//
uint64_t DoubleToIEEE754(double d)
{
	const uint64_t sign = (signbit(d) ? 0x8000000000000000ULL : 0);

	// NaN and infinity have no finite mantissa to extract, and converting
	// them to an integer is undefined, so encode them directly: all-ones
	// exponent, with a non-zero mantissa for NaN and a zero one for infinity.
	if (isnan(d))
		return sign | 0x7FF8000000000000ULL;

	if (isinf(d))
		return sign | 0x7FF0000000000000ULL;

	// Zero (of either sign) has an all-zero exponent and mantissa
	if (d == 0)
		return sign;

	// Split double into normalized mantissa (range: [+0.5, +1)) and base-2
	// exponent, so that |d| = mantissa * (2 ^ exponent) *exactly* for
	// FLT_RADIX=2. We pass in a positive number because doubles are stored as
	// sign + magnitude, and the sign bit was already captured above.
	int exponent;
	double mantissa = frexp((d < 0 ? -d : d), &exponent);

	// Set the exponent bias for IEEE-754 doubles. It is 0x3FE rather than
	// 0x3FF because frexp() normalizes to [0.5, 1) instead of [1, 2).
	exponent += 0x3FE;

	// A normal number contributes its 53 most significant mantissa bits
	// (including the leading one).
	int shift = 53;

	if (exponent <= 0) {
		// Subnormal: the exponent field is zero, there is no implied leading
		// one, and the mantissa is the value expressed in units of 2^-1074.
		shift = 52 + exponent;
		exponent = 0;
	}

	// Conversion to an unsigned integer discards any fractional part, so no
	// separate truncation step is needed.
	uint64_t ieeeVal = (uint64_t)ldexp(mantissa, shift);

	// For normal numbers ieeeVal includes the leading 1 bit; strip it out,
	// since in IEEE-754 it's implied. (For subnormals bit 52 is already 0.)
	ieeeVal &= 0x000FFFFFFFFFFFFFULL;

	// Finally, add in the other parts to make a proper IEEE-754 double
	ieeeVal |= sign | (((uint64_t)exponent & 0x7FF) << 52);

	return ieeeVal;
}
//
// Convert a host double into a Motorola extended precision number.
//
// Motorola extended floating point is 96 bits, so we pack it into the 12-byte
// array that's passed in, most significant byte first. The format is as
// follows: 1 bit (sign), 15 bits (exponent w/$3FFF bias), 16 bits of zero,
// 64 bits of mantissa. Unlike IEEE-754 singles and doubles, the leading one
// of the mantissa is stored explicitly.
//
// d:   the value to convert
// out: receives the 12 encoded bytes; the caller must supply at least 12
//      bytes of storage
//
void DoubleToExtended(double d, uint8_t out[])
{
	const uint8_t sign = (signbit(d) ? 0x80 : 0);
	int exponent;
	uint64_t intMant;

	if (isnan(d)) {
		// Maximum exponent with a non-zero mantissa; all ones gives a quiet
		// NaN. Converting a NaN to an integer would be undefined.
		exponent = 0x7FFF;
		intMant = 0xFFFFFFFFFFFFFFFFULL;
	} else if (isinf(d)) {
		// Maximum exponent with a zero mantissa
		exponent = 0x7FFF;
		intMant = 0;
	} else if (d == 0) {
		// Zero has an all-zero exponent and mantissa
		exponent = 0;
		intMant = 0;
	} else {
		// |d| = mantissa * (2 ^ exponent) with mantissa in [0.5, 1); the sign
		// was already captured above, so only the magnitude is split.
		double mantissa = frexp((d < 0 ? -d : d), &exponent);

		// The bias is $3FFE rather than $3FFF because frexp() normalizes to
		// [0.5, 1) instead of [1, 2).
		exponent += 0x3FFE;

		// Scale the mantissa to a 64-bit integer in [2^63, 2^64); its top bit
		// is the explicit leading one. The conversion discards any fraction.
		intMant = (uint64_t)ldexp(mantissa, 64);
	}

	out[0] = sign | ((exponent >> 8) & 0x7F);
	out[1] = exponent & 0xFF;

	// 16 bits of zero padding between the exponent and the mantissa
	out[2] = 0;
	out[3] = 0;

	// 64-bit mantissa, most significant byte first
	for (int i = 0; i < 8; i++)
		out[4 + i] = (intMant >> (56 - (8 * i))) & 0xFF;
}