Shamusworld >> Repos - rmac/blob - fltpoint.c

   1 //
   2 // Floating point to IEEE-754 conversion routines
   3 //
   4 // by James Hammons
   5 // (C) 2019 Underground Software
   6 //
   7 // Since there are no guarantees vis-a-vis floating point numbers in C, we have
   8 // to utilize routines like the following in order to guarantee that the thing
   9 // we get out of the C compiler is an honest-to-God IEEE-754 style floating
  10 // point number (since that's what the Motorola processors that we target
  11 // expect).
  12 //
  13
  14 #include "fltpoint.h"
  15 #include <float.h>
  16 #include <math.h>
  17 #include <stdio.h>
  18 #include "error.h"
  19
  20 //
  21 // Check for IEEE-754 conformance (C99 compilers should be OK here)
  22 //
  23 // The reason we do this is mainly to ensure consistency across all platforms,
  24 // even those that still haven't implemented C99 compliance after other
  25 // compilers have had them for decades. The long and the short of it is, there
  26 // are no guarantees for floating point implementations across platforms the
  27 // way there is for ints (in <stdint.h>, for example) and so we have to be
  28 // careful that bad assumptions vis-a-vis floating point numbers don't creep
  29 // into the codebase and cause problems similar to the ones we had when adding
  30 // proper 64-bit support. Hence, the following ugliness...
  31 //
  32 // IEEE-745 expects the following for floats and doubles:
  33 //  float: exponent is 8 bits, mantissa is 24 bits
  34 // double: exponent is 11 bits, mantissa is 53 bits
  35 // FLT_RADIX should be 2
  36 #ifdef FLT_RADIX
  37         #if FLT_RADIX != 2
  38         #error "FLT_RADIX: Your compiler sucks. Get a real one."
  39         #endif
  40 #endif
  41 #ifdef FLT_MANT_DIG
  42         #if FLT_MANT_DIG != 24
  43         #error "FLT_MANT_DIG: Your compiler sucks. Get a real one."
  44         #endif
  45 #endif
  46 #ifdef DBL_MANT_DIG
  47         #if DBL_MANT_DIG != 53
  48         #error "DBL_MANT_DIG: Your compiler sucks. Get a real one."
  49         #endif
  50 #endif
  51 #ifdef FLT_MAX_EXP
  52         #if FLT_MAX_EXP != 128
  53         #error "FLT_MAX_EXP: Your compiler sucks. Get a real one."
  54         #endif
  55 #endif
  56 #ifdef DBL_MAX_EXP
  57         #if DBL_MAX_EXP != 1024
  58         #error "DBL_MAX_EXP: Your compiler sucks. Get a real one."
  59         #endif
  60 #endif
  61 //
  62 // So if we get here, we can be pretty sure that a float is 4 bytes and a
  63 // double is 8. IEEE-754? Maaaaaaaaybe. But we don't have to worry about that
  64 // so much, as long as the token stream is OK (floats are 4 bytes, doubles are
  65 // 8).
  66 //
  67
  68
  69 uint32_t FloatToIEEE754(float f)
  70 {
  71         uint32_t sign = (signbit(f) ? 0x80000000 : 0);
  72
  73         // Split the float into normalized mantissa (range: (-1, -0.5], 0,
  74         // [+0.5, +1)) and base-2 exponent
  75         // d = mantissa * (2 ^ exponent) *exactly* for FLT_RADIX=2
  76         // Also, since we want the mantissa to be non-inverted (2's complemented),
  77         // we make sure to pass in a positive number (floats/doubles are *not* 2's
  78         // complemented) as we already captured the sign bit above.
  79         int32_t exponent;
  80         float mantissa = frexpf((f < 0 ? -f : f), &exponent);
  81
  82         // Set the exponent bias for IEEE-754 floats
  83         exponent += 0x7E;
  84
  85         // Check for zero, set the proper exponent if so (zero exponent means no
  86         // implied leading one)
  87         if (f == 0)
  88                 exponent = 0;
  89
  90         // Extract most significant 24 bits of mantissa
  91         mantissa = ldexpf(mantissa, 24);
  92
  93         // Convert to an unsigned int
  94         uint32_t ieeeVal = truncf(mantissa);
  95
  96         // ieeeVal now has the mantissa in binary format, *including* the leading 1
  97         // bit; so we have to strip that bit out, since in IEEE-754, it's implied.
  98         ieeeVal &= 0x007FFFFF;
  99
 100         // Finally, add in the other parts to make a proper IEEE-754 float
 101         ieeeVal |= sign | ((exponent & 0xFF) << 23);
 102
 103         return ieeeVal;
 104 }
 105
 106
 107 uint64_t DoubleToIEEE754(double d)
 108 {
 109         uint64_t sign = (signbit(d) ? 0x8000000000000000LL : 0);
 110         int32_t exponent;
 111
 112         // Split double into normalized mantissa (range: (-1, -0.5], 0, [+0.5, +1))
 113         // and base-2 exponent
 114         // d = mantissa * (2 ^ exponent) *exactly* for FLT_RADIX=2
 115         // Also, since we want the mantissa to be non-inverted (2's complemented),
 116         // we make sure to pass in a positive number (floats/doubles are *not* 2's
 117         // complemented) as we already captured the sign bit above.
 118         double mantissa = frexp((d < 0 ? -d : d), &exponent);
 119
 120         // Set the exponent bias for IEEE-754 doubles
 121         exponent += 0x3FE;
 122
 123         // Check for zero, set the proper exponent if so
 124         if (d == 0)
 125                 exponent = 0;
 126
 127         // Extract most significant 53 bits of mantissa
 128         mantissa = ldexp(mantissa, 53);
 129
 130         // Convert to an unsigned int
 131         uint64_t ieeeVal = trunc(mantissa);
 132
 133         // ieeeVal now has the mantissa in binary format, *including* the leading 1
 134         // bit; so we have to strip that bit out, since in IEEE-754, it's implied.
 135         ieeeVal &= 0x000FFFFFFFFFFFFF;
 136
 137         // Finally, add in the other parts to make a proper IEEE-754 double
 138         ieeeVal |= sign | ((uint64_t)(exponent & 0x7FF) << 52);
 139
 140         return ieeeVal;
 141 }
 142
 143
 144 void DoubleToExtended(double d, uint8_t out[])
 145 {
 146         int8_t sign = (signbit(d) ? 0x80 : 0);
 147         int32_t exponent;
 148         double mantissa = frexp((d < 0 ? -d : d), &exponent);
 149         exponent += 0x3FFE;
 150
 151         if (d == 0)
 152                 exponent = 0;
 153
 154         mantissa = ldexp(mantissa, 64);
 155         uint64_t intMant = trunc(mantissa);
 156
 157         // Motorola extended floating point is 96 bits, so we pack it into the
 158         // 12-byte array that's passed in. The format is as follows: 1 bit (sign),
 159         // 15 bits (exponent w/$3FFF bias), 16 bits of zero, 64 bits of mantissa.
 160         out[0] = sign | ((exponent >> 8) & 0x7F);
 161         out[1] = exponent & 0xFF;
 162         out[2] = 0;
 163         out[3] = 0;
 164         out[4] = (intMant >> 56) & 0xFF;
 165         out[5] = (intMant >> 48) & 0xFF;
 166         out[6] = (intMant >> 40) & 0xFF;
 167         out[7] = (intMant >> 32) & 0xFF;
 168         out[8] = (intMant >> 24) & 0xFF;
 169         out[9] = (intMant >> 16) & 0xFF;
 170         out[10] = (intMant >> 8) & 0xFF;
 171         out[11] = intMant & 0xFF;
 172 }
 173
 174
 175 //
 176 // Convert a double to a DSP56001 style fixed point float.
 177 // Seems to be 23 bits of float value with 1 bit (MSB) for the sign.
 178 //
 179 uint32_t DoubleToDSPFloat(double d)
 180 {
 181         if (d >= 1)
 182         {
 183                 warn("DSP value clamped to +1.");
 184                 return 0x7FFFFF;
 185         }
 186         else if (d <= -1)
 187         {
 188                 warn("DSP value clamped to -1.");
 189                 return 0x800000;
 190         }
 191
 192         return trunc(round(ldexp(d, 23)));
 193 }
 194
 195
 196 //
 197 // Convert a host native floating point number to a fixed point number.
 198 //
 199 uint64_t DoubleToFixedPoint(double d, int intBits, int fracBits)
 200 {
 201         uint8_t signBit = (signbit(d) ? 1 : 0);
 202
 203         // Ensure what we're working on is positive...
 204         if (d < 0)
 205                 d *= -1;
 206
 207         double scaleFactor = (double)(1 << fracBits);
 208         uint64_t result = (uint64_t)(d * scaleFactor);
 209
 210         // Invert the result, if necessary
 211         if (signBit == 1)
 212                 result = (result = 0xFFFFFFFFFFFFFFFFLL) + 1;
 213
 214         return result;
 215 }
 216