Replace floating point functionality with integer alternative for microcontrollers

PiperOrigin-RevId: 211543125
author: Pete Warden <petewarden@google.com> 2018-09-04 16:09:43 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-09-04 16:20:09 -0700
commit: 7a2f0e251951fff033c57970a76d7339a79fc185 (patch)
tree: a2a422b49c3913111d36389d7e724eea1b18c4a5 /tensorflow/contrib/lite/kernels/internal
parent: ffcbd466a04f6c65623882dd4657d2e558521bb9 (diff)
3 files changed, 381 insertions, 0 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
index f882f9910e..544ef16ce1 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
@@ -23,6 +23,32 @@ limitations under the License.
 
 namespace tflite {
 
+namespace {
+// These constants are used to manipulate the binary representation of doubles.
+// Double-precision binary64 floating point format is:
+// Bit |  63  |  62-52   |   51-0   |
+//     | Sign | Exponent | Fraction |
+// To avoid 64-bit integers as much as possible, I break this into high and
+// low 32-bit chunks. High is:
+// Bit |  31  |  30-20   |      19-0     |
+//     | Sign | Exponent | High Fraction |
+// Low is:
+// Bit |     31-0     |
+//     | Low Fraction |
+// We then access the components through logical bit-wise operations to
+// extract the parts needed, with the positions and masks derived from the
+// layout shown above.
+constexpr uint64_t kSignMask = 0x8000000000000000LL;
+constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;
+constexpr int32_t kExponentShift = 52;
+constexpr int32_t kExponentBias = 1023;
+constexpr uint32_t kExponentIsBadNum = 0x7ff;
+constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
+constexpr uint32_t kFractionShift = 22;
+constexpr uint32_t kFractionRoundingMask = 0x003fffff;
+constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
+}  // namespace
+
 void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
                         int* shift) {
   if (double_multiplier == 0.) {
@@ -30,8 +56,16 @@ void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
     *shift = 0;
     return;
   }
+#ifdef TFLITE_EMULATE_FLOAT
+  // If we're trying to avoid the use of floating-point instructions (for
+  // example on microcontrollers) then use an alternative implementation
+  // that only requires integer and bitwise operations. To enable this, you
+  // need to set the define during the build process for your platform.
+  int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
+#else   // TFLITE_EMULATE_FLOAT
   const double q = std::frexp(double_multiplier, shift);
   auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
+#endif  // TFLITE_EMULATE_FLOAT
   TFLITE_CHECK(q_fixed <= (1ll << 31));
   if (q_fixed == (1ll << 31)) {
     q_fixed /= 2;
@@ -60,6 +94,163 @@ void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
   *left_shift = shift;
 }
 
+int64_t IntegerFrExp(double input, int* shift) {
+  // Make sure our assumptions about the double layout hold.
+  TFLITE_CHECK_EQ(8, sizeof(double));
+
+  // We want to access the bits of the input double value directly, which is
+  // tricky to do safely, so use a union to handle the casting.
+  union {
+    double double_value;
+    uint64_t double_as_uint;
+  } cast_union;
+  cast_union.double_value = input;
+  const uint64_t u = cast_union.double_as_uint;
+
+  // If the bitfield is all zeros apart from the sign bit, this is a normalized
+  // zero value, so return standard values for this special case.
+  if ((u & ~kSignMask) == 0) {
+    *shift = 0;
+    return 0;
+  }
+
+  // Deal with NaNs and Infs, which are always indicated with a fixed pattern in
+  // the exponent, and distinguished by whether the fractions are zero or
+  // non-zero.
+  const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
+  if (exponent_part == kExponentIsBadNum) {
+    *shift = std::numeric_limits<int>::max();
+    if (u & kFractionMask) {
+      // NaN, so just return zero (with the exponent set to INT_MAX).
+      return 0;
+    } else {
+      // Infinity, so return +/- INT_MAX.
+      if (u & kSignMask) {
+        return std::numeric_limits<int64_t>::min();
+      } else {
+        return std::numeric_limits<int64_t>::max();
+      }
+    }
+  }
+
+  // The shift is fairly easy to extract from the high bits of the double value,
+  // just by masking it out and applying a bias. The std::frexp() implementation
+  // always returns values between 0.5 and 1.0 though, whereas the exponent
+  // assumes 1.0 to 2.0 is the standard range, so I add on one to match that
+  // interface.
+  *shift = (exponent_part - kExponentBias) + 1;
+
+  // There's an implicit high bit in the double format definition, so make sure
+  // we include that at the top, and then reconstruct the rest of the fractional
+  // value from the remaining fragments.
+  int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);
+
+  // We're cutting off some bits at the bottom, so to exactly match the standard
+  // frexp implementation here we'll apply rounding by adding one to the least
+  // significant bit of the result if the discarded portion is over half of the
+  // maximum.
+  if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
+    fraction += 1;
+  }
+  // Negate the fraction if the sign bit was set.
+  if (u & kSignMask) {
+    fraction *= -1;
+  }
+
+  return fraction;
+}
+
+double DoubleFromFractionAndShift(int64_t fraction, int shift) {
+  union {
+    double double_value;
+    uint64_t double_as_uint;
+  } result;
+
+  // Detect NaNs and infinities.
+  if (shift == std::numeric_limits<int>::max()) {
+    if (fraction == 0) {
+      return NAN;
+    } else if (fraction > 0) {
+      return INFINITY;
+    } else {
+      return -INFINITY;
+    }
+  }
+
+  // Return a normalized zero for a zero fraction.
+  if (fraction == 0) {
+    result.double_as_uint = 0;
+    return result.double_value;
+  }
+
+  bool is_negative = (fraction < 0);
+  int64_t encoded_fraction = is_negative ? -fraction : fraction;
+  int64_t encoded_shift = (shift - 1);
+  while (encoded_fraction < 0x40000000) {
+    encoded_fraction *= 2;
+    encoded_shift -= 1;
+  }
+  while (encoded_fraction > 0x80000000) {
+    encoded_fraction /= 2;
+    encoded_shift += 1;
+  }
+  encoded_fraction -= 0x40000000;
+  if (encoded_shift < -1022) {
+    encoded_shift = -1023;
+  } else if (encoded_shift > 1022) {
+    encoded_shift = 1023;
+  }
+  encoded_shift += kExponentBias;
+  uint64_t encoded_sign = is_negative ? kSignMask : 0;
+  result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
+                          (encoded_fraction << kFractionShift);
+  return result.double_value;
+}
+
+double IntegerDoubleMultiply(double a, double b) {
+  int a_shift;
+  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
+  int b_shift;
+  const int64_t b_fraction = IntegerFrExp(b, &b_shift);
+  // Detect NaNs and infinities.
+  if (a_shift == std::numeric_limits<int>::max() ||
+      (b_shift == std::numeric_limits<int>::max())) {
+    return NAN;
+  }
+  const int result_shift = a_shift + b_shift + 1;
+  const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
+  return DoubleFromFractionAndShift(result_fraction, result_shift);
+}
+
+int IntegerDoubleCompare(double a, double b) {
+  int a_shift;
+  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
+  int b_shift;
+  const int64_t b_fraction = IntegerFrExp(b, &b_shift);
+
+  // Detect NaNs and infinities.
+  if (a_shift == std::numeric_limits<int>::max() ||
+      (b_shift == std::numeric_limits<int>::max())) {
+    return 1;
+  }
+
+  if ((a_fraction == 0) && (b_fraction < 0)) {
+    return 1;
+  } else if ((a_fraction < 0) && (b_fraction == 0)) {
+    return -1;
+  } else if (a_shift < b_shift) {
+    return -1;
+  } else if (a_shift > b_shift) {
+    return 1;
+  } else if (a_fraction < b_fraction) {
+    return -1;
+  } else if (a_fraction > b_fraction) {
+    return 1;
+  } else {
+    return 0;
+  }
+}
+
 void PreprocessSoftmaxScaling(double beta, double input_scale,
                               int input_integer_bits,
                               int32_t* quantized_multiplier, int* left_shift) {
@@ -72,8 +263,20 @@ void PreprocessSoftmaxScaling(double beta, double input_scale,
   // result is double equivalent of Q0.31 (actually with more precision). Thus
   // this generates a Q(input_integer_bits).(31-input_integer_bits)
   // representation.
+#ifdef TFLITE_EMULATE_FLOAT
+  const double input_beta = IntegerDoubleMultiply(beta, input_scale);
+  int shift;
+  int64_t fraction = IntegerFrExp(input_beta, &shift);
+  shift += (31 - input_integer_bits);
+  double input_beta_real_multiplier =
+      DoubleFromFractionAndShift(fraction, shift);
+  if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) {
+    input_beta_real_multiplier = (1ll << 31) - 1.0;
+  }
+#else   // TFLITE_EMULATE_FLOAT
   const double input_beta_real_multiplier = std::min(
       beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
+#endif  // TFLITE_EMULATE_FLOAT
 
   QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
                                    quantized_multiplier, left_shift);
@@ -97,6 +300,12 @@ void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
 }
 
 int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
+#ifdef TFLITE_EMULATE_FLOAT
+  int64_t result = (1 << input_integer_bits) - 1;
+  result <<= (31 - input_integer_bits);
+  result >>= input_left_shift;
+  return result;
+#else   // TFLITE_EMULATE_FLOAT
   const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
                                     (1ll << (31 - input_integer_bits)) /
                                     (1ll << input_left_shift);
@@ -104,6 +313,7 @@ int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
   // After scaling the difference, the result would be at the maximum.  Thus we
   // must ensure that our value has lower magnitude.
   return static_cast<int>(std::floor(max_input_rescaled));
+#endif  // TFLITE_EMULATE_FLOAT
 }
 
 void NudgeQuantizationRange(const float min, const float max,
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
index 9ee4a47fbb..d74a1bac97 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
@@ -195,6 +195,44 @@ void QuantizeMultiplierGreaterThanOne(double double_multiplier,
 void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
                         int* shift);
 
+// Splits a double input value into a returned fraction, and a shift value from
+// the exponent, using only bitwise and integer operations to support
+// microcontrollers and other environments without floating-point support.
+//
+// This is designed to be a replacement for how std::frexp() is used within the
+// QuantizeMultiplier() function, and so has a different signature than the
+// standard version, returning a 64-bit integer rather than a double. This
+// result has a maximum value of 1<<31, with the fraction expressed as a
+// proportion of that maximum.
+//
+// std::frexp() returns NaNs and infinities unmodified, but since we're
+// returning integers that can't represent those values, instead we return
+// a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64
+// result of 0 for NaNs, std:numeric_limits<int64_t>::max() for +INFINITY, and
+// std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will
+// result in return values that end up truncating some bits at the end,
+// reflecting the loss of precision inherent in denormalization.
+int64_t IntegerFrExp(double input, int* shift);
+
+// Converts an integer fraction in the format produced by IntegerFrExp (where
+// 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an
+// IEEE binary64 double format result. The implementation uses only integer and
+// bitwise operators, so no floating point hardware support or emulation is
+// needed. This is here so quantized operations can run non-time-critical
+// preparation calculations on microcontrollers and other platforms without
+// float support.
+double DoubleFromFractionAndShift(int64_t fraction, int shift);
+
+// Performs a multiplication of two numbers in double format, using only integer
+// and bitwise instructions. This is aimed at supporting housekeeping functions
+// for quantized operations on microcontrollers without floating-point hardware.
+double IntegerDoubleMultiply(double a, double b);
+
+// Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is
+// greater than b. It is implemented using only integer and logical instructions
+// so that it can be easily run on microcontrollers for quantized operations.
+int IntegerDoubleCompare(double a, double b);
+
 // This first creates a multiplier in a double equivalent of
 // Q(input_integer_bits).(31-input_integer_bits) representation, with extra
 // precision in the double's fractional bits.  It then splits the result into
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
index 00fc3e91dc..14281f25c6 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
@@ -191,6 +191,139 @@ TEST(QuantizationUtilTest, ChooseQuantizationParamsZeroPointOnMaxBoundary) {
   EXPECT_EQ(qp.zero_point, 255);
 }
 
+TEST(QuantizationUtilTest, IntegerFrExp) {
+  int shift;
+  int64_t result = IntegerFrExp(0.0, &shift);
+  EXPECT_EQ(0, result);
+  EXPECT_EQ(0, shift);
+
+  result = IntegerFrExp(1.0, &shift);
+  EXPECT_NEAR(0x40000000, result, 1);
+  EXPECT_EQ(1, shift);
+
+  result = IntegerFrExp(0.25, &shift);
+  EXPECT_NEAR(0x40000000, result, 1);
+  EXPECT_EQ(-1, shift);
+
+  result = IntegerFrExp(-1.0, &shift);
+  EXPECT_NEAR(-(1 << 30), result, 1);
+  EXPECT_EQ(1, shift);
+
+  result = IntegerFrExp(123.45, &shift);
+  EXPECT_NEAR(2071147315, result, 1);
+  EXPECT_EQ(7, shift);
+
+  result = IntegerFrExp(NAN, &shift);
+  EXPECT_NEAR(0, result, 1);
+  EXPECT_EQ(0x7fffffff, shift);
+
+  result = IntegerFrExp(INFINITY, &shift);
+  EXPECT_NEAR(std::numeric_limits<int64_t>::max(), result, 1);
+  EXPECT_EQ(0x7fffffff, shift);
+
+  result = IntegerFrExp(-INFINITY, &shift);
+  EXPECT_NEAR(std::numeric_limits<int64_t>::min(), result, 1);
+  EXPECT_EQ(0x7fffffff, shift);
+}
+
+TEST(QuantizationUtilTest, IntegerFrExpVersusDouble) {
+  int shift;
+  int32_t result = IntegerFrExp(0.0, &shift);
+  EXPECT_EQ(result, 0);
+  EXPECT_EQ(shift, 0);
+
+  int double_shift;
+  double double_result = std::frexp(0.0, &double_shift);
+  EXPECT_EQ(double_result, 0);
+  EXPECT_EQ(double_shift, 0);
+
+  result = IntegerFrExp(1.0, &shift);
+  EXPECT_NEAR(result, 0x40000000, 1);
+  EXPECT_EQ(shift, 1);
+  double_result = std::frexp(1.0, &double_shift);
+  EXPECT_NEAR(double_result, 0.5, 1e-5);
+  EXPECT_EQ(double_shift, 1);
+
+  result = IntegerFrExp(0.25, &shift);
+  EXPECT_NEAR(result, 0x40000000, 1);
+  EXPECT_EQ(shift, -1);
+  double_result = std::frexp(0.25, &double_shift);
+  EXPECT_NEAR(double_result, 0.5, 1e-5);
+  EXPECT_EQ(double_shift, -1);
+
+  result = IntegerFrExp(-1.0, &shift);
+  EXPECT_NEAR(result, -(1 << 30), 1);
+  EXPECT_EQ(shift, 1);
+  double_result = std::frexp(-1.0, &double_shift);
+  EXPECT_NEAR(double_result, -0.5, 1e-5);
+  EXPECT_EQ(double_shift, 1);
+
+  result = IntegerFrExp(123.45, &shift);
+  EXPECT_NEAR(result, (0.964453 * (1L << 31)), 1000);
+  EXPECT_EQ(shift, 7);
+  double_result = std::frexp(123.45, &double_shift);
+  EXPECT_NEAR(double_result, 0.964453, 1e-5);
+  EXPECT_EQ(double_shift, 7);
+}
+
+TEST(QuantizationUtilTest, DoubleFromFractionAndShift) {
+  double result = DoubleFromFractionAndShift(0, 0);
+  EXPECT_EQ(0, result);
+
+  result = DoubleFromFractionAndShift(0x40000000, 1);
+  EXPECT_NEAR(1.0, result, 1e-5);
+
+  result = DoubleFromFractionAndShift(0x40000000, 2);
+  EXPECT_NEAR(2.0, result, 1e-5);
+
+  int shift;
+  int64_t fraction = IntegerFrExp(3.0, &shift);
+  result = DoubleFromFractionAndShift(fraction, shift);
+  EXPECT_NEAR(3.0, result, 1e-5);
+
+  fraction = IntegerFrExp(123.45, &shift);
+  result = DoubleFromFractionAndShift(fraction, shift);
+  EXPECT_NEAR(123.45, result, 1e-5);
+
+  fraction = IntegerFrExp(-23.232323, &shift);
+  result = DoubleFromFractionAndShift(fraction, shift);
+  EXPECT_NEAR(-23.232323, result, 1e-5);
+
+  fraction = IntegerFrExp(NAN, &shift);
+  result = DoubleFromFractionAndShift(fraction, shift);
+  EXPECT_TRUE(std::isnan(result));
+
+  fraction = IntegerFrExp(INFINITY, &shift);
+  result = DoubleFromFractionAndShift(fraction, shift);
+  EXPECT_FALSE(std::isfinite(result));
+}
+
+TEST(QuantizationUtilTest, IntegerDoubleMultiply) {
+  EXPECT_NEAR(1.0, IntegerDoubleMultiply(1.0, 1.0), 1e-5);
+  EXPECT_NEAR(2.0, IntegerDoubleMultiply(1.0, 2.0), 1e-5);
+  EXPECT_NEAR(2.0, IntegerDoubleMultiply(2.0, 1.0), 1e-5);
+  EXPECT_NEAR(4.0, IntegerDoubleMultiply(2.0, 2.0), 1e-5);
+  EXPECT_NEAR(0.5, IntegerDoubleMultiply(1.0, 0.5), 1e-5);
+  EXPECT_NEAR(0.25, IntegerDoubleMultiply(0.5, 0.5), 1e-5);
+  EXPECT_NEAR(-1.0, IntegerDoubleMultiply(1.0, -1.0), 1e-5);
+  EXPECT_NEAR(-1.0, IntegerDoubleMultiply(-1.0, 1.0), 1e-5);
+  EXPECT_NEAR(1.0, IntegerDoubleMultiply(-1.0, -1.0), 1e-5);
+  EXPECT_NEAR(15000000.0, IntegerDoubleMultiply(3000.0, 5000.0), 1e-5);
+  EXPECT_TRUE(std::isnan(IntegerDoubleMultiply(NAN, 5000.0)));
+  EXPECT_TRUE(std::isnan(IntegerDoubleMultiply(3000.0, NAN)));
+}
+
+TEST(QuantizationUtilTest, IntegerDoubleCompare) {
+  EXPECT_EQ(-1, IntegerDoubleCompare(0.0, 1.0));
+  EXPECT_EQ(1, IntegerDoubleCompare(1.0, 0.0));
+  EXPECT_EQ(0, IntegerDoubleCompare(1.0, 1.0));
+  EXPECT_EQ(0, IntegerDoubleCompare(0.0, 0.0));
+  EXPECT_EQ(-1, IntegerDoubleCompare(-10.0, 10.0));
+  EXPECT_EQ(1, IntegerDoubleCompare(123.45, 10.0));
+  EXPECT_EQ(1, IntegerDoubleCompare(NAN, INFINITY));
+  EXPECT_EQ(1, IntegerDoubleCompare(INFINITY, NAN));
+}
+
 #ifdef GTEST_HAS_DEATH_TEST
 TEST(QuantizationUtilTest, ChooseQuantizationParamsInvalidRange) {
   EXPECT_DEATH(ChooseQuantizationParams<uint8>(10.0, -30.0), "");
author	Pete Warden <petewarden@google.com>	2018-09-04 16:09:43 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-09-04 16:20:09 -0700
commit	7a2f0e251951fff033c57970a76d7339a79fc185 (patch)
tree	a2a422b49c3913111d36389d7e724eea1b18c4a5 /tensorflow/contrib/lite/kernels/internal
parent	ffcbd466a04f6c65623882dd4657d2e558521bb9 (diff)