author     A. Unique TensorFlower <gardener@tensorflow.org>  2018-05-21 17:18:06 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>   2018-05-21 17:21:23 -0700
commit     31ca159c1c86cb983c78e134cc756489653228f2 (patch)
tree       5126ae478155cbbb3aa706f3cecc42383891237a
parent     c3587c5f25ed9dfc173476e61b1ec0445c2989be (diff)
Make the quantize_and_dequantize op use the full quantized range when possible.
PiperOrigin-RevId: 197487461
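For context, the change targets the full range of the simulated integer type instead of the previous symmetric range that dropped one bucket for signed output (e.g. [-127, 127] for 8 bits). A minimal sketch of the target range computation, mirroring the new kernel code below (illustration only, not part of the commit):

#include <cstdint>
#include <utility>

// Target quantized range after this change, e.g. [-128, 127] for signed
// 8-bit output and [0, 255] for unsigned 8-bit output. The previous signed
// behavior used the symmetric range [-127, 127].
std::pair<int64_t, int64_t> TargetQuantizedRange(bool signed_input, int num_bits) {
  const int64_t min_quantized =
      signed_input ? -(int64_t{1} << (num_bits - 1)) : 0;
  const int64_t max_quantized = min_quantized + ((int64_t{1} << num_bits) - 1);
  return {min_quantized, max_quantized};
}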
-rw-r--r--  tensorflow/core/kernels/quantize_and_dequantize_op.h        89
-rw-r--r--  tensorflow/core/kernels/quantize_and_dequantize_op_test.cc  46
2 files changed, 66 insertions, 69 deletions
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h
index 3b09ea2527..906d507c8a 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.h
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h
@@ -23,6 +23,8 @@ limitations under the License.
namespace tensorflow {
namespace functor {
+// TODO(pauldonnelly): 'signed_input' should really be called 'signed_output'.
+
template <typename Device, typename T>
struct QuantizeAndDequantizeOneScaleFunctor {
void operator()(const Device& d, typename TTypes<T>::ConstVec input,
@@ -49,56 +51,51 @@ struct QuantizeAndDequantizeOneScaleImpl {
d.memcpyDeviceToHost(&min_range, input_min.data(), sizeof(T));
d.memcpyDeviceToHost(&max_range, input_max.data(), sizeof(T));
- // Make sure the range is symmetric for signed quantization, or start from
- // 0 for unsigned quantization.
- max_range = std::max(std::abs(max_range), std::abs(min_range));
+ // Calculate the range for the simulated integer quantization:
+ // e.g. [-128,127] for signed = true, num_bits = 8,
+ // or [0, 255] for signed = false, num_bits = 8.
+ const int64 min_quantized = signed_input ? -(1ULL << (num_bits - 1)) : 0;
+ const int64 max_quantized = min_quantized + ((1ULL << num_bits) - 1);
- // If both min and max are 0, then the output should be just 0.
- if (max_range == 0) {
- out.device(d) = input.constant(T(0));
- return;
- }
+ // Determine the maximum scaling factor that would scale
+ // [min_range, max_range] to not exceed [min_quantized, max_quantized],
+ // while keeping 0 unchanged.
+ const T scale_from_min_side = (min_quantized * min_range > 0)
+ ? min_quantized / min_range
+ : std::numeric_limits<T>::max();
+ const T scale_from_max_side = (max_quantized * max_range > 0)
+ ? max_quantized / max_range
+ : std::numeric_limits<T>::max();
- if (signed_input) {
- min_range = -max_range;
+ // Note: Avoids changing the side of the range that determines scale.
+ T scale, inverse_scale;
+ if (scale_from_min_side < scale_from_max_side) {
+ scale = scale_from_min_side;
+ inverse_scale = min_range / min_quantized;
+ max_range = max_quantized * inverse_scale;
+ } else {
+ scale = scale_from_max_side;
+ inverse_scale = max_range / max_quantized;
+ min_range = min_quantized * inverse_scale;
+ }
- // If it is signed, we try to keep 0.0 being 0 and drop one bucket. For
- // example, if it is 8 bits, we have the range [-127, 127]. So for input
- // range of [-x, x], the scale should be 254/(2*x).
- T scale = static_cast<T>((uint64_t{1} << (num_bits - 1)) - 1) / max_range;
- T inverse_scale = T(1.0) / scale;
- if (range_given) {
- out.device(d) =
- ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) *
- scale +
- T(0.5))
- .floor() *
- inverse_scale +
- min_range;
- } else {
- // No need to compare with min and max as they are measured from the
- // tensor.
- out.device(d) =
- ((input - min_range) * scale + T(0.5)).floor() * inverse_scale +
- min_range;
- }
+ if (range_given) {
+ // Note: The clamping here is to avoid overflow in the quantized type.
+ // The semantics of the op do not guarantee clamping to the specified
+ // min_range and max_range, because we may have changed either min_range
+ // or max_range.
+ out.device(d) =
+ ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) * scale +
+ T(0.5))
+ .floor() *
+ inverse_scale +
+ min_range;
} else {
- min_range = 0;
- // If it is unsigned and num_bits == 8, the range with 8 bits is [0, 255].
- // If the input range is [0, x], then the scale is x/255 instead of 254 as
- // in the case above.
- T scale = static_cast<T>((uint64_t{1} << num_bits) - 1) / max_range;
- T inverse_scale = 1.0 / scale;
- if (range_given) {
- out.device(d) =
- ((input.cwiseMin(max_range).cwiseMax(min_range)) * scale + T(0.5))
- .floor() *
- inverse_scale;
- } else {
- // No need to compare with min and max as they are measured from the
- // tensor.
- out.device(d) = (input * scale + T(0.5)).floor() * inverse_scale;
- }
+ // No need to clamp to min_range and max_range in this case as they were
+ // measured from the tensor.
+ out.device(d) =
+ ((input - min_range) * scale + T(0.5)).floor() * inverse_scale +
+ min_range;
}
}
};
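Read outside the Eigen functor, the updated logic above amounts to the following standalone sketch (plain C++ on a float vector; an illustration of the same math under those assumptions, not the actual kernel):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

// Sketch of the scale selection and quantize/dequantize round trip performed
// by the updated functor for a single float tensor.
std::vector<float> QuantizeAndDequantizeSketch(const std::vector<float>& input,
                                               float min_range, float max_range,
                                               bool signed_input, int num_bits,
                                               bool range_given) {
  // Full target range, e.g. [-128, 127] for signed 8-bit.
  const int64_t min_quantized =
      signed_input ? -(int64_t{1} << (num_bits - 1)) : 0;
  const int64_t max_quantized = min_quantized + ((int64_t{1} << num_bits) - 1);

  // Largest scale that keeps [min_range, max_range] inside
  // [min_quantized, max_quantized] while mapping 0 to 0.
  const float scale_from_min_side = (min_quantized * min_range > 0)
                                        ? min_quantized / min_range
                                        : std::numeric_limits<float>::max();
  const float scale_from_max_side = (max_quantized * max_range > 0)
                                        ? max_quantized / max_range
                                        : std::numeric_limits<float>::max();

  // Use the binding side of the range to fix the scale; the other endpoint
  // is adjusted so that it lands exactly on min_quantized or max_quantized.
  float scale, inverse_scale;
  if (scale_from_min_side < scale_from_max_side) {
    scale = scale_from_min_side;
    inverse_scale = min_range / min_quantized;
    max_range = max_quantized * inverse_scale;
  } else {
    scale = scale_from_max_side;
    inverse_scale = max_range / max_quantized;
    min_range = min_quantized * inverse_scale;
  }

  std::vector<float> out;
  out.reserve(input.size());
  for (float x : input) {
    // Clamp only when the range was supplied by the caller.
    if (range_given) x = std::min(max_range, std::max(min_range, x));
    // Round-half-up quantization, then dequantization.
    out.push_back(std::floor((x - min_range) * scale + 0.5f) * inverse_scale +
                  min_range);
  }
  return out;
}

For the int8 tests below that compute the range from the tensor, this picks the scale from whichever side of the measured range is binding and dequantizes with the matching inverse scale, which is where the /128 values in the updated test comments come from.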
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
index e41df12d91..629c698503 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
@@ -105,13 +105,13 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8) {
AddInputFromArray<float>(TensorShape({}), {0.0}); // Min
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
- // With int8, the tensor is quantized to {-127, -63, 0, 38, 102, 70}.
+ // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
- // Scale is: 1/127
+ // Scale is: 1/128
- // Then it is dequantized to {-1, -63.0/127, 0, 38.0/127, 102.0/127, 70.0/127}
+ // Then it is dequantized to {-1, -0.5, 0, 38.0/128, 102.0/128, 71.0/128}
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(
- &expected, {-1, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127, 70.0 / 127});
+ test::FillValues<float>(&expected,
+ {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.
@@ -136,13 +136,13 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8_V3) {
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
AddInputFromArray<int32>(TensorShape({}), {8}); // num_bits
- // With int8, the tensor is quantized to {-127, -63, 0, 38, 102, 70}.
- // Scale is: 1/127
- // Then it is dequantized to {-1, -63.0/127, 0, 38.0/127, 102.0/127, 70.0/127}
+ // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
+ // Scale is: 1/128
+ // Then it is dequantized to {-1, -64.0/128, 0, 38.0/128, 102.0/128, 71.0/128}
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(
- &expected, {-1, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127, 70.0 / 127});
+ test::FillValues<float>(&expected,
+ {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.
@@ -166,12 +166,11 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4) {
AddInputFromArray<float>(TensorShape({}), {0.0}); // Min
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
- // With int4, the tensor is quantized to {-7, -3, 0, 2, 6, 4}.
- // Scale is: 1/7
+ // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}.
+ // Scale is: 1/8
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(&expected,
- {-1, -3.0 / 7, 0, 2.0 / 7, 6.0 / 7, 4.0 / 7});
+ test::FillValues<float>(&expected, {-1, -0.5, 0, 0.25, 0.75, 0.5});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.
@@ -196,12 +195,11 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4_V3) {
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
AddInputFromArray<int32>(TensorShape({}), {4}); // num_bits
- // With int4, the tensor is quantized to {-7, -3, 0, 2, 6, 4}.
- // Scale is: 1/7
+ // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}.
+ // Scale is: 1/8
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(&expected,
- {-1, -3.0 / 7, 0, 2.0 / 7, 6.0 / 7, 4.0 / 7});
+ test::FillValues<float>(&expected, {-1, -0.5, 0, 0.25, 0.75, 0.5});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.
@@ -228,13 +226,14 @@ TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given) {
AddInputFromArray<float>(TensorShape({}), {1.0}); // Max
// Note that the range is given as [-1, 1].
- // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -127,
+ // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -128,
// 127}.
// Scale is: 1/127
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 4}));
- test::FillValues<float>(&expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127,
- 102.0 / 127, 70.0 / 127, -1, 1});
+ test::FillValues<float>(
+ &expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127,
+ 70.0 / 127, -128.0 / 127, 1});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
}
@@ -258,13 +257,14 @@ TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given_V3) {
AddInputFromArray<int32>(TensorShape({}), {8}); // num_bits
// Note that the range is given as [-1, 1].
- // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -127,
+ // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -128,
// 127}.
// Scale is: 1/127
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 4}));
- test::FillValues<float>(&expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127,
- 102.0 / 127, 70.0 / 127, -1, 1});
+ test::FillValues<float>(
+ &expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127,
+ 70.0 / 127, -128.0 / 127, 1});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
}
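As a quick sanity check of the updated expectations, the dequantized outputs are just the quantized integers multiplied by the inverse scale (a standalone check, not part of the test file; the quantized values and inverse scales are taken from the updated comments above, not recomputed from the test inputs):

#include <cstdio>

int main() {
  // int8 test without a given range: quantized values {-128, -64, 0, 38, 102, 71},
  // inverse scale 1/128.
  const int q8[] = {-128, -64, 0, 38, 102, 71};
  for (int q : q8) std::printf("%g ", q / 128.0);  // -1 -0.5 0 0.296875 0.796875 0.554688
  std::printf("\n");

  // int4 test: quantized values {-8, -4, 0, 2, 6, 4}, inverse scale 1/8.
  const int q4[] = {-8, -4, 0, 2, 6, 4};
  for (int q : q4) std::printf("%g ", q / 8.0);  // -1 -0.5 0 0.25 0.75 0.5
  std::printf("\n");
}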