diff options
4 files changed, 79 insertions, 11 deletions
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc index bc370608c0..eaf5a67d67 100644 --- a/tensorflow/contrib/lite/kernels/fully_connected.cc +++ b/tensorflow/contrib/lite/kernels/fully_connected.cc @@ -121,10 +121,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( context, input, filter, bias, output, &real_multiplier)); - TF_LITE_ENSURE(context, real_multiplier < 1.0); - QuantizeMultiplierSmallerThanOneExp( - real_multiplier, &data->output_multiplier, &data->output_shift); - data->output_shift *= -1; + int exponent; + QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); + data->output_shift = -exponent; TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized( context, params->activation, output, &data->output_activation_min, &data->output_activation_max)); diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc index ec94905697..08b4320946 100644 --- a/tensorflow/contrib/lite/kernels/fully_connected_test.cc +++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc @@ -423,6 +423,37 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) { ElementsAre(151, 152, 153, 185, 186, 187)); } +TEST_P(QuantizedFullyConnectedOpTest, + SimpleTestQuantizedOutputMultiplierGreaterThan1) { + // real_multiplier = 2. + QuantizedFullyConnectedOpModel m( + GetRegistration(), /*units=*/3, /*batches*/ 2, + /*input=*/{TensorType_UINT8, {2, 10}, -127, 128}, + /*output=*/{TensorType_UINT8, {}, -63.5, 64}); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + m.SetBias({1, 2, 3}); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear({ + 24, 25, 26, // first batch + 58, 59, 60, // second batch + }))); + EXPECT_THAT(m.GetOutput<uint8_t>(), + ElementsAre(175, 177, 179, 243, 245, 247)); +} + void SimpleTestQuantizedInt16OutputCase( TfLiteRegistration* registration, int input_depth, int output_depth, int batches, FullyConnectedOptionsWeightsFormat weights_format) { @@ -631,6 +662,37 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) { ElementsAre(151, 152, 153, 185, 186, 187)); } +TEST_P(QuantizedFullyConnectedOpTest, + SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1) { + // real_multiplier = 2. + QuantizedFullyConnectedOpModel m( + GetRegistration(), /*units=*/3, /*batches=*/2, + /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -127, 128}, + /*output=*/{TensorType_UINT8, {}, -63.5, 64}); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + }); + m.SetBias({1, 2, 3}); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear({ + 24, 25, 26, // first batch + 58, 59, 60, // second batch + }))); + EXPECT_THAT(m.GetOutput<uint8_t>(), + ElementsAre(175, 177, 179, 243, 245, 247)); +} + INSTANTIATE_TEST_CASE_P( FloatFullyConnectedOpTest, FloatFullyConnectedOpTest, ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap))); diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h index 6adb879c71..b870789772 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h @@ -893,6 +893,7 @@ inline void FullyConnectedAsGEMV( const int input_size = FlatSizeSkipDim(input_dims, 3); const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0); static constexpr int kPeel = 4; + const bool shift_left = (output_shift <= 0); for (int k = 0; k < input_size; k += 64) { optimized_ops_preload_l1_stream(input_data + k); } @@ -1004,11 +1005,17 @@ inline void FullyConnectedAsGEMV( int32x4_t bias_vec = vld1q_s32(bias_ptr); bias_ptr += 4; reduced = vaddq_s32(reduced, bias_vec); - // Multiply by the fixed-point multiplier. - reduced = vqrdmulhq_n_s32(reduced, output_multiplier); - // Rounding-shift-right. - using gemmlowp::RoundingDivideByPOT; - reduced = RoundingDivideByPOT(reduced, output_shift); + if (shift_left) { + const int32 multiplier_power_of_two = 1 << -output_shift; + reduced = vmulq_n_s32(reduced, multiplier_power_of_two); + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + } else { + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + // Rounding-shift-right. + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, output_shift); + } // Add the output offset. const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); reduced = vaddq_s32(reduced, output_offset_vec); diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index ace3af2da0..f4176e474e 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -546,8 +546,8 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, if (bias_data) { acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)]; } - acc = MultiplyByQuantizedMultiplierSmallerThanOneExp( - acc, output_multiplier, kReverseShift * output_shift); + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + kReverseShift * output_shift); acc += output_offset; acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); |