aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--tensorflow/contrib/lite/kernels/fully_connected.cc7
-rw-r--r--tensorflow/contrib/lite/kernels/fully_connected_test.cc62
-rw-r--r--tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h17
-rw-r--r--tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h4
4 files changed, 79 insertions, 11 deletions
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
index bc370608c0..eaf5a67d67 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected.cc
@@ -121,10 +121,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
double real_multiplier = 0.0;
TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
context, input, filter, bias, output, &real_multiplier));
- TF_LITE_ENSURE(context, real_multiplier < 1.0);
- QuantizeMultiplierSmallerThanOneExp(
- real_multiplier, &data->output_multiplier, &data->output_shift);
- data->output_shift *= -1;
+ int exponent;
+ QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
+ data->output_shift = -exponent;
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
&data->output_activation_max));
diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
index ec94905697..08b4320946 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected_test.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
@@ -423,6 +423,37 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) {
ElementsAre(151, 152, 153, 185, 186, 187));
}
+TEST_P(QuantizedFullyConnectedOpTest,
+ SimpleTestQuantizedOutputMultiplierGreaterThan1) {
+ // real_multiplier = 2.
+ QuantizedFullyConnectedOpModel m(
+ GetRegistration(), /*units=*/3, /*batches*/ 2,
+ /*input=*/{TensorType_UINT8, {2, 10}, -127, 128},
+ /*output=*/{TensorType_UINT8, {}, -63.5, 64});
+
+ m.SetWeights({
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2
+ });
+ m.SetBias({1, 2, 3});
+
+ m.SetInput({
+ 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0
+ 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1
+ });
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+ ElementsAreArray(ArrayFloatNear({
+ 24, 25, 26, // first batch
+ 58, 59, 60, // second batch
+ })));
+ EXPECT_THAT(m.GetOutput<uint8_t>(),
+ ElementsAre(175, 177, 179, 243, 245, 247));
+}
+
void SimpleTestQuantizedInt16OutputCase(
TfLiteRegistration* registration, int input_depth, int output_depth,
int batches, FullyConnectedOptionsWeightsFormat weights_format) {
@@ -631,6 +662,37 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) {
ElementsAre(151, 152, 153, 185, 186, 187));
}
+TEST_P(QuantizedFullyConnectedOpTest,
+ SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1) {
+ // real_multiplier = 2.
+ QuantizedFullyConnectedOpModel m(
+ GetRegistration(), /*units=*/3, /*batches=*/2,
+ /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -127, 128},
+ /*output=*/{TensorType_UINT8, {}, -63.5, 64});
+
+ m.SetWeights({
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1
+ });
+ m.SetBias({1, 2, 3});
+
+ m.SetInput({
+ 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0
+ 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1
+ });
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+ ElementsAreArray(ArrayFloatNear({
+ 24, 25, 26, // first batch
+ 58, 59, 60, // second batch
+ })));
+ EXPECT_THAT(m.GetOutput<uint8_t>(),
+ ElementsAre(175, 177, 179, 243, 245, 247));
+}
+
INSTANTIATE_TEST_CASE_P(
FloatFullyConnectedOpTest, FloatFullyConnectedOpTest,
::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 6adb879c71..b870789772 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -893,6 +893,7 @@ inline void FullyConnectedAsGEMV(
const int input_size = FlatSizeSkipDim(input_dims, 3);
const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
static constexpr int kPeel = 4;
+ const bool shift_left = (output_shift <= 0);
for (int k = 0; k < input_size; k += 64) {
optimized_ops_preload_l1_stream(input_data + k);
}
@@ -1004,11 +1005,17 @@ inline void FullyConnectedAsGEMV(
int32x4_t bias_vec = vld1q_s32(bias_ptr);
bias_ptr += 4;
reduced = vaddq_s32(reduced, bias_vec);
- // Multiply by the fixed-point multiplier.
- reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
- // Rounding-shift-right.
- using gemmlowp::RoundingDivideByPOT;
- reduced = RoundingDivideByPOT(reduced, output_shift);
+ if (shift_left) {
+ const int32 multiplier_power_of_two = 1 << -output_shift;
+ reduced = vmulq_n_s32(reduced, multiplier_power_of_two);
+ reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+ } else {
+ // Multiply by the fixed-point multiplier.
+ reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+ // Rounding-shift-right.
+ using gemmlowp::RoundingDivideByPOT;
+ reduced = RoundingDivideByPOT(reduced, output_shift);
+ }
// Add the output offset.
const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
reduced = vaddq_s32(reduced, output_offset_vec);
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index ace3af2da0..f4176e474e 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -546,8 +546,8 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
if (bias_data) {
acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
}
- acc = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- acc, output_multiplier, kReverseShift * output_shift);
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+ kReverseShift * output_shift);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);