diff options
author | 2018-05-04 10:47:38 -0700 | |
---|---|---|
committer | 2018-05-04 10:59:43 -0700 | |
commit | e32c42a6deed1f8ed1dcdeaaba0acf74685c18e3 (patch) | |
tree | 4692efc20b5102e080e075e0d0edb888e59334b8 | |
parent | 47f1bd90658dd6858fb4bbefd4ef8acbef4ca931 (diff) |
Improve broadcast add implementation.
PiperOrigin-RevId: 195437679
-rw-r--r-- | tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h | 87 | ||||
-rw-r--r-- | tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h | 81 |
2 files changed, 165 insertions, 3 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 3d6042c31f..4776726972 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -2593,7 +2593,7 @@ inline void Add(int left_shift, const uint8* input1_data,
   }
 #endif  // NEON
 
-  for (; i < size; i++) {
+  for (; i < size; ++i) {
     const int32 input1_val = input1_offset + input1_data[i];
     const int32 input2_val = input2_offset + input2_data[i];
     const int32 shifted_input1_val = input1_val * (1 << left_shift);
@@ -2750,7 +2750,7 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                          int32 output_activation_min,
                          int32 output_activation_max, uint8* output_data,
                          const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
+  gemmlowp::ScopedProfilingLabel label("BroadcastAddGeneric/8bit");
 
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
@@ -2799,6 +2799,60 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
   }
 }
 
+inline void BroadcastAddFivefold(
+    int y0, int y1, int y2, int y3, int y4, int left_shift,
+    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
+    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
+    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
+    int input2_shift, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit");
+
+  // Fivefold nested loops. input2 rewinds to input2_data_reset on every i3
+  // iteration, so the same input2 data is reused across i3. input1 does not
+  // advance inside the i1 loop, so the same y0 input1 elements are reused
+  // across i1. The innermost loop is an elementwise add of y0 elements.
+  uint8* output_data_ptr = output_data;
+  const uint8* input1_data_ptr = input1_data;
+  const uint8* input2_data_reset = input2_data;
+  for (int i4 = 0; i4 < y4; ++i4) {
+    const uint8* input2_data_ptr = input2_data_reset;  // defined if y3 <= 0
+    for (int i3 = 0; i3 < y3; ++i3) {
+      input2_data_ptr = input2_data_reset;
+      for (int i2 = 0; i2 < y2; ++i2) {
+        for (int i1 = 0; i1 < y1; ++i1) {
+          for (int i0 = 0; i0 < y0; ++i0) {
+            const int32 input1_val = input1_offset + input1_data_ptr[i0];
+            const int32 input2_val = input2_offset + input2_data_ptr[i0];
+            const int32 shifted_input1_val = input1_val * (1 << left_shift);
+            const int32 shifted_input2_val = input2_val * (1 << left_shift);
+            const int32 scaled_input1_val =
+                MultiplyByQuantizedMultiplierSmallerThanOne(
+                    shifted_input1_val, input1_multiplier, input1_shift);
+            const int32 scaled_input2_val =
+                MultiplyByQuantizedMultiplierSmallerThanOne(
+                    shifted_input2_val, input2_multiplier, input2_shift);
+            const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+            const int32 raw_output =
+                MultiplyByQuantizedMultiplierSmallerThanOne(
+                    raw_sum, output_multiplier, output_shift) +
+                output_offset;
+            const int32 clamped_output =
+                std::min(output_activation_max,
+                         std::max(output_activation_min, raw_output));
+            output_data_ptr[i0] = static_cast<uint8>(clamped_output);
+          }
+          input2_data_ptr += y0;
+          output_data_ptr += y0;
+        }
+        input1_data_ptr += y0;  // advance input1 once per i2 iteration
+      }
+    }
+    input2_data_reset = input2_data_ptr;  // next i4 starts where input2 ended
+  }
+}
+
 template <FusedActivationFunctionType Ac>
 inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                          const Dims<4>& input1_dims, int32 input1_offset,
@@ -2827,6 +2881,33 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                output_activation_max, output_data, output_dims);
 }
 
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAddFivefold(
+    int y0, int y1, int y2, int y3, int y4, int left_shift,
+    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
+    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
+    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
+    int input2_shift, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_data, const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);    // kNone => [0, 255]
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims,
+                       input1_offset, input1_multiplier, input1_shift,
+                       input2_data, input2_dims, input2_offset,
+                       input2_multiplier, input2_shift, output_offset,
+                       output_multiplier, output_shift, output_activation_min,
+                       output_activation_max, output_data, output_dims);
+}
+
 inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
@@ -4375,7 +4456,7 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
-gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
+  gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index d41ade4c9d..c6ed614593 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1189,6 +1189,60 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
   }
 }
 
+inline void BroadcastAddFivefold(
+    int y0, int y1, int y2, int y3, int y4, int left_shift,
+    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
+    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
+    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
+    int input2_shift, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit");
+
+  int sb1 = y0;            // input2 index multiplier for i1
+  int sa2 = y0;            // input1 index multiplier for i2
+  int sb2 = y0 * y1;       // input2 index multiplier for i2
+  int sa3 = y0 * y2;       // input1 index multiplier for i3
+  int sa4 = y0 * y2 * y3;  // input1 index multiplier for i4
+  int sb4 = y0 * y1 * y2;  // input2 index multiplier for i4
+
+  uint8* output_data_ptr = output_data;
+  for (int i4 = 0; i4 < y4; ++i4) {
+    for (int i3 = 0; i3 < y3; ++i3) {
+      for (int i2 = 0; i2 < y2; ++i2) {
+        for (int i1 = 0; i1 < y1; ++i1) {
+          for (int i0 = 0; i0 < y0; ++i0) {
+            const int32 input1_val =
+                input1_offset +  // input1 index has no i1 term
+                input1_data[i4 * sa4 + i3 * sa3 + i2 * sa2 + i0];
+            const int32 input2_val =
+                input2_offset +  // input2 index has no i3 term
+                input2_data[i4 * sb4 + i2 * sb2 + i1 * sb1 + i0];
+            const int32 shifted_input1_val = input1_val * (1 << left_shift);
+            const int32 shifted_input2_val = input2_val * (1 << left_shift);
+            const int32 scaled_input1_val =
+                MultiplyByQuantizedMultiplierSmallerThanOne(
+                    shifted_input1_val, input1_multiplier, input1_shift);
+            const int32 scaled_input2_val =
+                MultiplyByQuantizedMultiplierSmallerThanOne(
+                    shifted_input2_val, input2_multiplier, input2_shift);
+            const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+            const int32 raw_output =
+                MultiplyByQuantizedMultiplierSmallerThanOne(
+                    raw_sum, output_multiplier, output_shift) +
+                output_offset;
+            const int32 clamped_output =
+                std::min(output_activation_max,
+                         std::max(output_activation_min, raw_output));
+            *output_data_ptr = static_cast<uint8>(clamped_output);
+            ++output_data_ptr;  // output is written sequentially
+          }
+        }
+      }
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac>
 inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                          const Dims<4>& input1_dims, int32 input1_offset,
@@ -1217,6 +1271,33 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                output_activation_max, output_data, output_dims);
 }
 
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAddFivefold(
+    int y0, int y1, int y2, int y3, int y4, int left_shift,
+    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
+    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
+    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
+    int input2_shift, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_data, const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);    // kNone => [0, 255]
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims,
+                       input1_offset, input1_multiplier, input1_shift,
+                       input2_data, input2_dims, input2_offset,
+                       input2_multiplier, input2_shift, output_offset,
+                       output_multiplier, output_shift, output_activation_min,
+                       output_activation_max, output_data, output_dims);
+}
+
 inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,