about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: A. Unique TensorFlower <gardener@tensorflow.org> 2018-05-04 10:47:38 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-05-04 10:59:43 -0700
commit: e32c42a6deed1f8ed1dcdeaaba0acf74685c18e3 (patch)
tree: 4692efc20b5102e080e075e0d0edb888e59334b8
parent: 47f1bd90658dd6858fb4bbefd4ef8acbef4ca931 (diff)
Improve broadcast add implementation.
PiperOrigin-RevId: 195437679
-rw-r--r-- tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h 87
-rw-r--r-- tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h 81
2 files changed, 165 insertions, 3 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 3d6042c31f..4776726972 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -2593,7 +2593,7 @@ inline void Add(int left_shift, const uint8* input1_data,
}
#endif // NEON
- for (; i < size; i++) {
+ for (; i < size; ++i) {
const int32 input1_val = input1_offset + input1_data[i];
const int32 input2_val = input2_offset + input2_data[i];
const int32 shifted_input1_val = input1_val * (1 << left_shift);
@@ -2750,7 +2750,7 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
int32 output_activation_min,
int32 output_activation_max, uint8* output_data,
const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
+ gemmlowp::ScopedProfilingLabel label("BroadcastAddGeneric/8bit");
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
@@ -2799,6 +2799,60 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
}
}
+inline void BroadcastAddFivefold(
+ int y0, int y1, int y2, int y3, int y4, int left_shift,
+ const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
+ int32 input1_multiplier, int input1_shift, const uint8* input2_data,
+ const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
+ int input2_shift, int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit");
+
+ // Fivefold nested loops. The second input resets its position for each
+ // iteration of the second loop. The first input resets its position at the
+ // beginning of the fourth loop. The innermost loop is an elementwise add of
+ // sections of the arrays.
+ uint8* output_data_ptr = output_data;
+ const uint8* input1_data_ptr = input1_data;
+ const uint8* input2_data_reset = input2_data;
+ for (int i4 = 0; i4 < y4; ++i4) {
+ const uint8* input2_data_ptr;
+ for (int i3 = 0; i3 < y3; ++i3) {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2) {
+ for (int i1 = 0; i1 < y1; ++i1) {
+ for (int i0 = 0; i0 < y0; ++i0) {
+ const int32 input1_val = input1_offset + input1_data_ptr[i0];
+ const int32 input2_val = input2_offset + input2_data_ptr[i0];
+ const int32 shifted_input1_val = input1_val * (1 << left_shift);
+ const int32 shifted_input2_val = input2_val * (1 << left_shift);
+ const int32 scaled_input1_val =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input1_val, input1_multiplier, input1_shift);
+ const int32 scaled_input2_val =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input2_val, input2_multiplier, input2_shift);
+ const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32 raw_output =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ raw_sum, output_multiplier, output_shift) +
+ output_offset;
+ const int32 clamped_output =
+ std::min(output_activation_max,
+ std::max(output_activation_min, raw_output));
+ output_data_ptr[i0] = static_cast<uint8>(clamped_output);
+ }
+ input2_data_ptr += y0;
+ output_data_ptr += y0;
+ }
+ input1_data_ptr += y0;
+ }
+ }
+ input2_data_reset = input2_data_ptr;
+ }
+}
+
template <FusedActivationFunctionType Ac>
inline void BroadcastAdd(int left_shift, const uint8* input1_data,
const Dims<4>& input1_dims, int32 input1_offset,
@@ -2827,6 +2881,33 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
output_activation_max, output_data, output_dims);
}
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAddFivefold(
+ int y0, int y1, int y2, int y3, int y4, int left_shift,
+ const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
+ int32 input1_multiplier, int input1_shift, const uint8* input2_data,
+ const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
+ int input2_shift, int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims,
+ input1_offset, input1_multiplier, input1_shift,
+ input2_data, input2_dims, input2_offset,
+ input2_multiplier, input2_shift, output_offset,
+ output_multiplier, output_shift, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
const float* input2_data, const Dims<4>& input2_dims,
float output_activation_min, float output_activation_max,
@@ -4375,7 +4456,7 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
+ gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index d41ade4c9d..c6ed614593 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1189,6 +1189,60 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
}
}
+inline void BroadcastAddFivefold(
+ int y0, int y1, int y2, int y3, int y4, int left_shift,
+ const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
+ int32 input1_multiplier, int input1_shift, const uint8* input2_data,
+ const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
+ int input2_shift, int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit");
+
+ int sb1 = y0;
+ int sa2 = y0;
+ int sb2 = y0 * y1;
+ int sa3 = y0 * y2;
+ int sa4 = y0 * y2 * y3;
+ int sb4 = y0 * y1 * y2;
+
+ uint8* output_data_ptr = output_data;
+ for (int i4 = 0; i4 < y4; ++i4) {
+ for (int i3 = 0; i3 < y3; ++i3) {
+ for (int i2 = 0; i2 < y2; ++i2) {
+ for (int i1 = 0; i1 < y1; ++i1) {
+ for (int i0 = 0; i0 < y0; ++i0) {
+ const int32 input1_val =
+ input1_offset +
+ input1_data[i4 * sa4 + i3 * sa3 + i2 * sa2 + i0];
+ const int32 input2_val =
+ input2_offset +
+ input2_data[i4 * sb4 + i2 * sb2 + i1 * sb1 + i0];
+ const int32 shifted_input1_val = input1_val * (1 << left_shift);
+ const int32 shifted_input2_val = input2_val * (1 << left_shift);
+ const int32 scaled_input1_val =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input1_val, input1_multiplier, input1_shift);
+ const int32 scaled_input2_val =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input2_val, input2_multiplier, input2_shift);
+ const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32 raw_output =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ raw_sum, output_multiplier, output_shift) +
+ output_offset;
+ const int32 clamped_output =
+ std::min(output_activation_max,
+ std::max(output_activation_min, raw_output));
+ *output_data_ptr = static_cast<uint8>(clamped_output);
+ ++output_data_ptr;
+ }
+ }
+ }
+ }
+ }
+}
+
template <FusedActivationFunctionType Ac>
inline void BroadcastAdd(int left_shift, const uint8* input1_data,
const Dims<4>& input1_dims, int32 input1_offset,
@@ -1217,6 +1271,33 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
output_activation_max, output_data, output_dims);
}
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAddFivefold(
+ int y0, int y1, int y2, int y3, int y4, int left_shift,
+ const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
+ int32 input1_multiplier, int input1_shift, const uint8* input2_data,
+ const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
+ int input2_shift, int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims,
+ input1_offset, input1_multiplier, input1_shift,
+ input2_data, input2_dims, input2_offset,
+ input2_multiplier, input2_shift, output_offset,
+ output_multiplier, output_shift, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
const float* input2_data, const Dims<4>& input2_dims,
float output_activation_min, float output_activation_max,