aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h')
-rw-r--r--tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h797
1 files changed, 192 insertions, 605 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index c857fdf699..78567d52ea 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -42,10 +42,12 @@ namespace optimized_ops {
// Unoptimized reference ops:
using reference_ops::ArgMax;
using reference_ops::ArgMinMax;
+using reference_ops::BroadcastAdd4DSlow;
using reference_ops::BroadcastGreater;
using reference_ops::BroadcastGreaterEqual;
using reference_ops::BroadcastLess;
using reference_ops::BroadcastLessEqual;
+using reference_ops::BroadcastSub4DSlow;
using reference_ops::Concatenation;
using reference_ops::DepthConcatenation;
using reference_ops::Dequantize;
@@ -217,98 +219,6 @@ SaturatingRoundingMultiplyByPOTParam(
SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
}
-// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
-// BROADCASTING.
-//
-// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
-// rectangular array of numbers.
-//
-// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
-// However, as Dims<N> is to be deprecated, this class exists as an adaptor
-// to enable simple unoptimized implementations of element-wise broadcasting
-// operations.
-template <int N>
-struct NdArrayDesc {
- // The "extent" of each dimension. Indices along dimension d must be in the
- // half-open interval [0, extents[d]).
- int extents[N];
-
- // The number of *elements* (not bytes) between consecutive indices of each
- // dimension.
- int strides[N];
-};
-
-// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
-// ELEMENT-WISE BROADCASTING.
-//
-// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
-inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
- int i3) {
- TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
- TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
- TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
- TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
- return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
- i3 * desc.strides[3];
-}
-
-// Given the dimensions of the operands for an element-wise binary broadcast,
-// adjusts them so that they can be directly iterated over with simple loops.
-// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
-// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
-//
-// This function assumes that the two input shapes are compatible up to
-// broadcasting and the shorter one has already been prepended with 1s to be the
-// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
-// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
-// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
-// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
-//
-// When two shapes are compatible up to broadcasting, for each dimension d,
-// the input extents are either equal, or one of them is 1.
-//
-// This function performs the following for each dimension d:
-// - If the extents are equal, then do nothing since the loop that walks over
-// both of the input arrays is correct.
-// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
-// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
-// array0 to be referenced *at any index* in dimension d and still access the
-// same slice.
-template <int N>
-inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
- const Dims<N>& input1_dims,
- NdArrayDesc<N>* desc0_out,
- NdArrayDesc<N>* desc1_out) {
- TFLITE_DCHECK(desc0_out != nullptr);
- TFLITE_DCHECK(desc1_out != nullptr);
-
- // Copy dims to desc.
- for (int i = 0; i < N; ++i) {
- desc0_out->extents[i] = input0_dims.sizes[i];
- desc0_out->strides[i] = input0_dims.strides[i];
- desc1_out->extents[i] = input1_dims.sizes[i];
- desc1_out->strides[i] = input1_dims.strides[i];
- }
-
- // Walk over each dimension. If the extents are equal do nothing.
- // Otherwise, set the desc with extent 1 to have extent equal to the other and
- // stride 0.
- for (int i = 0; i < N; ++i) {
- const int extent0 = ArraySize(input0_dims, i);
- const int extent1 = ArraySize(input1_dims, i);
- if (extent0 != extent1) {
- if (extent0 == 1) {
- desc0_out->strides[i] = 0;
- desc0_out->extents[i] = extent1;
- } else {
- TFLITE_DCHECK_EQ(extent1, 1);
- desc1_out->strides[i] = 0;
- desc1_out->extents[i] = extent0;
- }
- }
- }
-}
-
inline bool AreSameDims(const Dims<4>& dims1, const Dims<4>& dims2) {
for (int i = 0; i < 4; i++) {
if (dims1.sizes[i] != dims2.sizes[i]) {
@@ -2478,20 +2388,17 @@ inline void L2Normalization(const uint8* input_data,
}
}
-inline void Add(const float* input1_data, const Dims<4>& input1_dims,
- const float* input2_data, const Dims<4>& input2_dims,
- float output_activation_min, float output_activation_max,
- float* output_data, const Dims<4>& output_dims) {
+inline void Add(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape, const float* input1_data,
+ const RuntimeShape& input2_shape, const float* input2_data,
+ const RuntimeShape& output_shape, float* output_data) {
gemmlowp::ScopedProfilingLabel label("Add");
- TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
- TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
- TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
int i = 0;
- const int size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+ const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
#ifdef USE_NEON
- const auto activation_min = vdupq_n_f32(output_activation_min);
- const auto activation_max = vdupq_n_f32(output_activation_max);
+ const auto activation_min = vdupq_n_f32(params.float_activation_min);
+ const auto activation_max = vdupq_n_f32(params.float_activation_max);
for (; i <= size - 16; i += 16) {
auto a10 = vld1q_f32(input1_data + i);
auto a11 = vld1q_f32(input1_data + i + 4);
@@ -2530,29 +2437,26 @@ inline void Add(const float* input1_data, const Dims<4>& input1_dims,
for (; i < size; i++) {
auto x = input1_data[i] + input2_data[i];
- output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
- output_activation_max);
+ output_data[i] = ActivationFunctionWithMinMax(
+ x, params.float_activation_min, params.float_activation_max);
}
}
// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
-inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
- int32 input1_offset, int32 input1_multiplier,
- int input1_shift, const uint8* input2_data,
- int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data) {
+inline void AddElementwise(int size, const ArithmeticParams& params,
+ const uint8* input1_data, const uint8* input2_data,
+ uint8* output_data) {
int i = 0;
- TFLITE_DCHECK_GT(input1_offset, -256);
- TFLITE_DCHECK_GT(input2_offset, -256);
- TFLITE_DCHECK_LT(input1_offset, 256);
- TFLITE_DCHECK_LT(input2_offset, 256);
+ TFLITE_DCHECK_GT(params.input1_offset, -256);
+ TFLITE_DCHECK_GT(params.input2_offset, -256);
+ TFLITE_DCHECK_LT(params.input1_offset, 256);
+ TFLITE_DCHECK_LT(params.input2_offset, 256);
#ifdef USE_NEON
- const auto output_activation_min_vector = vdup_n_u8(output_activation_min);
- const auto output_activation_max_vector = vdup_n_u8(output_activation_max);
+ const auto output_activation_min_vector =
+ vdup_n_u8(params.quantized_activation_min);
+ const auto output_activation_max_vector =
+ vdup_n_u8(params.quantized_activation_max);
for (; i <= size - 8; i += 8) {
const auto input1_val_original = vld1_u8(input1_data + i);
const auto input2_val_original = vld1_u8(input2_data + i);
@@ -2561,9 +2465,9 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
const auto input2_val_s16 =
vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
const auto input1_val =
- vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset));
+ vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
const auto input2_val =
- vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset));
+ vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
const auto input1_val_high = vget_high_s16(input1_val);
const auto input1_val_low = vget_low_s16(input1_val);
const auto input2_val_high = vget_high_s16(input2_val);
@@ -2572,32 +2476,32 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
auto x12 = vmovl_s16(input1_val_high);
auto x21 = vmovl_s16(input2_val_low);
auto x22 = vmovl_s16(input2_val_high);
- const auto left_shift_dup = vdupq_n_s32(left_shift);
+ const auto left_shift_dup = vdupq_n_s32(params.left_shift);
x11 = vshlq_s32(x11, left_shift_dup);
x12 = vshlq_s32(x12, left_shift_dup);
x21 = vshlq_s32(x21, left_shift_dup);
x22 = vshlq_s32(x22, left_shift_dup);
- x11 = vqrdmulhq_n_s32(x11, input1_multiplier);
- x12 = vqrdmulhq_n_s32(x12, input1_multiplier);
- x21 = vqrdmulhq_n_s32(x21, input2_multiplier);
- x22 = vqrdmulhq_n_s32(x22, input2_multiplier);
- const auto input1_shift_dup = vdupq_n_s32(-input1_shift);
- const auto input2_shift_dup = vdupq_n_s32(-input2_shift);
+ x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
+ x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
+ x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
+ x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
+ const auto input1_shift_dup = vdupq_n_s32(params.input1_shift);
+ const auto input2_shift_dup = vdupq_n_s32(params.input2_shift);
x11 = vshlq_s32(x11, input1_shift_dup);
x12 = vshlq_s32(x12, input1_shift_dup);
x21 = vshlq_s32(x21, input2_shift_dup);
x22 = vshlq_s32(x22, input2_shift_dup);
auto s1 = vaddq_s32(x11, x21);
auto s2 = vaddq_s32(x12, x22);
- s1 = vqrdmulhq_n_s32(s1, output_multiplier);
- s2 = vqrdmulhq_n_s32(s2, output_multiplier);
+ s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
+ s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
using gemmlowp::RoundingDivideByPOT;
- s1 = RoundingDivideByPOT(s1, output_shift);
- s2 = RoundingDivideByPOT(s2, output_shift);
+ s1 = RoundingDivideByPOT(s1, -params.output_shift);
+ s2 = RoundingDivideByPOT(s2, -params.output_shift);
const auto s1_narrowed = vmovn_s32(s1);
const auto s2_narrowed = vmovn_s32(s2);
const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
- vdupq_n_s16(output_offset));
+ vdupq_n_s16(params.output_offset));
const auto clamped =
vmax_u8(output_activation_min_vector,
vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
@@ -2606,101 +2510,74 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
#endif // NEON
for (; i < size; ++i) {
- const int32 input1_val = input1_offset + input1_data[i];
- const int32 input2_val = input2_offset + input2_data[i];
- const int32 shifted_input1_val = input1_val * (1 << left_shift);
- const int32 shifted_input2_val = input2_val * (1 << left_shift);
+ const int32 input1_val = params.input1_offset + input1_data[i];
+ const int32 input2_val = params.input2_offset + input2_data[i];
+ const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+ const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, input1_multiplier,
- kReverseShift * input1_shift);
+ shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32 scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, input2_multiplier,
- kReverseShift * input2_shift);
+ shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
- raw_sum, output_multiplier, kReverseShift * output_shift) +
- output_offset;
- const int32 clamped_output = std::min(
- output_activation_max, std::max(output_activation_min, raw_output));
+ raw_sum, params.output_multiplier, params.output_shift) +
+ params.output_offset;
+ const int32 clamped_output =
+ std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<uint8>(clamped_output);
}
}
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Add(const float* input1_data, const Dims<4>& input1_dims,
- const float* input2_data, const Dims<4>& input2_dims,
- float* output_data, const Dims<4>& output_dims) {
- float output_activation_min, output_activation_max;
- GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
- Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
- output_activation_max, output_data, output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-inline void Add(int left_shift, const uint8* input1_data,
- const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift,
- const uint8* input2_data, const Dims<4>& input2_dims,
- int32 input2_offset, int32 input2_multiplier, int input2_shift,
- int32 output_offset, int32 output_multiplier, int output_shift,
- int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const Dims<4>& output_dims) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- TFLITE_DCHECK_EQ(output_activation_min, 0);
- TFLITE_DCHECK_EQ(output_activation_max, 255);
- }
+inline void Add(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape, const uint8* input1_data,
+ const RuntimeShape& input2_shape, const uint8* input2_data,
+ const RuntimeShape& output_shape, uint8* output_data) {
+ TFLITE_DCHECK_LE(params.quantized_activation_min,
+ params.quantized_activation_max);
gemmlowp::ScopedProfilingLabel label("Add/8bit");
- const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
- TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
- TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
- TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-
- TFLITE_DCHECK_GT(input1_offset, -256);
- TFLITE_DCHECK_GT(input2_offset, -256);
- TFLITE_DCHECK_LT(input1_offset, 256);
- TFLITE_DCHECK_LT(input2_offset, 256);
- AddElementwise(flat_size, left_shift, input1_data, input1_offset,
- input1_multiplier, input1_shift, input2_data, input2_offset,
- input2_multiplier, input2_shift, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_data);
+ const int flat_size =
+ MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+ TFLITE_DCHECK_GT(params.input1_offset, -256);
+ TFLITE_DCHECK_GT(params.input2_offset, -256);
+ TFLITE_DCHECK_LT(params.input1_offset, 256);
+ TFLITE_DCHECK_LT(params.input2_offset, 256);
+ AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}
-inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
- int input1_shift, const int16* input2_data,
- const Dims<4>& input2_dims, int input2_shift,
- int16 output_activation_min, int16 output_activation_max,
- int16* output_data, const Dims<4>& output_dims) {
+inline void Add(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape, const int16* input1_data,
+ const RuntimeShape& input2_shape, const int16* input2_data,
+ const RuntimeShape& output_shape, int16* output_data) {
gemmlowp::ScopedProfilingLabel label("Add/Int16");
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ TFLITE_DCHECK_LE(params.quantized_activation_min,
+ params.quantized_activation_max);
- const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
+ const int input1_shift = params.input1_shift;
+ const int flat_size =
+ MatchingFlatSize(output_shape, input1_shape, input2_shape);
+ const int16 output_activation_min = params.quantized_activation_min;
+ const int16 output_activation_max = params.quantized_activation_max;
- TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0);
- TFLITE_DCHECK_GE(input1_shift, 0);
- TFLITE_DCHECK_GE(input2_shift, 0);
+ TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+ TFLITE_DCHECK_LE(input1_shift, 0);
+ TFLITE_DCHECK_LE(params.input2_shift, 0);
const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
- const int input_shift = input1_shift == 0 ? input2_shift : input1_shift;
+ const int input_right_shift =
+ input1_shift == 0 ? -params.input2_shift : -input1_shift;
for (int i = 0; i < flat_size; i++) {
// F0 uses 0 integer bits, range [-1, 1].
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
- F0 scaled_input =
- F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift));
+ F0 scaled_input = F0::FromRaw(
+ gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
const int16 raw_output = result.raw();
const int16 clamped_output = std::min(
@@ -2709,195 +2586,59 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
}
}
-inline void Add(const int32* input1_data, const Dims<4>& input1_dims,
- const int32* input2_data, const Dims<4>& input2_dims,
- int32 output_activation_min, int32 output_activation_max,
- int32* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Add/int32");
-
- const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
- for (int i = 0; i < flat_size; ++i) {
- output_data[i] = ActivationFunctionWithMinMax(
- input1_data[i] + input2_data[i], output_activation_min,
- output_activation_max);
- }
-}
-
-template <FusedActivationFunctionType Ac>
-inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
- int input1_shift, const int16* input2_data,
- const Dims<4>& input2_dims, int input2_shift,
- int16 output_activation_min, int16 output_activation_max,
- int16* output_data, const Dims<4>& output_dims) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- TFLITE_DCHECK_EQ(output_activation_min, -32768);
- TFLITE_DCHECK_EQ(output_activation_max, 32767);
- }
-
- Add(input1_data, input1_dims, input1_shift, input2_data, input2_dims,
- input2_shift, output_activation_min, output_activation_max, output_data,
- output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-void Add(const int32* input1_data, const Dims<4>& input1_dims,
- const int32* input2_data, const Dims<4>& input2_dims,
- int32* output_data, const Dims<4>& output_dims) {
+inline void Add(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape, const int32* input1_data,
+ const RuntimeShape& input2_shape, const int32* input2_data,
+ const RuntimeShape& output_shape, int32* output_data) {
gemmlowp::ScopedProfilingLabel label("Add/int32");
- TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
- auto input1_map = MapAsVector(input1_data, input1_dims);
- auto input2_map = MapAsVector(input2_data, input2_dims);
- auto output_map = MapAsVector(output_data, output_dims);
- if (AreSameDims(input1_dims, input2_dims)) {
+ auto input1_map = MapAsVector(input1_data, input1_shape);
+ auto input2_map = MapAsVector(input2_data, input2_shape);
+ auto output_map = MapAsVector(output_data, output_shape);
+ if (input1_shape == input2_shape) {
output_map.array() = input1_map.array() + input2_map.array();
- } else if (FlatSize(input2_dims) == 1) {
+ } else if (input2_shape.FlatSize() == 1) {
auto scalar = input2_data[0];
output_map.array() = input1_map.array() + scalar;
- } else if (FlatSize(input1_dims) == 1) {
+ } else if (input1_shape.FlatSize() == 1) {
auto scalar = input1_data[0];
output_map.array() = scalar + input2_map.array();
} else {
// Should not come here.
TFLITE_DCHECK(false);
}
+ output_map = output_map.cwiseMax(params.quantized_activation_min);
+ output_map = output_map.cwiseMin(params.quantized_activation_max);
}
-// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
-template <typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
- const T* input2_data, const Dims<4>& input2_dims,
- T output_activation_min, T output_activation_max,
- T* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
-
- NdArrayDesc<4> desc1;
- NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
- // In Tensorflow, the dimensions are canonically named (batch_number, row,
- // col, channel), with extents (batches, height, width, depth), with the
- // trailing dimension changing most rapidly (channels has the smallest stride,
- // typically 1 element).
- //
- // In generated C code, we store arrays with the dimensions reversed. The
- // first dimension has smallest stride.
- //
- // We name our variables by their Tensorflow convention, but generate C code
- // nesting loops such that the innermost loop has the smallest stride for the
- // best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- output_data[Offset(output_dims, c, x, y, b)] =
- ActivationFunctionWithMinMax(
- input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
- input2_data[SubscriptToIndex(desc2, c, x, y, b)],
- output_activation_min, output_activation_max);
- }
- }
- }
- }
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
- const T* input2_data, const Dims<4>& input2_dims,
- T* output_data, const Dims<4>& output_dims) {
- T output_activation_min, output_activation_max;
- GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
- BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
- output_activation_min, output_activation_max, output_data,
- output_dims);
-}
-
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
- const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift,
- const uint8* input2_data, const Dims<4>& input2_dims,
- int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastAddGeneric/8bit");
-
- NdArrayDesc<4> desc1;
- NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
- // In Tensorflow, the dimensions are canonically named (batch_number, row,
- // col, channel), with extents (batches, height, width, depth), with the
- // trailing dimension changing most rapidly (channels has the smallest stride,
- // typically 1 element).
- //
- // In generated C code, we store arrays with the dimensions reversed. The
- // first dimension has smallest stride.
- //
- // We name our variables by their Tensorflow convention, but generate C code
- // nesting loops such that the innermost loop has the smallest stride for the
- // best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- const int32 input1_val =
- input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
- const int32 input2_val =
- input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
- const int32 shifted_input1_val = input1_val * (1 << left_shift);
- const int32 shifted_input2_val = input2_val * (1 << left_shift);
- const int32 scaled_input1_val =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, input1_multiplier,
- kReverseShift * input1_shift);
- const int32 scaled_input2_val =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, input2_multiplier,
- kReverseShift * input2_shift);
- const int32 raw_sum = scaled_input1_val + scaled_input2_val;
- const int32 raw_output =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- raw_sum, output_multiplier, kReverseShift * output_shift) +
- output_offset;
- const int32 clamped_output =
- std::min(output_activation_max,
- std::max(output_activation_min, raw_output));
- output_data[Offset(output_dims, c, x, y, b)] =
- static_cast<uint8>(clamped_output);
- }
- }
- }
- }
-}
-
-inline void BroadcastAddFivefold(
- int y0, int y1, int y2, int y3, int y4, int left_shift,
- const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift, const uint8* input2_data,
- const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const Dims<4>& output_dims) {
+inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
+ const RuntimeShape& unswitched_input1_shape,
+ const uint8* unswitched_input1_data,
+ const RuntimeShape& unswitched_input2_shape,
+ const uint8* unswitched_input2_data,
+ const RuntimeShape& output_shape,
+ uint8* output_data) {
gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit");
+ ArithmeticParams switched_params = unswitched_params;
+ switched_params.input1_offset = unswitched_params.input2_offset;
+ switched_params.input1_multiplier = unswitched_params.input2_multiplier;
+ switched_params.input1_shift = unswitched_params.input2_shift;
+ switched_params.input2_offset = unswitched_params.input1_offset;
+ switched_params.input2_multiplier = unswitched_params.input1_multiplier;
+ switched_params.input2_shift = unswitched_params.input1_shift;
+
+ const bool use_unswitched =
+ unswitched_params.broadcast_category ==
+ tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+ const ArithmeticParams& params =
+ use_unswitched ? unswitched_params : switched_params;
+ const uint8* input1_data =
+ use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+ const uint8* input2_data =
+ use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
// Fivefold nested loops. The second input resets its position for each
// iteration of the second loop. The first input resets its position at the
// beginning of the fourth loop. The innermost loop is an elementwise add of
@@ -2905,82 +2646,29 @@ inline void BroadcastAddFivefold(
uint8* output_data_ptr = output_data;
const uint8* input1_data_ptr = input1_data;
const uint8* input2_data_reset = input2_data;
- for (int i4 = 0; i4 < y4; ++i4) {
+ int y0 = params.broadcast_shape[0];
+ int y1 = params.broadcast_shape[1];
+ int y2 = params.broadcast_shape[2];
+ int y3 = params.broadcast_shape[3];
+ int y4 = params.broadcast_shape[4];
+ for (int i0 = 0; i0 < y0; ++i0) {
const uint8* input2_data_ptr;
- for (int i3 = 0; i3 < y3; ++i3) {
+ for (int i1 = 0; i1 < y1; ++i1) {
input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {
- for (int i1 = 0; i1 < y1; ++i1) {
- AddElementwise(
- y0, left_shift, input1_data_ptr, input1_offset, input1_multiplier,
- input1_shift, input2_data_ptr, input2_offset, input2_multiplier,
- input2_shift, output_offset, output_multiplier, output_shift,
- output_activation_min, output_activation_max, output_data_ptr);
- input2_data_ptr += y0;
- output_data_ptr += y0;
+ for (int i3 = 0; i3 < y3; ++i3) {
+ AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+ output_data_ptr);
+ input2_data_ptr += y4;
+ output_data_ptr += y4;
}
- input1_data_ptr += y0;
+ input1_data_ptr += y4;
}
}
input2_data_reset = input2_data_ptr;
}
}
-template <FusedActivationFunctionType Ac>
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
- const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift,
- const uint8* input2_data, const Dims<4>& input2_dims,
- int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- TFLITE_DCHECK_EQ(output_activation_min, 0);
- TFLITE_DCHECK_EQ(output_activation_max, 255);
- }
- BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
- input1_multiplier, input1_shift, input2_data, input2_dims,
- input2_offset, input2_multiplier, input2_shift, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_data, output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-inline void BroadcastAddFivefold(
- int y0, int y1, int y2, int y3, int y4, int left_shift,
- const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift, const uint8* input2_data,
- const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const Dims<4>& output_dims) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- TFLITE_DCHECK_EQ(output_activation_min, 0);
- TFLITE_DCHECK_EQ(output_activation_max, 255);
- }
- BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims,
- input1_offset, input1_multiplier, input1_shift,
- input2_data, input2_dims, input2_offset,
- input2_multiplier, input2_shift, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_data, output_dims);
-}
-
inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
const float* input2_data, const Dims<4>& input2_dims,
float output_activation_min, float output_activation_max,
@@ -3305,122 +2993,78 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
}
// TODO(aselle): This is not actually optimized yet.
-inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
- const float* input2_data, const Dims<4>& input2_dims,
- float output_activation_min, float output_activation_max,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Sub");
- const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+inline void SubNonBroadcast(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape,
+ const float* input1_data,
+ const RuntimeShape& input2_shape,
+ const float* input2_data,
+ const RuntimeShape& output_shape,
+ float* output_data) {
+ gemmlowp::ScopedProfilingLabel label("SubNonBroadcast");
+ const int flat_size =
+ MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
- input1_data[i] - input2_data[i], output_activation_min,
- output_activation_max);
+ input1_data[i] - input2_data[i], params.float_activation_min,
+ params.float_activation_max);
}
}
-// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
-template <typename T>
-void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
- const T* input2_data, const Dims<4>& input2_dims,
- T output_activation_min, T output_activation_max,
- T* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastSub");
-
- NdArrayDesc<4> desc1;
- NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
- // In Tensorflow, the dimensions are canonically named (batch_number, row,
- // col, channel), with extents (batches, height, width, depth), with the
- // trailing dimension changing most rapidly (channels has the smallest stride,
- // typically 1 element).
- //
- // In generated C code, we store arrays with the dimensions reversed. The
- // first dimension has smallest stride.
- //
- // We name our variables by their Tensorflow convention, but generate C code
- // nesting loops such that the innermost loop has the smallest stride for the
- // best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- output_data[Offset(output_dims, c, x, y, b)] =
- ActivationFunctionWithMinMax(
- input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
- input2_data[SubscriptToIndex(desc2, c, x, y, b)],
- output_activation_min, output_activation_max);
- }
- }
- }
+inline void SubWithActivation(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape,
+ const int32* input1_data,
+ const RuntimeShape& input2_shape,
+ const int32* input2_data,
+ const RuntimeShape& output_shape,
+ int32* output_data) {
+ gemmlowp::ScopedProfilingLabel label("SubWithActivation/int32");
+ const int flat_size =
+ MatchingFlatSize(input1_shape, input2_shape, input2_shape);
+ for (int i = 0; i < flat_size; ++i) {
+ output_data[i] = ActivationFunctionWithMinMax(
+ input1_data[i] - input2_data[i], params.quantized_activation_min,
+ params.quantized_activation_max);
}
}
-inline void BroadcastSub(int left_shift, const uint8* input1_data,
- const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift,
- const uint8* input2_data, const Dims<4>& input2_dims,
- int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit");
+inline void SubWithActivation(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape,
+ const float* input1_data,
+ const RuntimeShape& input2_shape,
+ const float* input2_data,
+ const RuntimeShape& output_shape,
+ float* output_data) {
+ gemmlowp::ScopedProfilingLabel label("SubWithActivation/float");
+ const int flat_size =
+ MatchingFlatSize(input1_shape, input2_shape, input2_shape);
+ for (int i = 0; i < flat_size; ++i) {
+ output_data[i] = ActivationFunctionWithMinMax(
+ input1_data[i] - input2_data[i], params.float_activation_min,
+ params.float_activation_max);
+ }
+}
- NdArrayDesc<4> desc1;
- NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+template <typename T>
+void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
+ const T* input1_data, const RuntimeShape& input2_shape,
+ const T* input2_data, const RuntimeShape& output_shape,
+ T* output_data) {
+ gemmlowp::ScopedProfilingLabel label("Sub");
- // In Tensorflow, the dimensions are canonically named (batch_number, row,
- // col, channel), with extents (batches, height, width, depth), with the
- // trailing dimension changing most rapidly (channels has the smallest stride,
- // typically 1 element).
- //
- // In generated C code, we store arrays with the dimensions reversed. The
- // first dimension has smallest stride.
- //
- // We name our variables by their Tensorflow convention, but generate C code
- // nesting loops such that the innermost loop has the smallest stride for the
- // best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- const int32 input1_val =
- input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
- const int32 input2_val =
- input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
- const int32 shifted_input1_val = input1_val * (1 << left_shift);
- const int32 shifted_input2_val = input2_val * (1 << left_shift);
- const int32 scaled_input1_val =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, input1_multiplier,
- kReverseShift * input1_shift);
- const int32 scaled_input2_val =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, input2_multiplier,
- kReverseShift * input2_shift);
- const int32 raw_sub = scaled_input1_val - scaled_input2_val;
- const int32 raw_output =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- raw_sub, output_multiplier, kReverseShift * output_shift) +
- output_offset;
- const int32 clamped_output =
- std::min(output_activation_max,
- std::max(output_activation_min, raw_output));
- output_data[Offset(output_dims, c, x, y, b)] =
- static_cast<uint8>(clamped_output);
- }
- }
- }
+ auto input1_map = MapAsVector(input1_data, input1_shape);
+ auto input2_map = MapAsVector(input2_data, input2_shape);
+ auto output_map = MapAsVector(output_data, output_shape);
+ if (input1_shape == input2_shape) {
+ output_map.array() = input1_map.array() - input2_map.array();
+ } else if (input1_shape.FlatSize() == 1) {
+ auto scalar = input1_data[0];
+ output_map.array() = scalar - input2_map.array();
+ } else if (input2_shape.FlatSize() == 1) {
+ auto scalar = input2_data[0];
+ output_map.array() = input1_map.array() - scalar;
+ } else {
+ BroadcastSub4DSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data);
}
}
@@ -5863,63 +5507,6 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
}
template <typename T>
-void GenericBroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
- const T* input2_data, const Dims<4>& input2_dims,
- T* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("GenericBroadcastSub");
-
- NdArrayDesc<4> desc1;
- NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
- // In Tensorflow, the dimensions are canonically named (batch_number, row,
- // col, channel), with extents (batches, height, width, depth), with the
- // trailing dimension changing most rapidly (channels has the smallest stride,
- // typically 1 element).
- //
- // In generated C code, we store arrays with the dimensions reversed. The
- // first dimension has smallest stride.
- //
- // We name our variables by their Tensorflow convention, but generate C code
- // nesting loops such that the innermost loop has the smallest stride for the
- // best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- output_data[Offset(output_dims, c, x, y, b)] =
- input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
- input2_data[SubscriptToIndex(desc2, c, x, y, b)];
- }
- }
- }
- }
-}
-
-template <typename T>
-void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
- const Dims<4>& input2_dims, T* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Sub");
-
- auto input1_map = MapAsVector(input1_data, input1_dims);
- auto input2_map = MapAsVector(input2_data, input2_dims);
- auto output_map = MapAsVector(output_data, output_dims);
- if (AreSameDims(input1_dims, input2_dims)) {
- output_map.array() = input1_map.array() - input2_map.array();
- } else if (FlatSize(input1_dims) == 1) {
- auto scalar = input1_data[0];
- output_map.array() = scalar - input2_map.array();
- } else if (FlatSize(input2_dims) == 1) {
- auto scalar = input2_data[0];
- output_map.array() = input1_map.array() - scalar;
- } else {
- GenericBroadcastSub(input1_data, input1_dims, input2_data, input2_dims,
- output_data, output_dims);
- }
-}
-
-template <typename T>
void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
const T* input2_data, T* output_data,
const Dims<4>& output_dims) {