aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h')
-rw-r--r--tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h1229
1 files changed, 684 insertions, 545 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 16901a3e53..31a54c2b62 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -158,98 +158,6 @@ SaturatingRoundingMultiplyByPOTParam(
SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
}
-// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
-// BROADCASTING.
-//
-// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
-// rectangular array of numbers.
-//
-// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
-// However, as Dims<N> is to be deprecated, this class exists as an adaptor
-// to enable simple unoptimized implementations of element-wise broadcasting
-// operations.
-template <int N>
-struct NdArrayDesc {
- // The "extent" of each dimension. Indices along dimension d must be in the
- // half-open interval [0, extents[d]).
- int extents[N];
-
- // The number of *elements* (not bytes) between consecutive indices of each
- // dimension.
- int strides[N];
-};
-
-// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
-// ELEMENT-WISE BROADCASTING.
-//
-// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
-inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
- int i3) {
- TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
- TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
- TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
- TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
- return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
- i3 * desc.strides[3];
-}
-
-// Given the dimensions of the operands for an element-wise binary broadcast,
-// adjusts them so that they can be directly iterated over with simple loops.
-// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
-// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
-//
-// This function assumes that the two input shapes are compatible up to
-// broadcasting and the shorter one has already been prepended with 1s to be the
-// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
-// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
-// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
-// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
-//
-// When two shapes are compatible up to broadcasting, for each dimension d,
-// the input extents are either equal, or one of them is 1.
-//
-// This function performs the following for each dimension d:
-// - If the extents are equal, then do nothing since the loop that walks over
-// both of the input arrays is correct.
-// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
-// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
-// array0 to be referenced *at any index* in dimension d and still access the
-// same slice.
-template <int N>
-inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
- const Dims<N>& input1_dims,
- NdArrayDesc<N>* desc0_out,
- NdArrayDesc<N>* desc1_out) {
- TFLITE_DCHECK(desc0_out != nullptr);
- TFLITE_DCHECK(desc1_out != nullptr);
-
- // Copy dims to desc.
- for (int i = 0; i < N; ++i) {
- desc0_out->extents[i] = input0_dims.sizes[i];
- desc0_out->strides[i] = input0_dims.strides[i];
- desc1_out->extents[i] = input1_dims.sizes[i];
- desc1_out->strides[i] = input1_dims.strides[i];
- }
-
- // Walk over each dimension. If the extents are equal do nothing.
- // Otherwise, set the desc with extent 1 to have extent equal to the other and
- // stride 0.
- for (int i = 0; i < N; ++i) {
- const int extent0 = ArraySize(input0_dims, i);
- const int extent1 = ArraySize(input1_dims, i);
- if (extent0 != extent1) {
- if (extent0 == 1) {
- desc0_out->strides[i] = 0;
- desc0_out->extents[i] = extent1;
- } else {
- TFLITE_DCHECK_EQ(extent1, 1);
- desc1_out->strides[i] = 0;
- desc1_out->extents[i] = extent0;
- }
- }
- }
-}
-
inline void Conv(const float* input_data, const Dims<4>& input_dims,
const float* filter_data, const Dims<4>& filter_dims,
const float* bias_data, const Dims<4>& bias_dims,
@@ -951,6 +859,19 @@ inline void Relu6(const float* input_data, const RuntimeShape& input_shape,
}
}
+inline void ReluX(uint8 min_value, uint8 max_value, const uint8* input_data,
+ const RuntimeShape& input_shape, uint8* output_data,
+ const RuntimeShape& output_shape) {
+ gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i) {
+ const uint8 val = input_data[i];
+ const uint8 clamped =
+ val > max_value ? max_value : val < min_value ? min_value : val;
+ output_data[i] = clamped;
+ }
+}
+
template <FusedActivationFunctionType Ac>
void L2Normalization(const float* input_data, const RuntimeShape& input_shape,
float* output_data, const RuntimeShape& output_shape) {
@@ -1052,114 +973,108 @@ inline void L2Normalization(const uint8* input_data,
}
template <typename T>
-inline void Add(const T* input1_data, const Dims<4>& input1_dims,
- const T* input2_data, const Dims<4>& input2_dims,
- T output_activation_min, T output_activation_max,
- T* output_data, const Dims<4>& output_dims) {
- const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+inline void Add(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape, const T* input1_data,
+ const RuntimeShape& input2_shape, const T* input2_data,
+ const RuntimeShape& output_shape, T* output_data) {
+ const int flat_size =
+ MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
- input1_data[i] + input2_data[i], output_activation_min,
- output_activation_max);
+ input1_data[i] + input2_data[i], params.quantized_activation_min,
+ params.quantized_activation_max);
}
}
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Add(const float* input1_data, const Dims<4>& input1_dims,
- const float* input2_data, const Dims<4>& input2_dims,
- float* output_data, const Dims<4>& output_dims) {
- float output_activation_min, output_activation_max;
- GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
- Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
- output_activation_max, output_data, output_dims);
+inline void Add(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape, const float* input1_data,
+ const RuntimeShape& input2_shape, const float* input2_data,
+ const RuntimeShape& output_shape, float* output_data) {
+ const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < size; i++) {
+ auto x = input1_data[i] + input2_data[i];
+ output_data[i] = ActivationFunctionWithMinMax(
+ x, params.float_activation_min, params.float_activation_max);
+ }
}
-template <FusedActivationFunctionType Ac>
-inline void Add(int left_shift, const uint8* input1_data,
- const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift,
- const uint8* input2_data, const Dims<4>& input2_dims,
- int32 input2_offset, int32 input2_multiplier, int input2_shift,
- int32 output_offset, int32 output_multiplier, int output_shift,
- int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const Dims<4>& output_dims) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- TFLITE_DCHECK_EQ(output_activation_min, 0);
- TFLITE_DCHECK_EQ(output_activation_max, 255);
- }
- const int batches =
- MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
- const int height =
- MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
- const int width =
- MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
- const int depth =
- MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- for (int c = 0; c < depth; ++c) {
- const int32 input1_val =
- input1_offset + input1_data[Offset(input1_dims, c, x, y, b)];
- const int32 input2_val =
- input2_offset + input2_data[Offset(input2_dims, c, x, y, b)];
- const int32 shifted_input1_val = input1_val * (1 << left_shift);
- const int32 shifted_input2_val = input2_val * (1 << left_shift);
- const int32 scaled_input1_val =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, input1_multiplier,
- kReverseShift * input1_shift);
- const int32 scaled_input2_val =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, input2_multiplier,
- kReverseShift * input2_shift);
- const int32 raw_sum = scaled_input1_val + scaled_input2_val;
- const int32 raw_output =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- raw_sum, output_multiplier, kReverseShift * output_shift) +
- output_offset;
- const int32 clamped_output =
- std::min(output_activation_max,
- std::max(output_activation_min, raw_output));
- output_data[Offset(output_dims, c, x, y, b)] =
- static_cast<uint8>(clamped_output);
- }
- }
- }
+// Element-wise add that can often be used for inner loop of broadcast add as
+// well as the non-broadcast add.
+inline void AddElementwise(int size, const ArithmeticParams& params,
+ const uint8* input1_data, const uint8* input2_data,
+ uint8* output_data) {
+ TFLITE_DCHECK_GT(params.input1_offset, -256);
+ TFLITE_DCHECK_GT(params.input2_offset, -256);
+ TFLITE_DCHECK_LT(params.input1_offset, 256);
+ TFLITE_DCHECK_LT(params.input2_offset, 256);
+
+ for (int i = 0; i < size; ++i) {
+ const int32 input1_val = params.input1_offset + input1_data[i];
+ const int32 input2_val = params.input2_offset + input2_data[i];
+ const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+ const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
+ const int32 scaled_input1_val =
+ MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input1_val, params.input1_multiplier, params.input1_shift);
+ const int32 scaled_input2_val =
+ MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input2_val, params.input2_multiplier, params.input2_shift);
+ const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32 raw_output =
+ MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ raw_sum, params.output_multiplier, params.output_shift) +
+ params.output_offset;
+ const int32 clamped_output =
+ std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, raw_output));
+ output_data[i] = static_cast<uint8>(clamped_output);
}
}
-inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
- int input1_shift, const int16* input2_data,
- const Dims<4>& input2_dims, int input2_shift,
- int16 output_activation_min, int16 output_activation_max,
- int16* output_data, const Dims<4>& output_dims) {
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-
- const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
-
- TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0);
- TFLITE_DCHECK_GE(input1_shift, 0);
- TFLITE_DCHECK_GE(input2_shift, 0);
+inline void Add(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape, const uint8* input1_data,
+ const RuntimeShape& input2_shape, const uint8* input2_data,
+ const RuntimeShape& output_shape, uint8* output_data) {
+ TFLITE_DCHECK_LE(params.quantized_activation_min,
+ params.quantized_activation_max);
+ const int flat_size =
+ MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+ TFLITE_DCHECK_GT(params.input1_offset, -256);
+ TFLITE_DCHECK_GT(params.input2_offset, -256);
+ TFLITE_DCHECK_LT(params.input1_offset, 256);
+ TFLITE_DCHECK_LT(params.input2_offset, 256);
+ AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void Add(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape, const int16* input1_data,
+ const RuntimeShape& input2_shape, const int16* input2_data,
+ const RuntimeShape& output_shape, int16* output_data) {
+ TFLITE_DCHECK_LE(params.quantized_activation_min,
+ params.quantized_activation_max);
+
+ const int input1_shift = params.input1_shift;
+ const int flat_size =
+ MatchingFlatSize(output_shape, input1_shape, input2_shape);
+ const int16 output_activation_min = params.quantized_activation_min;
+ const int16 output_activation_max = params.quantized_activation_max;
+
+ TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+ TFLITE_DCHECK_LE(input1_shift, 0);
+ TFLITE_DCHECK_LE(params.input2_shift, 0);
const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
- const int input_shift = input1_shift == 0 ? input2_shift : input1_shift;
+ const int input_right_shift =
+ input1_shift == 0 ? -params.input2_shift : -input1_shift;
for (int i = 0; i < flat_size; i++) {
// F0 uses 0 integer bits, range [-1, 1].
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
- F0 scaled_input =
- F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift));
+ F0 scaled_input = F0::FromRaw(
+ gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
const int16 raw_output = result.raw();
const int16 clamped_output = std::min(
@@ -1168,42 +1083,28 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
}
}
-template <FusedActivationFunctionType Ac>
-inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
- int input1_shift, const int16* input2_data,
- const Dims<4>& input2_dims, int input2_shift,
- int16 output_activation_min, int16 output_activation_max,
- int16* output_data, const Dims<4>& output_dims) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- TFLITE_DCHECK_EQ(output_activation_min, -32768);
- TFLITE_DCHECK_EQ(output_activation_max, 32767);
- }
-
- Add(input1_data, input1_dims, input1_shift, input2_data, input2_dims,
- input2_shift, output_activation_min, output_activation_max, output_data,
- output_dims);
-}
-
// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
-template <typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
- const T* input2_data, const Dims<4>& input2_dims,
- T output_activation_min, T output_activation_max,
- T* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
-
+// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape,
+ const float* input1_data,
+ const RuntimeShape& input2_shape,
+ const float* input2_data,
+ const RuntimeShape& output_shape,
+ float* output_data) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/float");
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+ &desc2);
+ RuntimeShape extended_output_shape =
+ RuntimeShape::ExtendedShape(4, output_shape);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
@@ -1216,49 +1117,77 @@ void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- output_data[Offset(output_dims, c, x, y, b)] =
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+ output_data[Offset(extended_output_shape, b, y, x, c)] =
ActivationFunctionWithMinMax(
- input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
- input2_data[SubscriptToIndex(desc2, c, x, y, b)],
- output_activation_min, output_activation_max);
+ input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+ params.float_activation_min, params.float_activation_max);
}
}
}
}
}
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
- const T* input2_data, const Dims<4>& input2_dims,
- T* output_data, const Dims<4>& output_dims) {
- T output_activation_min, output_activation_max;
- GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape,
+ const int32* input1_data,
+ const RuntimeShape& input2_shape,
+ const int32* input2_data,
+ const RuntimeShape& output_shape,
+ int32* output_data) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int32");
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+ &desc2);
+ RuntimeShape extended_output_shape =
+ RuntimeShape::ExtendedShape(4, output_shape);
- BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
- output_activation_min, output_activation_max, output_data,
- output_dims);
+ // In Tensorflow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their Tensorflow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+ output_data[Offset(extended_output_shape, b, y, x, c)] =
+ ActivationFunctionWithMinMax(
+ input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+ params.quantized_activation_min,
+ params.quantized_activation_max);
+ }
+ }
+ }
+ }
}
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
- const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift,
- const uint8* input2_data, const Dims<4>& input2_dims,
- int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
-
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape,
+ const uint8* input1_data,
+ const RuntimeShape& input2_shape,
+ const uint8* input2_data,
+ const RuntimeShape& output_shape,
+ uint8* output_data) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/uint8");
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+ &desc2);
+ RuntimeShape extended_output_shape =
+ RuntimeShape::ExtendedShape(4, output_shape);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
@@ -1271,33 +1200,37 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
- input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+ params.input1_offset +
+ input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
- input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
- const int32 shifted_input1_val = input1_val * (1 << left_shift);
- const int32 shifted_input2_val = input2_val * (1 << left_shift);
+ params.input2_offset +
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+ const int32 shifted_input1_val =
+ input1_val * (1 << params.left_shift);
+ const int32 shifted_input2_val =
+ input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, input1_multiplier,
- kReverseShift * input1_shift);
+ shifted_input1_val, params.input1_multiplier,
+ params.input1_shift);
const int32 scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, input2_multiplier,
- kReverseShift * input2_shift);
+ shifted_input2_val, params.input2_multiplier,
+ params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
- raw_sum, output_multiplier, kReverseShift * output_shift) +
- output_offset;
+ raw_sum, params.output_multiplier, params.output_shift) +
+ params.output_offset;
const int32 clamped_output =
- std::min(output_activation_max,
- std::max(output_activation_min, raw_output));
- output_data[Offset(output_dims, c, x, y, b)] =
+ std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, raw_output));
+ output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<uint8>(clamped_output);
}
}
@@ -1305,121 +1238,67 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
}
}
-inline void BroadcastAddFivefold(
- int y0, int y1, int y2, int y3, int y4, int left_shift,
- const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift, const uint8* input2_data,
- const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit");
-
- int sb1 = y0;
- int sa2 = y0;
- int sb2 = y0 * y1;
- int sa3 = y0 * y2;
- int sa4 = y0 * y2 * y3;
- int sb4 = y0 * y1 * y2;
-
+inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
+ const RuntimeShape& unswitched_input1_shape,
+ const uint8* unswitched_input1_data,
+ const RuntimeShape& unswitched_input2_shape,
+ const uint8* unswitched_input2_data,
+ const RuntimeShape& output_shape,
+ uint8* output_data) {
+ ArithmeticParams switched_params = unswitched_params;
+ switched_params.input1_offset = unswitched_params.input2_offset;
+ switched_params.input1_multiplier = unswitched_params.input2_multiplier;
+ switched_params.input1_shift = unswitched_params.input2_shift;
+ switched_params.input2_offset = unswitched_params.input1_offset;
+ switched_params.input2_multiplier = unswitched_params.input1_multiplier;
+ switched_params.input2_shift = unswitched_params.input1_shift;
+
+ const bool use_unswitched =
+ unswitched_params.broadcast_category ==
+ tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+ const ArithmeticParams& params =
+ use_unswitched ? unswitched_params : switched_params;
+ const uint8* input1_data =
+ use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+ const uint8* input2_data =
+ use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+ // Fivefold nested loops. The second input resets its position for each
+ // iteration of the second loop. The first input resets its position at the
+ // beginning of the fourth loop. The innermost loop is an elementwise add of
+ // sections of the arrays.
uint8* output_data_ptr = output_data;
- for (int i4 = 0; i4 < y4; ++i4) {
- for (int i3 = 0; i3 < y3; ++i3) {
+ const uint8* input1_data_ptr = input1_data;
+ const uint8* input2_data_reset = input2_data;
+ int y0 = params.broadcast_shape[0];
+ int y1 = params.broadcast_shape[1];
+ int y2 = params.broadcast_shape[2];
+ int y3 = params.broadcast_shape[3];
+ int y4 = params.broadcast_shape[4];
+ for (int i0 = 0; i0 < y0; ++i0) {
+ const uint8* input2_data_ptr;
+ for (int i1 = 0; i1 < y1; ++i1) {
+ input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {
- for (int i1 = 0; i1 < y1; ++i1) {
- for (int i0 = 0; i0 < y0; ++i0) {
- const int32 input1_val =
- input1_offset +
- input1_data[i4 * sa4 + i3 * sa3 + i2 * sa2 + i0];
- const int32 input2_val =
- input2_offset +
- input2_data[i4 * sb4 + i2 * sb2 + i1 * sb1 + i0];
- const int32 shifted_input1_val = input1_val * (1 << left_shift);
- const int32 shifted_input2_val = input2_val * (1 << left_shift);
- const int32 scaled_input1_val =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, input1_multiplier,
- kReverseShift * input1_shift);
- const int32 scaled_input2_val =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, input2_multiplier,
- kReverseShift * input2_shift);
- const int32 raw_sum = scaled_input1_val + scaled_input2_val;
- const int32 raw_output =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(
- raw_sum, output_multiplier, kReverseShift * output_shift) +
- output_offset;
- const int32 clamped_output =
- std::min(output_activation_max,
- std::max(output_activation_min, raw_output));
- *output_data_ptr = static_cast<uint8>(clamped_output);
- ++output_data_ptr;
- }
+ for (int i3 = 0; i3 < y3; ++i3) {
+ AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+ output_data_ptr);
+ input2_data_ptr += y4;
+ output_data_ptr += y4;
}
+ input1_data_ptr += y4;
}
}
+ input2_data_reset = input2_data_ptr;
}
}
-template <FusedActivationFunctionType Ac>
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
- const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift,
- const uint8* input2_data, const Dims<4>& input2_dims,
- int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- TFLITE_DCHECK_EQ(output_activation_min, 0);
- TFLITE_DCHECK_EQ(output_activation_max, 255);
- }
- BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
- input1_multiplier, input1_shift, input2_data, input2_dims,
- input2_offset, input2_multiplier, input2_shift, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_data, output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-inline void BroadcastAddFivefold(
- int y0, int y1, int y2, int y3, int y4, int left_shift,
- const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift, const uint8* input2_data,
- const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const Dims<4>& output_dims) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- TFLITE_DCHECK_EQ(output_activation_min, 0);
- TFLITE_DCHECK_EQ(output_activation_max, 255);
- }
- BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims,
- input1_offset, input1_multiplier, input1_shift,
- input2_data, input2_dims, input2_offset,
- input2_multiplier, input2_shift, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_data, output_dims);
-}
-
-inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
- const float* input2_data, const Dims<4>& input2_dims,
- float output_activation_min, float output_activation_max,
- float* output_data, const Dims<4>& output_dims) {
+template <typename T>
+inline void Mul(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, const Dims<4>& input2_dims,
+ T output_activation_min, T output_activation_max,
+ T* output_data, const Dims<4>& output_dims) {
const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
@@ -1640,10 +1519,11 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
}
}
-inline void Div(const float* input1_data, const Dims<4>& input1_dims,
- const float* input2_data, const Dims<4>& input2_dims,
- float output_activation_min, float output_activation_max,
- float* output_data, const Dims<4>& output_dims) {
+template <typename T>
+inline void Div(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, const Dims<4>& input2_dims,
+ T output_activation_min, T output_activation_max,
+ T* output_data, const Dims<4>& output_dims) {
const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
@@ -1652,15 +1532,35 @@ inline void Div(const float* input1_data, const Dims<4>& input1_dims,
}
}
-inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
- const float* input2_data, const Dims<4>& input2_dims,
- float output_activation_min, float output_activation_max,
- float* output_data, const Dims<4>& output_dims) {
- const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+inline void SubNonBroadcast(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape,
+ const float* input1_data,
+ const RuntimeShape& input2_shape,
+ const float* input2_data,
+ const RuntimeShape& output_shape,
+ float* output_data) {
+ const int flat_size =
+ MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
- input1_data[i] - input2_data[i], output_activation_min,
- output_activation_max);
+ input1_data[i] - input2_data[i], params.float_activation_min,
+ params.float_activation_max);
+ }
+}
+
+// Element-wise int32 subtraction without broadcasting.
+//
+// For every flat index i, stores input1_data[i] - input2_data[i] into
+// output_data[i] after passing it through ActivationFunctionWithMinMax with
+// params.quantized_activation_min / params.quantized_activation_max.
+// The number of elements processed is the value returned by MatchingFlatSize
+// for the two input shapes and the output shape.
+inline void SubNonBroadcast(const ArithmeticParams& params,
+                            const RuntimeShape& input1_shape,
+                            const int32* input1_data,
+                            const RuntimeShape& input2_shape,
+                            const int32* input2_data,
+                            const RuntimeShape& output_shape,
+                            int32* output_data) {
+  const int num_elements =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int idx = 0; idx < num_elements; ++idx) {
+    const int32 difference = input1_data[idx] - input2_data[idx];
+    output_data[idx] = ActivationFunctionWithMinMax(
+        difference, params.quantized_activation_min,
+        params.quantized_activation_max);
}
}
@@ -1668,16 +1568,24 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
-template <typename T>
-void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
- const T* input2_data, const Dims<4>& input2_dims,
- T output_activation_min, T output_activation_max,
- T* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastSub");
-
+// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+inline void BroadcastSub4DSlow(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape,
+ const float* input1_data,
+ const RuntimeShape& input2_shape,
+ const float* input2_data,
+ const RuntimeShape& output_shape,
+ float* output_data) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/float");
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+ &desc2);
+ RuntimeShape extended_output_shape =
+ RuntimeShape::ExtendedShape(4, output_shape);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
@@ -1690,36 +1598,35 @@ void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- output_data[Offset(output_dims, c, x, y, b)] =
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+ output_data[Offset(extended_output_shape, b, y, x, c)] =
ActivationFunctionWithMinMax(
- input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
- input2_data[SubscriptToIndex(desc2, c, x, y, b)],
- output_activation_min, output_activation_max);
+ input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+ params.float_activation_min, params.float_activation_max);
}
}
}
}
}
-inline void BroadcastSub(int left_shift, const uint8* input1_data,
- const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift,
- const uint8* input2_data, const Dims<4>& input2_dims,
- int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit");
-
+inline void BroadcastSub4DSlow(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape,
+ const uint8* input1_data,
+ const RuntimeShape& input2_shape,
+ const uint8* input2_data,
+ const RuntimeShape& output_shape,
+ uint8* output_data) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/uint8");
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+ &desc2);
+ RuntimeShape extended_output_shape =
+ RuntimeShape::ExtendedShape(4, output_shape);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
@@ -1732,33 +1639,37 @@ inline void BroadcastSub(int left_shift, const uint8* input1_data,
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
- input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+ params.input1_offset +
+ input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
- input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
- const int32 shifted_input1_val = input1_val * (1 << left_shift);
- const int32 shifted_input2_val = input2_val * (1 << left_shift);
+ params.input2_offset +
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+ const int32 shifted_input1_val =
+ input1_val * (1 << params.left_shift);
+ const int32 shifted_input2_val =
+ input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, input1_multiplier,
- kReverseShift * input1_shift);
+ shifted_input1_val, params.input1_multiplier,
+ params.input1_shift);
const int32 scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, input2_multiplier,
- kReverseShift * input2_shift);
+ shifted_input2_val, params.input2_multiplier,
+ params.input2_shift);
const int32 raw_sub = scaled_input1_val - scaled_input2_val;
const int32 raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
- raw_sub, output_multiplier, kReverseShift * output_shift) +
- output_offset;
+ raw_sub, params.output_multiplier, params.output_shift) +
+ params.output_offset;
const int32 clamped_output =
- std::min(output_activation_max,
- std::max(output_activation_min, raw_output));
- output_data[Offset(output_dims, c, x, y, b)] =
+ std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, raw_output));
+ output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<uint8>(clamped_output);
}
}
@@ -1766,6 +1677,156 @@ inline void BroadcastSub(int left_shift, const uint8* input1_data,
}
}
+// Element-wise int32 subtraction (input1 - input2) with broadcasting over up
+// to four dimensions, followed by an activation clamp.
+//
+// params:        supplies quantized_activation_min/max, the bounds handed to
+//                ActivationFunctionWithMinMax for every output element.
+// input1_shape,
+// input2_shape:  operand shapes; passed to NdArrayDescsForElementwiseBroadcast
+//                to build the descriptors used to index each input.
+// input1_data,
+// input2_data:   operand buffers, read through SubscriptToIndex.
+// output_shape:  result shape; extended to four dimensions before it drives
+//                the loop bounds and the output offsets.
+// output_data:   destination buffer, written through Offset.
+inline void BroadcastSub4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int32* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int32* input2_data,
+                               const RuntimeShape& output_shape,
+                               int32* output_data) {
+  // Profiling label for this kernel; it must name Sub, not Add.
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/int32");
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // Index variables follow the Tensorflow naming convention: b = batch,
+  // y = row, x = column, c = channel. The channel index is the innermost
+  // loop and the batch index the outermost one.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
+                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+                  params.quantized_activation_min,
+                  params.quantized_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// Generic element-wise subtraction (input1 - input2) with broadcasting over up
+// to four dimensions, followed by an activation clamp.
+//
+// params:        supplies quantized_activation_min/max, the bounds handed to
+//                ActivationFunctionWithMinMax for every output element.
+//                NOTE(review): the quantized_* bounds are used for every T;
+//                confirm that is intended for instantiations other than int32.
+// input1_shape,
+// input2_shape:  operand shapes; passed to NdArrayDescsForElementwiseBroadcast
+//                to build the descriptors used to index each input.
+// input1_data,
+// input2_data:   operand buffers, read through SubscriptToIndex.
+// output_shape:  result shape; extended to four dimensions before it drives
+//                the loop bounds and the output offsets.
+// output_data:   destination buffer, written through Offset.
+template <typename T>
+void BroadcastSub4DSlow(const ArithmeticParams& params,
+                        const RuntimeShape& input1_shape, const T* input1_data,
+                        const RuntimeShape& input2_shape, const T* input2_data,
+                        const RuntimeShape& output_shape, T* output_data) {
+  // Profiling label for this kernel; it must name Sub, not Add.
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/templated");
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // Index variables follow the Tensorflow naming convention: b = batch,
+  // y = row, x = column, c = channel. The channel index is the innermost
+  // loop and the batch index the outermost one.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
+                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+                  params.quantized_activation_min,
+                  params.quantized_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// Broadcasting element-wise subtraction: each output element is the
+// difference of the corresponding (broadcast) elements of input1 and input2.
+// No activation clamp is applied, and `params` is accepted but never read.
+//
+// The output shape is extended to four dimensions; the two inputs are indexed
+// through the descriptors filled in by NdArrayDescsForElementwiseBroadcast.
+template <typename T>
+void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
+         const T* input1_data, const RuntimeShape& input2_shape,
+         const T* input2_data, const RuntimeShape& output_shape,
+         T* output_data) {
+  NdArrayDesc<4> lhs_desc;
+  NdArrayDesc<4> rhs_desc;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &lhs_desc,
+                                      &rhs_desc);
+  RuntimeShape out_shape_4d = RuntimeShape::ExtendedShape(4, output_shape);
+
+  // Loop variables use the Tensorflow naming convention (batch, row, column,
+  // channel); the channel loop is innermost.
+  for (int batch = 0; batch < out_shape_4d.Dims(0); ++batch) {
+    for (int row = 0; row < out_shape_4d.Dims(1); ++row) {
+      for (int col = 0; col < out_shape_4d.Dims(2); ++col) {
+        for (int chan = 0; chan < out_shape_4d.Dims(3); ++chan) {
+          const int out_index = Offset(out_shape_4d, batch, row, col, chan);
+          const T lhs =
+              input1_data[SubscriptToIndex(lhs_desc, batch, row, col, chan)];
+          const T rhs =
+              input2_data[SubscriptToIndex(rhs_desc, batch, row, col, chan)];
+          output_data[out_index] = lhs - rhs;
+        }
+      }
+    }
+  }
+}
+
+// Element-wise int32 subtraction without broadcasting, followed by a clamp.
+//
+// For every flat index i, stores input1_data[i] - input2_data[i] into
+// output_data[i] after passing it through ActivationFunctionWithMinMax with
+// params.quantized_activation_min / params.quantized_activation_max.
+//
+// All three shapes are handed to MatchingFlatSize, so the output shape is
+// checked against the inputs as well; its return value is the element count.
+inline void SubWithActivation(const ArithmeticParams& params,
+                              const RuntimeShape& input1_shape,
+                              const int32* input1_data,
+                              const RuntimeShape& input2_shape,
+                              const int32* input2_data,
+                              const RuntimeShape& output_shape,
+                              int32* output_data) {
+  // The third argument must be output_shape; passing input2_shape twice would
+  // leave the output buffer's shape unchecked.
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], params.quantized_activation_min,
+        params.quantized_activation_max);
+  }
+}
+
+// Element-wise float subtraction without broadcasting, followed by a clamp.
+//
+// For every flat index i, stores input1_data[i] - input2_data[i] into
+// output_data[i] after passing it through ActivationFunctionWithMinMax with
+// params.float_activation_min / params.float_activation_max.
+//
+// All three shapes are handed to MatchingFlatSize, so the output shape is
+// checked against the inputs as well; its return value is the element count.
+inline void SubWithActivation(const ArithmeticParams& params,
+                              const RuntimeShape& input1_shape,
+                              const float* input1_data,
+                              const RuntimeShape& input2_shape,
+                              const float* input2_data,
+                              const RuntimeShape& output_shape,
+                              float* output_data) {
+  // The third argument must be output_shape; passing input2_shape twice would
+  // leave the output buffer's shape unchecked.
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], params.float_activation_min,
+        params.float_activation_max);
+  }
+}
+
template <FusedActivationFunctionType Ac, typename Scalar>
void Concatenation(int concat_dim, const Scalar* const* input_data,
const Dims<4>* const* input_dims, int inputs_count,
@@ -1799,6 +1860,26 @@ void Concatenation(int concat_dim, const Scalar* const* input_data,
}
}
+// Copies `inputs_count` input buffers into `output_data`, interleaving them in
+// fixed-size slices.
+//
+// The number of slices per input is the product of output_dims.sizes[i] for
+// i in (dim, 4). The slice length is the flat size of the first input divided
+// by that product, and the same length is used for every input. For slice k,
+// the k-th slice of each input is copied in input order, so the output holds
+// slice 0 of all inputs, then slice 1 of all inputs, and so on.
+//
+// Debug builds check that `output_dims` is packed without strides.
+template <typename Scalar>
+void Pack(int dim, const Scalar* const* input_data,
+          const Dims<4>* const* input_dims, int inputs_count,
+          Scalar* output_data, const Dims<4>& output_dims) {
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  int slice_count = 1;
+  for (int d = dim + 1; d < 4; ++d) {
+    slice_count *= output_dims.sizes[d];
+  }
+  const int slice_len = FlatSize(*input_dims[0]) / slice_count;
+  for (int slice = 0; slice < slice_count; ++slice) {
+    for (int input = 0; input < inputs_count; ++input) {
+      // Destination slots are laid out consecutively in (slice, input) order.
+      Scalar* const dst =
+          output_data + (slice * inputs_count + input) * slice_len;
+      const Scalar* const src = input_data[input] + slice * slice_len;
+      memcpy(dst, src, slice_len * sizeof(Scalar));
+    }
+  }
+}
+
// TODO(prabhumk): This is the same as the optimized implementation.
// TODO(prabhumk): The quantized implementation of concatenation isn't fully
// quantized as it takes scale as a floating point value. This should be fixed
@@ -2260,13 +2341,10 @@ inline int NodeOffset(int b, int h, int w, int height, int width) {
return (b * height + h) * width + w;
}
-inline void AveragePool(const float* input_data,
- const RuntimeShape& input_shape, int stride_width,
- int stride_height, int pad_width, int pad_height,
- int filter_width, int filter_height,
- float output_activation_min,
- float output_activation_max, float* output_data,
- const RuntimeShape& output_shape) {
+inline void AveragePool(const PoolParams& params,
+ const RuntimeShape& input_shape,
+ const float* input_data,
+ const RuntimeShape& output_shape, float* output_data) {
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
@@ -2275,20 +2353,24 @@ inline void AveragePool(const float* input_data,
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
- const int in_x_origin = (out_x * stride_width) - pad_width;
- const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int in_x_origin =
+ (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin =
+ (out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
- std::min(filter_width, input_width - in_x_origin);
+ std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
- std::min(filter_height, input_height - in_y_origin);
+ std::min(params.filter_height, input_height - in_y_origin);
float total = 0.f;
float filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
@@ -2304,22 +2386,20 @@ inline void AveragePool(const float* input_data,
}
const float average = total / filter_count;
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
- ActivationFunctionWithMinMax(average, output_activation_min,
- output_activation_max);
+ ActivationFunctionWithMinMax(average, params.float_activation_min,
+ params.float_activation_max);
}
}
}
}
}
-inline void AveragePool(const uint8* input_data,
- const RuntimeShape& input_shape, int stride_width,
- int stride_height, int pad_width, int pad_height,
- int filter_width, int filter_height,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const RuntimeShape& output_shape) {
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+inline void AveragePool(const PoolParams& params,
+ const RuntimeShape& input_shape,
+ const uint8* input_data,
+ const RuntimeShape& output_shape, uint8* output_data) {
+ TFLITE_DCHECK_LE(params.quantized_activation_min,
+ params.quantized_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
@@ -2328,20 +2408,24 @@ inline void AveragePool(const uint8* input_data,
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
- const int in_x_origin = (out_x * stride_width) - pad_width;
- const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int in_x_origin =
+ (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin =
+ (out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
- std::min(filter_width, input_width - in_x_origin);
+ std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
- std::min(filter_height, input_height - in_y_origin);
+ std::min(params.filter_height, input_height - in_y_origin);
int32 acc = 0;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
@@ -2356,8 +2440,8 @@ inline void AveragePool(const uint8* input_data,
}
}
acc = (acc + filter_count / 2) / filter_count;
- acc = std::max(acc, output_activation_min);
- acc = std::min(acc, output_activation_max);
+ acc = std::max(acc, params.quantized_activation_min);
+ acc = std::min(acc, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<uint8>(acc);
}
@@ -2366,11 +2450,9 @@ inline void AveragePool(const uint8* input_data,
}
}
-inline void L2Pool(const float* input_data, const RuntimeShape& input_shape,
- int stride_width, int stride_height, int pad_width,
- int pad_height, int filter_width, int filter_height,
- float output_activation_min, float output_activation_max,
- float* output_data, const RuntimeShape& output_shape) {
+inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
+ const float* input_data, const RuntimeShape& output_shape,
+ float* output_data) {
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
@@ -2379,20 +2461,24 @@ inline void L2Pool(const float* input_data, const RuntimeShape& input_shape,
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
- const int in_x_origin = (out_x * stride_width) - pad_width;
- const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int in_x_origin =
+ (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin =
+ (out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
- std::min(filter_width, input_width - in_x_origin);
+ std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
- std::min(filter_height, input_height - in_y_origin);
+ std::min(params.filter_height, input_height - in_y_origin);
float sum_squares = 0.f;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
@@ -2409,19 +2495,18 @@ inline void L2Pool(const float* input_data, const RuntimeShape& input_shape,
}
const float l2pool_result = std::sqrt(sum_squares / filter_count);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
- ActivationFunctionWithMinMax(l2pool_result, output_activation_min,
- output_activation_max);
+ ActivationFunctionWithMinMax(l2pool_result,
+ params.float_activation_min,
+ params.float_activation_max);
}
}
}
}
}
-inline void MaxPool(const float* input_data, const RuntimeShape& input_shape,
- int stride_width, int stride_height, int pad_width,
- int pad_height, int filter_width, int filter_height,
- float output_activation_min, float output_activation_max,
- float* output_data, const RuntimeShape& output_shape) {
+inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
+ const float* input_data, const RuntimeShape& output_shape,
+ float* output_data) {
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
@@ -2430,20 +2515,24 @@ inline void MaxPool(const float* input_data, const RuntimeShape& input_shape,
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
- const int in_x_origin = (out_x * stride_width) - pad_width;
- const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int in_x_origin =
+ (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin =
+ (out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
- std::min(filter_width, input_width - in_x_origin);
+ std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
- std::min(filter_height, input_height - in_y_origin);
+ std::min(params.filter_height, input_height - in_y_origin);
float max = std::numeric_limits<float>::lowest();
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
@@ -2457,22 +2546,21 @@ inline void MaxPool(const float* input_data, const RuntimeShape& input_shape,
}
}
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
- ActivationFunctionWithMinMax(max, output_activation_min,
- output_activation_max);
+ ActivationFunctionWithMinMax(max, params.float_activation_min,
+ params.float_activation_max);
}
}
}
}
}
-inline void MaxPool(const uint8* input_data, const RuntimeShape& input_shape,
- int stride_width, int stride_height, int pad_width,
- int pad_height, int filter_width, int filter_height,
- int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const RuntimeShape& output_shape) {
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- TFLITE_DCHECK_GE(output_activation_min, 0);
- TFLITE_DCHECK_LE(output_activation_max, 255);
+inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
+ const uint8* input_data, const RuntimeShape& output_shape,
+ uint8* output_data) {
+ TFLITE_DCHECK_LE(params.quantized_activation_min,
+ params.quantized_activation_max);
+ TFLITE_DCHECK_GE(params.quantized_activation_min, 0);
+ TFLITE_DCHECK_LE(params.quantized_activation_max, 255);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
@@ -2481,20 +2569,24 @@ inline void MaxPool(const uint8* input_data, const RuntimeShape& input_shape,
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
- const int in_x_origin = (out_x * stride_width) - pad_width;
- const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int in_x_origin =
+ (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin =
+ (out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
- std::min(filter_width, input_width - in_x_origin);
+ std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
- std::min(filter_height, input_height - in_y_origin);
+ std::min(params.filter_height, input_height - in_y_origin);
uint8 max = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
@@ -2507,8 +2599,8 @@ inline void MaxPool(const uint8* input_data, const RuntimeShape& input_shape,
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
}
}
- max = std::max<uint8>(max, output_activation_min);
- max = std::min<uint8>(max, output_activation_max);
+ max = std::max<uint8>(max, params.quantized_activation_min);
+ max = std::min<uint8>(max, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<uint8>(max);
}
@@ -3442,9 +3534,9 @@ inline bool Reduce(const In* input_data, const int* input_dims,
const int* output_dims, const int input_num_dims,
const int output_num_dims, const int* axis,
const int num_axis, int* input_iter,
- Out reducer(Out current, const In in), Out* output_data) {
+ Out reducer(const Out current, const In in),
+ Out* output_data) {
// Reset input iterator.
- TFLITE_DCHECK(input_num_dims > 0);
for (int idx = 0; idx < input_num_dims; ++idx) {
input_iter[idx] = 0;
}
@@ -3460,11 +3552,16 @@ inline bool Reduce(const In* input_data, const int* input_dims,
return true;
}
-inline bool ResolveAxis(const int num_dims, const int* axis, const int num_axis,
- int* out_axis, int* out_num_axis) {
+inline bool ResolveAxis(const int num_dims, const int* axis,
+ const int64_t num_axis, int* out_axis,
+ int* out_num_axis) {
*out_num_axis = 0; // Just in case.
+ // Short-circuit axis resolution for scalars; the axis will go unused.
+ if (num_dims == 0) {
+ return true;
+ }
// o(n^2) is fine since out_num_axis should be really small, mostly <= 4
- for (int idx = 0; idx < num_axis; ++idx) {
+ for (int64_t idx = 0; idx < num_axis; ++idx) {
// Handle negative index.
int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
TFLITE_DCHECK(current >= 0 && current < num_dims);
@@ -3490,7 +3587,7 @@ inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
const int output_num_dims, const int* axis,
const int num_axis, int* input_iter,
Out* output_data) {
- auto reducer = [](Out current, const In in) -> Out {
+ auto reducer = [](const Out current, const In in) -> Out {
const Out actual_in = static_cast<Out>(in);
return current + actual_in;
};
@@ -3499,6 +3596,24 @@ inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
output_data);
}
+// Fills a tensor's buffer with `init_value`.
+//
+// dims, num_dims: extents of the tensor. Their product is the number of
+//                 elements written; with num_dims == 0 exactly one element is
+//                 written.
+// init_value:     value stored into every element.
+// data:           destination buffer; it must have room for the product of
+//                 the extents.
+//
+// Returns false, without writing anything, if the element count would
+// overflow size_t. A zero extent means the tensor is empty: nothing is
+// written and true is returned. Returns true in all other cases.
+template <typename T>
+inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
+                                    const T init_value, T* data) {
+  size_t num_elements = 1;
+  for (int idx = 0; idx < num_dims; ++idx) {
+    const size_t current = static_cast<size_t>(dims[idx]);
+    // A zero extent makes the tensor empty. Stop here: the overflow check
+    // below divides by `current` and must never see zero.
+    if (current == 0) {
+      return true;
+    }
+    // Overflow prevention.
+    if (num_elements > std::numeric_limits<size_t>::max() / current) {
+      return false;
+    }
+    num_elements *= current;
+  }
+  for (size_t idx = 0; idx < num_elements; ++idx) {
+    data[idx] = init_value;
+  }
+  return true;
+}
+
// Computes the sum of elements across dimensions given in axis.
template <typename T>
inline bool Sum(const T* input_data, const int* input_dims,
@@ -3507,17 +3622,9 @@ inline bool Sum(const T* input_data, const int* input_dims,
const int* axis, const int num_axis_dimensions, bool keep_dims,
int* temp_index, int* resolved_axis) {
// Reset output data.
- size_t num_outputs = 1;
- for (int idx = 0; idx < output_num_dims; ++idx) {
- size_t current = static_cast<size_t>(output_dims[idx]);
- // Overflow prevention.
- if (num_outputs > std::numeric_limits<size_t>::max() / current) {
- return false;
- }
- num_outputs *= current;
- }
- for (size_t idx = 0; idx < num_outputs; ++idx) {
- output_data[idx] = T();
+ if (!InitTensorDataForReduce(output_dims, output_num_dims, static_cast<T>(0),
+ output_data)) {
+ return false;
}
// Resolve axis.
@@ -3532,6 +3639,61 @@ inline bool Sum(const T* input_data, const int* input_dims,
num_resolved_axis, temp_index, output_data);
}
+// Max-reduction: for each output element, keeps the largest input element
+// found along the dimensions listed in `axis`.
+//
+// The output buffer is first filled with std::numeric_limits<T>::lowest(),
+// the axis list is resolved through ResolveAxis into `resolved_axis`, and the
+// actual traversal is delegated to Reduce. Returns false if the output
+// initialization or the axis resolution fails; otherwise returns whatever
+// Reduce returns. `keep_dims` is not read by this function.
+template <typename T>
+inline bool ReduceMax(const T* input_data, const int* input_dims,
+                      const int input_num_dims, T* output_data,
+                      const int* output_dims, const int output_num_dims,
+                      const int* axis, const int64_t num_axis_dimensions,
+                      bool keep_dims, int* temp_index, int* resolved_axis) {
+  // Reset output data to the lowest value representable by T.
+  const bool output_seeded = InitTensorDataForReduce(
+      output_dims, output_num_dims, std::numeric_limits<T>::lowest(),
+      output_data);
+  if (!output_seeded) {
+    return false;
+  }
+
+  // Resolve axis.
+  int resolved_axis_count = 0;
+  const bool axis_resolved =
+      ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
+                  &resolved_axis_count);
+  if (!axis_resolved) {
+    return false;
+  }
+
+  // Keeps the accumulated value unless the incoming one is strictly greater.
+  auto keep_larger = [](const T current, const T in) -> T {
+    if (in > current) {
+      return in;
+    }
+    return current;
+  };
+  return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
+                      output_num_dims, resolved_axis, resolved_axis_count,
+                      temp_index, keep_larger, output_data);
+}
+
+// Product-reduction: multiplies together the input elements found along the
+// dimensions listed in `axis`.
+//
+// The output buffer is first filled with static_cast<T>(1), the axis list is
+// resolved through ResolveAxis into `resolved_axis`, and the actual traversal
+// is delegated to Reduce. Returns false if the output initialization or the
+// axis resolution fails; otherwise returns whatever Reduce returns.
+// `keep_dims` is not read by this function.
+template <typename T>
+inline bool ReduceProd(const T* input_data, const int* input_dims,
+                       const int input_num_dims, T* output_data,
+                       const int* output_dims, const int output_num_dims,
+                       const int* axis, const int64_t num_axis_dimensions,
+                       bool keep_dims, int* temp_index, int* resolved_axis) {
+  // Reset output data to the multiplicative identity.
+  const bool output_seeded = InitTensorDataForReduce(
+      output_dims, output_num_dims, static_cast<T>(1), output_data);
+  if (!output_seeded) {
+    return false;
+  }
+
+  // Resolve axis.
+  int resolved_axis_count = 0;
+  const bool axis_resolved =
+      ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
+                  &resolved_axis_count);
+  if (!axis_resolved) {
+    return false;
+  }
+
+  // Folds the incoming element into the running product.
+  auto multiply = [](const T current, const T in) -> T {
+    const T product = in * current;
+    return product;
+  };
+  return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
+                      output_num_dims, resolved_axis, resolved_axis_count,
+                      temp_index, multiply, output_data);
+}
+
// Computes the mean of elements across dimensions given in axis.
// It does so in two stages, first calculates the sum of elements along the axis
// then divides it by the number of element in axis.
@@ -3624,38 +3786,6 @@ inline void Mean(const T* input_data, const Dims<4>& input_dims,
}
template <typename T>
-void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
- const Dims<4>& input2_dims, T* output_data,
- const Dims<4>& output_dims) {
- NdArrayDesc<4> desc1;
- NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
- // In Tensorflow, the dimensions are canonically named (batch_number, row,
- // col, channel), with extents (batches, height, width, depth), with the
- // trailing dimension changing most rapidly (channels has the smallest stride,
- // typically 1 element).
- //
- // In generated C code, we store arrays with the dimensions reversed. The
- // first dimension has smallest stride.
- //
- // We name our variables by their Tensorflow convention, but generate C code
- // nesting loops such that the innermost loop has the smallest stride for the
- // best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- output_data[Offset(output_dims, c, x, y, b)] =
- input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
- input2_data[SubscriptToIndex(desc2, c, x, y, b)];
- }
- }
- }
- }
-}
-
-template <typename T>
void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
const T* input2_data, T* output_data,
const Dims<4>& output_dims) {
@@ -3704,9 +3834,9 @@ void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims,
}
}
-template <typename T1, typename T2, typename T3>
-void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
- T2* output_data, const Dims<4>& output_dims) {
+template <typename T1, typename T2, typename T3, typename Cmp>
+void ArgMinMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
+ T2* output_data, const Dims<4>& output_dims, const Cmp& cmp) {
// The current ArgMax implementation can only determine the index of the maximum
// value in the last dimension. So the axis argument is ignored.
@@ -3719,19 +3849,28 @@ void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
const int depth = ArraySize(input_dims, 0);
for (int i = 0; i < outer_size; ++i) {
- auto max_value = input_data[i * depth];
- int max_index = 0;
+ auto min_max_value = input_data[i * depth];
+ int min_max_index = 0;
for (int d = 1; d < depth; ++d) {
const auto& curr_value = input_data[i * depth + d];
- if (curr_value > max_value) {
- max_value = curr_value;
- max_index = d;
+ if (cmp(curr_value, min_max_value)) {
+ min_max_value = curr_value;
+ min_max_index = d;
}
}
- output_data[i] = max_index;
+ output_data[i] = min_max_index;
}
}
+// TODO(renjieliu): Remove this one.
+// Thin wrapper that forwards all arguments to ArgMinMax, supplying a
+// std::greater<T1> comparator.
+template <typename T1, typename T2, typename T3>
+void ArgMax(const T3* axis, const T1* input_data,
+            const tflite::Dims<4>& input_dims, T2* output_data,
+            const tflite::Dims<4>& output_dims) {
+  const std::greater<T1> pick_greater{};
+  ArgMinMax(axis, input_data, input_dims, output_data, output_dims,
+            pick_greater);
+}
+
template <typename T>
void Transpose(const T* input, const Dims<4>& input_dims, T* output,
const Dims<4>& output_dims, const int* permuted_axes) {