diff options
Diffstat (limited to 'tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h')
-rw-r--r-- | tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h | 1229 |
1 files changed, 684 insertions, 545 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index 16901a3e53..31a54c2b62 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -158,98 +158,6 @@ SaturatingRoundingMultiplyByPOTParam( SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent)); } -// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE -// BROADCASTING. -// -// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional -// rectangular array of numbers. -// -// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. -// However, as Dims<N> is to be deprecated, this class exists as an adaptor -// to enable simple unoptimized implementations of element-wise broadcasting -// operations. -template <int N> -struct NdArrayDesc { - // The "extent" of each dimension. Indices along dimension d must be in the - // half-open interval [0, extents[d]). - int extents[N]; - - // The number of *elements* (not bytes) between consecutive indices of each - // dimension. - int strides[N]; -}; - -// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING -// ELEMENT-WISE BROADCASTING. -// -// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>. -inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2, - int i3) { - TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]); - TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]); - TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]); - TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]); - return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + - i3 * desc.strides[3]; -} - -// Given the dimensions of the operands for an element-wise binary broadcast, -// adjusts them so that they can be directly iterated over with simple loops. -// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and -// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr. -// -// This function assumes that the two input shapes are compatible up to -// broadcasting and the shorter one has already been prepended with 1s to be the -// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64), -// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that -// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be -// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1). -// -// When two shapes are compatible up to broadcasting, for each dimension d, -// the input extents are either equal, or one of them is 1. -// -// This function performs the following for each dimension d: -// - If the extents are equal, then do nothing since the loop that walks over -// both of the input arrays is correct. -// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1 -// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows -// array0 to be referenced *at any index* in dimension d and still access the -// same slice. -template <int N> -inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims, - const Dims<N>& input1_dims, - NdArrayDesc<N>* desc0_out, - NdArrayDesc<N>* desc1_out) { - TFLITE_DCHECK(desc0_out != nullptr); - TFLITE_DCHECK(desc1_out != nullptr); - - // Copy dims to desc. - for (int i = 0; i < N; ++i) { - desc0_out->extents[i] = input0_dims.sizes[i]; - desc0_out->strides[i] = input0_dims.strides[i]; - desc1_out->extents[i] = input1_dims.sizes[i]; - desc1_out->strides[i] = input1_dims.strides[i]; - } - - // Walk over each dimension. If the extents are equal do nothing. - // Otherwise, set the desc with extent 1 to have extent equal to the other and - // stride 0. - for (int i = 0; i < N; ++i) { - const int extent0 = ArraySize(input0_dims, i); - const int extent1 = ArraySize(input1_dims, i); - if (extent0 != extent1) { - if (extent0 == 1) { - desc0_out->strides[i] = 0; - desc0_out->extents[i] = extent1; - } else { - TFLITE_DCHECK_EQ(extent1, 1); - desc1_out->strides[i] = 0; - desc1_out->extents[i] = extent0; - } - } - } -} - inline void Conv(const float* input_data, const Dims<4>& input_dims, const float* filter_data, const Dims<4>& filter_dims, const float* bias_data, const Dims<4>& bias_dims, @@ -951,6 +859,19 @@ inline void Relu6(const float* input_data, const RuntimeShape& input_shape, } } +inline void ReluX(uint8 min_value, uint8 max_value, const uint8* input_data, + const RuntimeShape& input_shape, uint8* output_data, + const RuntimeShape& output_shape) { + gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)"); + const int flat_size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < flat_size; ++i) { + const uint8 val = input_data[i]; + const uint8 clamped = + val > max_value ? max_value : val < min_value ? min_value : val; + output_data[i] = clamped; + } +} + template <FusedActivationFunctionType Ac> void L2Normalization(const float* input_data, const RuntimeShape& input_shape, float* output_data, const RuntimeShape& output_shape) { @@ -1052,114 +973,108 @@ inline void L2Normalization(const uint8* input_data, } template <typename T> -inline void Add(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape& input2_shape, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { output_data[i] = ActivationFunctionWithMinMax( - input1_data[i] + input2_data[i], output_activation_min, - output_activation_max); + input1_data[i] + input2_data[i], params.quantized_activation_min, + params.quantized_activation_max); } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void Add(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float* output_data, const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - - Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min, - output_activation_max, output_data, output_dims); +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const float* input1_data, + const RuntimeShape& input2_shape, const float* input2_data, + const RuntimeShape& output_shape, float* output_data) { + const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < size; i++) { + auto x = input1_data[i] + input2_data[i]; + output_data[i] = ActivationFunctionWithMinMax( + x, params.float_activation_min, params.float_activation_max); + } } -template <FusedActivationFunctionType Ac> -inline void Add(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, int input2_shift, - int32 output_offset, int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - const int batches = - MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3); - const int height = - MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2); - const int width = - MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1); - const int depth = - MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0); - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < depth; ++c) { - const int32 input1_val = - input1_offset + input1_data[Offset(input1_dims, c, x, y, b)]; - const int32 input2_val = - input2_offset + input2_data[Offset(input2_dims, c, x, y, b)]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, - kReverseShift * input1_shift); - const int32 scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, - kReverseShift * input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, output_multiplier, kReverseShift * output_shift) + - output_offset; - const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, raw_output)); - output_data[Offset(output_dims, c, x, y, b)] = - static_cast<uint8>(clamped_output); - } - } - } +// Element-wise add that can often be used for inner loop of broadcast add as +// well as the non-broadcast add. +inline void AddElementwise(int size, const ArithmeticParams& params, + const uint8* input1_data, const uint8* input2_data, + uint8* output_data) { + TFLITE_DCHECK_GT(params.input1_offset, -256); + TFLITE_DCHECK_GT(params.input2_offset, -256); + TFLITE_DCHECK_LT(params.input1_offset, 256); + TFLITE_DCHECK_LT(params.input2_offset, 256); + + for (int i = 0; i < size; ++i) { + const int32 input1_val = params.input1_offset + input1_data[i]; + const int32 input2_val = params.input2_offset + input2_data[i]; + const int32 shifted_input1_val = input1_val * (1 << params.left_shift); + const int32 shifted_input2_val = input2_val * (1 << params.left_shift); + const int32 scaled_input1_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + const int32 scaled_input2_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32 raw_sum = scaled_input1_val + scaled_input2_val; + const int32 raw_output = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32 clamped_output = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[i] = static_cast<uint8>(clamped_output); } } -inline void Add(const int16* input1_data, const Dims<4>& input1_dims, - int input1_shift, const int16* input2_data, - const Dims<4>& input2_dims, int input2_shift, - int16 output_activation_min, int16 output_activation_max, - int16* output_data, const Dims<4>& output_dims) { - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - - const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); - - TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0); - TFLITE_DCHECK_GE(input1_shift, 0); - TFLITE_DCHECK_GE(input2_shift, 0); +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const uint8* input1_data, + const RuntimeShape& input2_shape, const uint8* input2_data, + const RuntimeShape& output_shape, uint8* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + + TFLITE_DCHECK_GT(params.input1_offset, -256); + TFLITE_DCHECK_GT(params.input2_offset, -256); + TFLITE_DCHECK_LT(params.input1_offset, 256); + TFLITE_DCHECK_LT(params.input2_offset, 256); + AddElementwise(flat_size, params, input1_data, input2_data, output_data); +} + +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16* input1_data, + const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& output_shape, int16* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + + const int input1_shift = params.input1_shift; + const int flat_size = + MatchingFlatSize(output_shape, input1_shape, input2_shape); + const int16 output_activation_min = params.quantized_activation_min; + const int16 output_activation_max = params.quantized_activation_max; + + TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0); + TFLITE_DCHECK_LE(input1_shift, 0); + TFLITE_DCHECK_LE(params.input2_shift, 0); const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data; const int16* shift_input = input1_shift == 0 ? input2_data : input1_data; - const int input_shift = input1_shift == 0 ? input2_shift : input1_shift; + const int input_right_shift = + input1_shift == 0 ? -params.input2_shift : -input1_shift; for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. using F0 = gemmlowp::FixedPoint<std::int16_t, 0>; F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]); - F0 scaled_input = - F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift)); + F0 scaled_input = F0::FromRaw( + gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift)); F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled); const int16 raw_output = result.raw(); const int16 clamped_output = std::min( @@ -1168,42 +1083,28 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims, } } -template <FusedActivationFunctionType Ac> -inline void Add(const int16* input1_data, const Dims<4>& input1_dims, - int input1_shift, const int16* input2_data, - const Dims<4>& input2_dims, int input2_shift, - int16 output_activation_min, int16 output_activation_max, - int16* output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, -32768); - TFLITE_DCHECK_EQ(output_activation_max, 32767); - } - - Add(input1_data, input1_dims, input1_shift, input2_data, input2_dims, - input2_shift, output_activation_min, output_activation_max, output_data, - output_dims); -} - // TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary // dimensionality if the runtime code does a single loop over one dimension // that handles broadcasting as the base case. The code generator would then // generate max(D1, D2) nested for loops. -template <typename T> -void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastAdd"); - +// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from +// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> +// is no longer referenced in this file, move NdArrayDesc<T> from types.h to +// reference_ops.h. +inline void BroadcastAdd4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, + float* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/float"); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -1216,49 +1117,77 @@ void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] + - input2_data[SubscriptToIndex(desc2, c, x, y, b)], - output_activation_min, output_activation_max); + input1_data[SubscriptToIndex(desc1, b, y, x, c)] + + input2_data[SubscriptToIndex(desc2, b, y, x, c)], + params.float_activation_min, params.float_activation_max); } } } } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac, typename T> -void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T* output_data, const Dims<4>& output_dims) { - T output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); +inline void BroadcastAdd4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int32* input1_data, + const RuntimeShape& input2_shape, + const int32* input2_data, + const RuntimeShape& output_shape, + int32* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int32"); + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); - BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims, - output_activation_min, output_activation_max, output_data, - output_dims); + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = + ActivationFunctionWithMinMax( + input1_data[SubscriptToIndex(desc1, b, y, x, c)] + + input2_data[SubscriptToIndex(desc2, b, y, x, c)], + params.quantized_activation_min, + params.quantized_activation_max); + } + } + } + } } -inline void BroadcastAdd(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit"); - +inline void BroadcastAdd4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const uint8* input1_data, + const RuntimeShape& input2_shape, + const uint8* input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/uint8"); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -1271,33 +1200,37 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; + params.input1_offset + + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); + params.input2_offset + + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + const int32 shifted_input1_val = + input1_val * (1 << params.left_shift); + const int32 shifted_input2_val = + input2_val * (1 << params.left_shift); const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, - kReverseShift * input1_shift); + shifted_input1_val, params.input1_multiplier, + params.input1_shift); const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, - kReverseShift * input2_shift); + shifted_input2_val, params.input2_multiplier, + params.input2_shift); const int32 raw_sum = scaled_input1_val + scaled_input2_val; const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, output_multiplier, kReverseShift * output_shift) + - output_offset; + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, raw_output)); - output_data[Offset(output_dims, c, x, y, b)] = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[Offset(extended_output_shape, b, y, x, c)] = static_cast<uint8>(clamped_output); } } @@ -1305,121 +1238,67 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data, } } -inline void BroadcastAddFivefold( - int y0, int y1, int y2, int y3, int y4, int left_shift, - const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit"); - - int sb1 = y0; - int sa2 = y0; - int sb2 = y0 * y1; - int sa3 = y0 * y2; - int sa4 = y0 * y2 * y3; - int sb4 = y0 * y1 * y2; - +inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const uint8* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const uint8* unswitched_input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { + ArithmeticParams switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + switched_params.input1_multiplier = unswitched_params.input2_multiplier; + switched_params.input1_shift = unswitched_params.input2_shift; + switched_params.input2_offset = unswitched_params.input1_offset; + switched_params.input2_multiplier = unswitched_params.input1_multiplier; + switched_params.input2_shift = unswitched_params.input1_shift; + + const bool use_unswitched = + unswitched_params.broadcast_category == + tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const ArithmeticParams& params = + use_unswitched ? unswitched_params : switched_params; + const uint8* input1_data = + use_unswitched ? unswitched_input1_data : unswitched_input2_data; + const uint8* input2_data = + use_unswitched ? unswitched_input2_data : unswitched_input1_data; + + // Fivefold nested loops. The second input resets its position for each + // iteration of the second loop. The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise add of + // sections of the arrays. uint8* output_data_ptr = output_data; - for (int i4 = 0; i4 < y4; ++i4) { - for (int i3 = 0; i3 < y3; ++i3) { + const uint8* input1_data_ptr = input1_data; + const uint8* input2_data_reset = input2_data; + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + for (int i0 = 0; i0 < y0; ++i0) { + const uint8* input2_data_ptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; for (int i2 = 0; i2 < y2; ++i2) { - for (int i1 = 0; i1 < y1; ++i1) { - for (int i0 = 0; i0 < y0; ++i0) { - const int32 input1_val = - input1_offset + - input1_data[i4 * sa4 + i3 * sa3 + i2 * sa2 + i0]; - const int32 input2_val = - input2_offset + - input2_data[i4 * sb4 + i2 * sb2 + i1 * sb1 + i0]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, - kReverseShift * input1_shift); - const int32 scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, - kReverseShift * input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, output_multiplier, kReverseShift * output_shift) + - output_offset; - const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, raw_output)); - *output_data_ptr = static_cast<uint8>(clamped_output); - ++output_data_ptr; - } + for (int i3 = 0; i3 < y3; ++i3) { + AddElementwise(y4, params, input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; } + input1_data_ptr += y4; } } + input2_data_reset = input2_data_ptr; } } -template <FusedActivationFunctionType Ac> -inline void BroadcastAdd(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset, - input1_multiplier, input1_shift, input2_data, input2_dims, - input2_offset, input2_multiplier, input2_shift, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_data, output_dims); -} - -template <FusedActivationFunctionType Ac> -inline void BroadcastAddFivefold( - int y0, int y1, int y2, int y3, int y4, int left_shift, - const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims, - input1_offset, input1_multiplier, input1_shift, - input2_data, input2_dims, input2_offset, - input2_multiplier, input2_shift, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_data, output_dims); -} - -inline void Mul(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { +template <typename T> +inline void Mul(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); for (int i = 0; i < flat_size; ++i) { output_data[i] = ActivationFunctionWithMinMax( @@ -1640,10 +1519,11 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims, } } -inline void Div(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { +template <typename T> +inline void Div(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); for (int i = 0; i < flat_size; ++i) { output_data[i] = ActivationFunctionWithMinMax( @@ -1652,15 +1532,35 @@ inline void Div(const float* input1_data, const Dims<4>& input1_dims, } } -inline void Sub(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); +inline void SubNonBroadcast(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, + float* output_data) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { output_data[i] = ActivationFunctionWithMinMax( - input1_data[i] - input2_data[i], output_activation_min, - output_activation_max); + input1_data[i] - input2_data[i], params.float_activation_min, + params.float_activation_max); + } +} + +inline void SubNonBroadcast(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int32* input1_data, + const RuntimeShape& input2_shape, + const int32* input2_data, + const RuntimeShape& output_shape, + int32* output_data) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax( + input1_data[i] - input2_data[i], params.quantized_activation_min, + params.quantized_activation_max); } } @@ -1668,16 +1568,24 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims, // dimensionality if the runtime code does a single loop over one dimension // that handles broadcasting as the base case. The code generator would then // generate max(D1, D2) nested for loops. -template <typename T> -void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastSub"); - +// TODO(benoitjacob): BroadcastSub is intentionally duplicated from +// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> +// is no longer referenced in this file, move NdArrayDesc<T> from types.h to +// reference_ops.h. +inline void BroadcastSub4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, + float* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/float"); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -1690,36 +1598,35 @@ void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] - - input2_data[SubscriptToIndex(desc2, c, x, y, b)], - output_activation_min, output_activation_max); + input1_data[SubscriptToIndex(desc1, b, y, x, c)] - + input2_data[SubscriptToIndex(desc2, b, y, x, c)], + params.float_activation_min, params.float_activation_max); } } } } } -inline void BroadcastSub(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit"); - +inline void BroadcastSub4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const uint8* input1_data, + const RuntimeShape& input2_shape, + const uint8* input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/uint8"); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -1732,33 +1639,37 @@ inline void BroadcastSub(int left_shift, const uint8* input1_data, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; + params.input1_offset + + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); + params.input2_offset + + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + const int32 shifted_input1_val = + input1_val * (1 << params.left_shift); + const int32 shifted_input2_val = + input2_val * (1 << params.left_shift); const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, - kReverseShift * input1_shift); + shifted_input1_val, params.input1_multiplier, + params.input1_shift); const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, - kReverseShift * input2_shift); + shifted_input2_val, params.input2_multiplier, + params.input2_shift); const int32 raw_sub = scaled_input1_val - scaled_input2_val; const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sub, output_multiplier, kReverseShift * output_shift) + - output_offset; + raw_sub, params.output_multiplier, params.output_shift) + + params.output_offset; const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, raw_output)); - output_data[Offset(output_dims, c, x, y, b)] = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[Offset(extended_output_shape, b, y, x, c)] = static_cast<uint8>(clamped_output); } } @@ -1766,6 +1677,156 @@ inline void BroadcastSub(int left_shift, const uint8* input1_data, } } +inline void BroadcastSub4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int32* input1_data, + const RuntimeShape& input2_shape, + const int32* input2_data, + const RuntimeShape& output_shape, + int32* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int32"); + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = + ActivationFunctionWithMinMax( + input1_data[SubscriptToIndex(desc1, b, y, x, c)] - + input2_data[SubscriptToIndex(desc2, b, y, x, c)], + params.quantized_activation_min, + params.quantized_activation_max); + } + } + } + } +} + +template <typename T> +void BroadcastSub4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape& input2_shape, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/templated"); + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = + ActivationFunctionWithMinMax( + input1_data[SubscriptToIndex(desc1, b, y, x, c)] - + input2_data[SubscriptToIndex(desc2, b, y, x, c)], + params.quantized_activation_min, + params.quantized_activation_max); + } + } + } + } +} + +template <typename T> +void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape, + const T* input1_data, const RuntimeShape& input2_shape, + const T* input2_data, const RuntimeShape& output_shape, + T* output_data) { + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = + input1_data[SubscriptToIndex(desc1, b, y, x, c)] - + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + } + } + } + } +} + +inline void SubWithActivation(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int32* input1_data, + const RuntimeShape& input2_shape, + const int32* input2_data, + const RuntimeShape& output_shape, + int32* output_data) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, input2_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax( + input1_data[i] - input2_data[i], params.quantized_activation_min, + params.quantized_activation_max); + } +} + +inline void SubWithActivation(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, + float* output_data) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, input2_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax( + input1_data[i] - input2_data[i], params.float_activation_min, + params.float_activation_max); + } +} + template <FusedActivationFunctionType Ac, typename Scalar> void Concatenation(int concat_dim, const Scalar* const* input_data, const Dims<4>* const* input_dims, int inputs_count, @@ -1799,6 +1860,26 @@ void Concatenation(int concat_dim, const Scalar* const* input_data, } } +template <typename Scalar> +void Pack(int dim, const Scalar* const* input_data, + const Dims<4>* const* input_dims, int inputs_count, + Scalar* output_data, const Dims<4>& output_dims) { + TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); + int outer_size = 1; + for (int i = dim + 1; i < 4; i++) { + outer_size *= output_dims.sizes[i]; + } + Scalar* output_ptr = output_data; + const int copy_size = FlatSize(**input_dims) / outer_size; + for (int k = 0; k < outer_size; k++) { + for (int i = 0; i < inputs_count; ++i) { + memcpy(output_ptr, input_data[i] + k * copy_size, + copy_size * sizeof(Scalar)); + output_ptr += copy_size; + } + } +} + // TODO(prabhumk): This is the same as the optimized implementation. // TODO(prabhumk): The quantized implementation of concatentation isn't fully // quantized as it takes scale as a floating point value. This should be fixed @@ -2260,13 +2341,10 @@ inline int NodeOffset(int b, int h, int w, int height, int width) { return (b * height + h) * width + w; } -inline void AveragePool(const float* input_data, - const RuntimeShape& input_shape, int stride_width, - int stride_height, int pad_width, int pad_height, - int filter_width, int filter_height, - float output_activation_min, - float output_activation_max, float* output_data, - const RuntimeShape& output_shape) { +inline void AveragePool(const PoolParams& params, + const RuntimeShape& input_shape, + const float* input_data, + const RuntimeShape& output_shape, float* output_data) { TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); @@ -2275,20 +2353,24 @@ inline void AveragePool(const float* input_data, const int input_width = input_shape.Dims(2); const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int channel = 0; channel < depth; ++channel) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); float total = 0.f; float filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; @@ -2304,22 +2386,20 @@ inline void AveragePool(const float* input_data, } const float average = total / filter_count; output_data[Offset(output_shape, batch, out_y, out_x, channel)] = - ActivationFunctionWithMinMax(average, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax(average, params.float_activation_min, + params.float_activation_max); } } } } } -inline void AveragePool(const uint8* input_data, - const RuntimeShape& input_shape, int stride_width, - int stride_height, int pad_width, int pad_height, - int filter_width, int filter_height, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const RuntimeShape& output_shape) { - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); +inline void AveragePool(const PoolParams& params, + const RuntimeShape& input_shape, + const uint8* input_data, + const RuntimeShape& output_shape, uint8* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); @@ -2328,20 +2408,24 @@ inline void AveragePool(const uint8* input_data, const int input_width = input_shape.Dims(2); const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int channel = 0; channel < depth; ++channel) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); int32 acc = 0; int filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; @@ -2356,8 +2440,8 @@ inline void AveragePool(const uint8* input_data, } } acc = (acc + filter_count / 2) / filter_count; - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); + acc = std::max(acc, params.quantized_activation_min); + acc = std::min(acc, params.quantized_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, channel)] = static_cast<uint8>(acc); } @@ -2366,11 +2450,9 @@ inline void AveragePool(const uint8* input_data, } } -inline void L2Pool(const float* input_data, const RuntimeShape& input_shape, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - float output_activation_min, float output_activation_max, - float* output_data, const RuntimeShape& output_shape) { +inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& output_shape, + float* output_data) { TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); @@ -2379,20 +2461,24 @@ inline void L2Pool(const float* input_data, const RuntimeShape& input_shape, const int input_width = input_shape.Dims(2); const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int channel = 0; channel < depth; ++channel) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); float sum_squares = 0.f; int filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; @@ -2409,19 +2495,18 @@ inline void L2Pool(const float* input_data, const RuntimeShape& input_shape, } const float l2pool_result = std::sqrt(sum_squares / filter_count); output_data[Offset(output_shape, batch, out_y, out_x, channel)] = - ActivationFunctionWithMinMax(l2pool_result, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax(l2pool_result, + params.float_activation_min, + params.float_activation_max); } } } } } -inline void MaxPool(const float* input_data, const RuntimeShape& input_shape, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - float output_activation_min, float output_activation_max, - float* output_data, const RuntimeShape& output_shape) { +inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& output_shape, + float* output_data) { TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); @@ -2430,20 +2515,24 @@ inline void MaxPool(const float* input_data, const RuntimeShape& input_shape, const int input_width = input_shape.Dims(2); const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int channel = 0; channel < depth; ++channel) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); float max = std::numeric_limits<float>::lowest(); for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { @@ -2457,22 +2546,21 @@ inline void MaxPool(const float* input_data, const RuntimeShape& input_shape, } } output_data[Offset(output_shape, batch, out_y, out_x, channel)] = - ActivationFunctionWithMinMax(max, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax(max, params.float_activation_min, + params.float_activation_max); } } } } } -inline void MaxPool(const uint8* input_data, const RuntimeShape& input_shape, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const RuntimeShape& output_shape) { - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - TFLITE_DCHECK_GE(output_activation_min, 0); - TFLITE_DCHECK_LE(output_activation_max, 255); +inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& output_shape, + uint8* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + TFLITE_DCHECK_GE(params.quantized_activation_min, 0); + TFLITE_DCHECK_LE(params.quantized_activation_max, 255); TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); @@ -2481,20 +2569,24 @@ inline void MaxPool(const uint8* input_data, const RuntimeShape& input_shape, const int input_width = input_shape.Dims(2); const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int channel = 0; channel < depth; ++channel) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); uint8 max = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { @@ -2507,8 +2599,8 @@ inline void MaxPool(const uint8* input_data, const RuntimeShape& input_shape, input_data[Offset(input_shape, batch, in_y, in_x, channel)]); } } - max = std::max<uint8>(max, output_activation_min); - max = std::min<uint8>(max, output_activation_max); + max = std::max<uint8>(max, params.quantized_activation_min); + max = std::min<uint8>(max, params.quantized_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, channel)] = static_cast<uint8>(max); } @@ -3442,9 +3534,9 @@ inline bool Reduce(const In* input_data, const int* input_dims, const int* output_dims, const int input_num_dims, const int output_num_dims, const int* axis, const int num_axis, int* input_iter, - Out reducer(Out current, const In in), Out* output_data) { + Out reducer(const Out current, const In in), + Out* output_data) { // Reset input iterator. - TFLITE_DCHECK(input_num_dims > 0); for (int idx = 0; idx < input_num_dims; ++idx) { input_iter[idx] = 0; } @@ -3460,11 +3552,16 @@ inline bool Reduce(const In* input_data, const int* input_dims, return true; } -inline bool ResolveAxis(const int num_dims, const int* axis, const int num_axis, - int* out_axis, int* out_num_axis) { +inline bool ResolveAxis(const int num_dims, const int* axis, + const int64_t num_axis, int* out_axis, + int* out_num_axis) { *out_num_axis = 0; // Just in case. + // Short-circuit axis resolution for scalars; the axis will go unused. + if (num_dims == 0) { + return true; + } // o(n^2) is fine since out_num_axis should be really small, mostly <= 4 - for (int idx = 0; idx < num_axis; ++idx) { + for (int64_t idx = 0; idx < num_axis; ++idx) { // Handle negative index. int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx]; TFLITE_DCHECK(current >= 0 && current < num_dims); @@ -3490,7 +3587,7 @@ inline bool ReduceSumImpl(const In* input_data, const int* input_dims, const int output_num_dims, const int* axis, const int num_axis, int* input_iter, Out* output_data) { - auto reducer = [](Out current, const In in) -> Out { + auto reducer = [](const Out current, const In in) -> Out { const Out actual_in = static_cast<Out>(in); return current + actual_in; }; @@ -3499,6 +3596,24 @@ inline bool ReduceSumImpl(const In* input_data, const int* input_dims, output_data); } +template <typename T> +inline bool InitTensorDataForReduce(const int* dims, const int num_dims, + const T init_value, T* data) { + size_t num_elements = 1; + for (int idx = 0; idx < num_dims; ++idx) { + size_t current = static_cast<size_t>(dims[idx]); + // Overflow prevention. + if (num_elements > std::numeric_limits<size_t>::max() / current) { + return false; + } + num_elements *= current; + } + for (size_t idx = 0; idx < num_elements; ++idx) { + data[idx] = init_value; + } + return true; +} + // Computes the sum of elements across dimensions given in axis. template <typename T> inline bool Sum(const T* input_data, const int* input_dims, @@ -3507,17 +3622,9 @@ inline bool Sum(const T* input_data, const int* input_dims, const int* axis, const int num_axis_dimensions, bool keep_dims, int* temp_index, int* resolved_axis) { // Reset output data. - size_t num_outputs = 1; - for (int idx = 0; idx < output_num_dims; ++idx) { - size_t current = static_cast<size_t>(output_dims[idx]); - // Overflow prevention. - if (num_outputs > std::numeric_limits<size_t>::max() / current) { - return false; - } - num_outputs *= current; - } - for (size_t idx = 0; idx < num_outputs; ++idx) { - output_data[idx] = T(); + if (!InitTensorDataForReduce(output_dims, output_num_dims, static_cast<T>(0), + output_data)) { + return false; } // Resolve axis. @@ -3532,6 +3639,61 @@ inline bool Sum(const T* input_data, const int* input_dims, num_resolved_axis, temp_index, output_data); } +// Computes the max of elements across dimensions given in axis. +template <typename T> +inline bool ReduceMax(const T* input_data, const int* input_dims, + const int input_num_dims, T* output_data, + const int* output_dims, const int output_num_dims, + const int* axis, const int64_t num_axis_dimensions, + bool keep_dims, int* temp_index, int* resolved_axis) { + T init_value = std::numeric_limits<T>::lowest(); + // Reset output data. + if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value, + output_data)) { + return false; + } + + // Resolve axis. + int num_resolved_axis = 0; + if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, + &num_resolved_axis)) { + return false; + } + + auto reducer = [](const T current, const T in) -> T { + return (in > current) ? in : current; + }; + return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims, + output_num_dims, resolved_axis, num_resolved_axis, + temp_index, reducer, output_data); +} + +// Computes the prod of elements across dimensions given in axis. +template <typename T> +inline bool ReduceProd(const T* input_data, const int* input_dims, + const int input_num_dims, T* output_data, + const int* output_dims, const int output_num_dims, + const int* axis, const int64_t num_axis_dimensions, + bool keep_dims, int* temp_index, int* resolved_axis) { + // Reset output data. + if (!InitTensorDataForReduce(output_dims, output_num_dims, static_cast<T>(1), + output_data)) { + return false; + } + + // Resolve axis. + int num_resolved_axis = 0; + if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, + &num_resolved_axis)) { + return false; + } + + auto reducer = [](const T current, const T in) -> T { return in * current; }; + return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims, + output_num_dims, resolved_axis, num_resolved_axis, + temp_index, reducer, output_data); +} + // Computes the mean of elements across dimensions given in axis. // It does so in two stages, first calculates the sum of elements along the axis // then divides it by the number of element in axis. @@ -3624,38 +3786,6 @@ inline void Mean(const T* input_data, const Dims<4>& input_dims, } template <typename T> -void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, - const Dims<4>& input2_dims, T* output_data, - const Dims<4>& output_dims) { - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = - input1_data[SubscriptToIndex(desc1, c, x, y, b)] - - input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - } - } - } - } -} - -template <typename T> void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, T* output_data, const Dims<4>& output_dims) { @@ -3704,9 +3834,9 @@ void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims, } } -template <typename T1, typename T2, typename T3> -void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims, - T2* output_data, const Dims<4>& output_dims) { +template <typename T1, typename T2, typename T3, typename Cmp> +void ArgMinMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims, + T2* output_data, const Dims<4>& output_dims, const Cmp& cmp) { // The current ArgMax implemention can only determine the index of the maximum // value in the last dimension. So the axis argument is ignored. @@ -3719,19 +3849,28 @@ void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims, const int depth = ArraySize(input_dims, 0); for (int i = 0; i < outer_size; ++i) { - auto max_value = input_data[i * depth]; - int max_index = 0; + auto min_max_value = input_data[i * depth]; + int min_max_index = 0; for (int d = 1; d < depth; ++d) { const auto& curr_value = input_data[i * depth + d]; - if (curr_value > max_value) { - max_value = curr_value; - max_index = d; + if (cmp(curr_value, min_max_value)) { + min_max_value = curr_value; + min_max_index = d; } } - output_data[i] = max_index; + output_data[i] = min_max_index; } } +// TODO(renjieliu): Remove this one. +template <typename T1, typename T2, typename T3> +void ArgMax(const T3* axis, const T1* input_data, + const tflite::Dims<4>& input_dims, T2* output_data, + const tflite::Dims<4>& output_dims) { + ArgMinMax(axis, input_data, input_dims, output_data, output_dims, + std::greater<T1>()); +} + template <typename T> void Transpose(const T* input, const Dims<4>& input_dims, T* output, const Dims<4>& output_dims, const int* permuted_axes) { |