Diffstat (limited to 'tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h')
-rw-r--r-- | tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h | 1028 |
1 file changed, 318 insertions(+), 710 deletions(-)
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 1b8a7205e6..78567d52ea 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -41,10 +41,13 @@ namespace optimized_ops {
 
 // Unoptimized reference ops:
 using reference_ops::ArgMax;
+using reference_ops::ArgMinMax;
+using reference_ops::BroadcastAdd4DSlow;
 using reference_ops::BroadcastGreater;
 using reference_ops::BroadcastGreaterEqual;
 using reference_ops::BroadcastLess;
 using reference_ops::BroadcastLessEqual;
+using reference_ops::BroadcastSub4DSlow;
 using reference_ops::Concatenation;
 using reference_ops::DepthConcatenation;
 using reference_ops::Dequantize;
@@ -59,6 +62,7 @@ using reference_ops::Mean;
 using reference_ops::RankOneSelect;
 using reference_ops::Relu1;
 using reference_ops::Relu6;
+using reference_ops::ReluX;
 using reference_ops::Select;
 using reference_ops::SpaceToBatchND;
 using reference_ops::StridedSlice;
@@ -215,98 +219,6 @@ SaturatingRoundingMultiplyByPOTParam(
       SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
 }
 
-// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
-// BROADCASTING.
-//
-// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
-// rectangular array of numbers.
-//
-// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
-// However, as Dims<N> is to be deprecated, this class exists as an adaptor
-// to enable simple unoptimized implementations of element-wise broadcasting
-// operations.
-template <int N>
-struct NdArrayDesc {
-  // The "extent" of each dimension. Indices along dimension d must be in the
-  // half-open interval [0, extents[d]).
-  int extents[N];
-
-  // The number of *elements* (not bytes) between consecutive indices of each
-  // dimension.
-  int strides[N];
-};
-
-// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
-// ELEMENT-WISE BROADCASTING.
-//
-// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
-inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
-                            int i3) {
-  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
-  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
-  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
-  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
-  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
-         i3 * desc.strides[3];
-}
-
-// Given the dimensions of the operands for an element-wise binary broadcast,
-// adjusts them so that they can be directly iterated over with simple loops.
-// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
-// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
-//
-// This function assumes that the two input shapes are compatible up to
-// broadcasting and the shorter one has already been prepended with 1s to be the
-// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
-// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
-// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
-// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
-//
-// When two shapes are compatible up to broadcasting, for each dimension d,
-// the input extents are either equal, or one of them is 1.
-//
-// This function performs the following for each dimension d:
-// - If the extents are equal, then do nothing since the loop that walks over
-//   both of the input arrays is correct.
-// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
-//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
-//   array0 to be referenced *at any index* in dimension d and still access the
-//   same slice.
-template <int N>
-inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
-                                                const Dims<N>& input1_dims,
-                                                NdArrayDesc<N>* desc0_out,
-                                                NdArrayDesc<N>* desc1_out) {
-  TFLITE_DCHECK(desc0_out != nullptr);
-  TFLITE_DCHECK(desc1_out != nullptr);
-
-  // Copy dims to desc.
-  for (int i = 0; i < N; ++i) {
-    desc0_out->extents[i] = input0_dims.sizes[i];
-    desc0_out->strides[i] = input0_dims.strides[i];
-    desc1_out->extents[i] = input1_dims.sizes[i];
-    desc1_out->strides[i] = input1_dims.strides[i];
-  }
-
-  // Walk over each dimension. If the extents are equal do nothing.
-  // Otherwise, set the desc with extent 1 to have extent equal to the other
-  // and stride 0.
-  for (int i = 0; i < N; ++i) {
-    const int extent0 = ArraySize(input0_dims, i);
-    const int extent1 = ArraySize(input1_dims, i);
-    if (extent0 != extent1) {
-      if (extent0 == 1) {
-        desc0_out->strides[i] = 0;
-        desc0_out->extents[i] = extent1;
-      } else {
-        TFLITE_DCHECK_EQ(extent1, 1);
-        desc1_out->strides[i] = 0;
-        desc1_out->extents[i] = extent0;
-      }
-    }
-  }
-}
-
 inline bool AreSameDims(const Dims<4>& dims1, const Dims<4>& dims2) {
   for (int i = 0; i < 4; i++) {
     if (dims1.sizes[i] != dims2.sizes[i]) {
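The stride-0 trick described in the removed comment block is easy to miss in the diff noise. Here is a minimal, self-contained sketch of the same idea with hypothetical shapes (not part of the patch): wherever an operand's extent is 1, the extent is stretched to match the other operand and the stride is forced to 0, so every index along that dimension reads the same slice.

// Standalone illustration of stride-0 broadcasting, as used by the removed
// NdArrayDescsForElementwiseBroadcast helper. Hypothetical example: add a
// per-channel bias of shape (1, 1, 1, 4) to an input of shape (1, 2, 2, 4).
#include <cstdio>

int main() {
  float input[16];
  for (int i = 0; i < 16; ++i) input[i] = i;
  const float bias[4] = {0.5f, -0.5f, 1.0f, -1.0f};

  // Broadcast-adjusted descriptors: the bias extents were (1, 1, 1, 4); the
  // three leading dimensions get extent stretched to the input's and stride 0.
  const int extents[4] = {1, 2, 2, 4};
  const int in_strides[4] = {16, 8, 4, 1};
  const int bias_strides[4] = {0, 0, 0, 1};  // stride 0 on broadcast dims

  float output[16];
  for (int b = 0; b < extents[0]; ++b)
    for (int y = 0; y < extents[1]; ++y)
      for (int x = 0; x < extents[2]; ++x)
        for (int c = 0; c < extents[3]; ++c) {
          const int in_idx =
              b * in_strides[0] + y * in_strides[1] + x * in_strides[2] + c;
          const int bias_idx = c * bias_strides[3];  // other terms are 0
          output[in_idx] = input[in_idx] + bias[bias_idx];
        }
  printf("output[5] = %f\n", output[5]);  // 5 + (-0.5) = 4.5
  return 0;
}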
@@ -2476,20 +2388,17 @@ inline void L2Normalization(const uint8* input_data,
   }
 }
 
-inline void Add(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const float* input1_data,
+                const RuntimeShape& input2_shape, const float* input2_data,
+                const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("Add");
-  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
   int i = 0;
-  const int size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
 #ifdef USE_NEON
-  const auto activation_min = vdupq_n_f32(output_activation_min);
-  const auto activation_max = vdupq_n_f32(output_activation_max);
+  const auto activation_min = vdupq_n_f32(params.float_activation_min);
+  const auto activation_max = vdupq_n_f32(params.float_activation_max);
   for (; i <= size - 16; i += 16) {
     auto a10 = vld1q_f32(input1_data + i);
     auto a11 = vld1q_f32(input1_data + i + 4);
@@ -2528,29 +2437,26 @@ inline void Add(const float* input1_data, const Dims<4>& input1_dims,
 
   for (; i < size; i++) {
     auto x = input1_data[i] + input2_data[i];
-    output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
-                                                  output_activation_max);
+    output_data[i] = ActivationFunctionWithMinMax(
+        x, params.float_activation_min, params.float_activation_max);
   }
 }
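For callers migrating to the new signature, a hedged sketch (assuming only the ArithmeticParams fields this hunk reads, float_activation_min/max) might look like the following; the helper name is hypothetical, not part of the commit.

// Hypothetical caller-side sketch: the fused activation bounds move from two
// positional floats into ArithmeticParams.
#include <limits>
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"

void AddWithRelu(const tflite::RuntimeShape& input1_shape,
                 const float* input1_data,
                 const tflite::RuntimeShape& input2_shape,
                 const float* input2_data,
                 const tflite::RuntimeShape& output_shape,
                 float* output_data) {
  tflite::ArithmeticParams op_params;
  op_params.float_activation_min = 0.0f;  // fused ReLU lower bound
  op_params.float_activation_max = std::numeric_limits<float>::max();
  tflite::optimized_ops::Add(op_params, input1_shape, input1_data,
                             input2_shape, input2_data, output_shape,
                             output_data);
}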
 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
-inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
-                           int32 input1_offset, int32 input1_multiplier,
-                           int input1_shift, const uint8* input2_data,
-                           int32 input2_offset, int32 input2_multiplier,
-                           int input2_shift, int32 output_offset,
-                           int32 output_multiplier, int output_shift,
-                           int32 output_activation_min,
-                           int32 output_activation_max, uint8* output_data) {
+inline void AddElementwise(int size, const ArithmeticParams& params,
+                           const uint8* input1_data, const uint8* input2_data,
+                           uint8* output_data) {
   int i = 0;
-  TFLITE_DCHECK_GT(input1_offset, -256);
-  TFLITE_DCHECK_GT(input2_offset, -256);
-  TFLITE_DCHECK_LT(input1_offset, 256);
-  TFLITE_DCHECK_LT(input2_offset, 256);
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
 #ifdef USE_NEON
-  const auto output_activation_min_vector = vdup_n_u8(output_activation_min);
-  const auto output_activation_max_vector = vdup_n_u8(output_activation_max);
+  const auto output_activation_min_vector =
+      vdup_n_u8(params.quantized_activation_min);
+  const auto output_activation_max_vector =
+      vdup_n_u8(params.quantized_activation_max);
   for (; i <= size - 8; i += 8) {
     const auto input1_val_original = vld1_u8(input1_data + i);
     const auto input2_val_original = vld1_u8(input2_data + i);
@@ -2559,9 +2465,9 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
     const auto input2_val_s16 =
         vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
     const auto input1_val =
-        vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset));
+        vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
     const auto input2_val =
-        vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset));
+        vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
     const auto input1_val_high = vget_high_s16(input1_val);
     const auto input1_val_low = vget_low_s16(input1_val);
     const auto input2_val_high = vget_high_s16(input2_val);
@@ -2570,32 +2476,32 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
     auto x12 = vmovl_s16(input1_val_high);
     auto x21 = vmovl_s16(input2_val_low);
     auto x22 = vmovl_s16(input2_val_high);
-    const auto left_shift_dup = vdupq_n_s32(left_shift);
+    const auto left_shift_dup = vdupq_n_s32(params.left_shift);
     x11 = vshlq_s32(x11, left_shift_dup);
     x12 = vshlq_s32(x12, left_shift_dup);
     x21 = vshlq_s32(x21, left_shift_dup);
     x22 = vshlq_s32(x22, left_shift_dup);
-    x11 = vqrdmulhq_n_s32(x11, input1_multiplier);
-    x12 = vqrdmulhq_n_s32(x12, input1_multiplier);
-    x21 = vqrdmulhq_n_s32(x21, input2_multiplier);
-    x22 = vqrdmulhq_n_s32(x22, input2_multiplier);
-    const auto input1_shift_dup = vdupq_n_s32(-input1_shift);
-    const auto input2_shift_dup = vdupq_n_s32(-input2_shift);
+    x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
+    x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
+    x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
+    x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
+    const auto input1_shift_dup = vdupq_n_s32(params.input1_shift);
+    const auto input2_shift_dup = vdupq_n_s32(params.input2_shift);
     x11 = vshlq_s32(x11, input1_shift_dup);
     x12 = vshlq_s32(x12, input1_shift_dup);
     x21 = vshlq_s32(x21, input2_shift_dup);
     x22 = vshlq_s32(x22, input2_shift_dup);
     auto s1 = vaddq_s32(x11, x21);
     auto s2 = vaddq_s32(x12, x22);
-    s1 = vqrdmulhq_n_s32(s1, output_multiplier);
-    s2 = vqrdmulhq_n_s32(s2, output_multiplier);
+    s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
+    s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
     using gemmlowp::RoundingDivideByPOT;
-    s1 = RoundingDivideByPOT(s1, output_shift);
-    s2 = RoundingDivideByPOT(s2, output_shift);
+    s1 = RoundingDivideByPOT(s1, -params.output_shift);
+    s2 = RoundingDivideByPOT(s2, -params.output_shift);
     const auto s1_narrowed = vmovn_s32(s1);
     const auto s2_narrowed = vmovn_s32(s2);
     const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
-                             vdupq_n_s16(output_offset));
+                             vdupq_n_s16(params.output_offset));
     const auto clamped =
         vmax_u8(output_activation_min_vector,
                 vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
@@ -2604,101 +2510,74 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
 #endif  // NEON
 
   for (; i < size; ++i) {
-    const int32 input1_val = input1_offset + input1_data[i];
-    const int32 input2_val = input2_offset + input2_data[i];
-    const int32 shifted_input1_val = input1_val * (1 << left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << left_shift);
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
     const int32 scaled_input1_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input1_val, input1_multiplier,
-            kReverseShift * input1_shift);
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
     const int32 scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input2_val, input2_multiplier,
-            kReverseShift * input2_shift);
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
     const int32 raw_sum = scaled_input1_val + scaled_input2_val;
     const int32 raw_output =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            raw_sum, output_multiplier, kReverseShift * output_shift) +
-        output_offset;
-    const int32 clamped_output = std::min(
-        output_activation_max, std::max(output_activation_min, raw_output));
+            raw_sum, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
     output_data[i] = static_cast<uint8>(clamped_output);
   }
 }
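The scalar tail above is dense, so here is a dependency-free sketch of what it computes, written with floats for clarity. In TFLite's affine quantization scheme the offsets in ArithmeticParams are negated zero points, and the multiplier/shift pairs are fixed-point encodings of the scale ratios; the concrete values below are illustrative only, not taken from a real model.

// Reference semantics of the integer-only quantized-add pipeline:
// real = scale * (quantized - zero_point), add in real space, requantize.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

uint8_t QuantizedAddReference(uint8_t q1, float scale1, int zp1,
                              uint8_t q2, float scale2, int zp2,
                              float out_scale, int out_zp) {
  const float real1 = scale1 * (q1 - zp1);
  const float real2 = scale2 * (q2 - zp2);
  const float sum = real1 + real2;
  const int32_t q = static_cast<int32_t>(std::round(sum / out_scale)) + out_zp;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));  // clamp to uint8
}

int main() {
  // 0.1 * (130 - 128) + 0.1 * (140 - 128) = 1.4; 1.4 / 0.2 + 128 = 135.
  printf("%u\n", QuantizedAddReference(130, 0.1f, 128, 140, 0.1f, 128,
                                       0.2f, 128));
  return 0;
}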
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Add(const float* input1_data, const Dims<4>& input1_dims,
-         const float* input2_data, const Dims<4>& input2_dims,
-         float* output_data, const Dims<4>& output_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  Add(input1_data, input1_dims, input2_data, input2_dims,
-      output_activation_min, output_activation_max, output_data, output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-inline void Add(int left_shift, const uint8* input1_data,
-                const Dims<4>& input1_dims, int32 input1_offset,
-                int32 input1_multiplier, int input1_shift,
-                const uint8* input2_data, const Dims<4>& input2_dims,
-                int32 input2_offset, int32 input2_multiplier, int input2_shift,
-                int32 output_offset, int32 output_multiplier, int output_shift,
-                int32 output_activation_min, int32 output_activation_max,
-                uint8* output_data, const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const uint8* input1_data,
+                const RuntimeShape& input2_shape, const uint8* input2_data,
+                const RuntimeShape& output_shape, uint8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
   gemmlowp::ScopedProfilingLabel label("Add/8bit");
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
-  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-
-  TFLITE_DCHECK_GT(input1_offset, -256);
-  TFLITE_DCHECK_GT(input2_offset, -256);
-  TFLITE_DCHECK_LT(input1_offset, 256);
-  TFLITE_DCHECK_LT(input2_offset, 256);
-  AddElementwise(flat_size, left_shift, input1_data, input1_offset,
-                 input1_multiplier, input1_shift, input2_data, input2_offset,
-                 input2_multiplier, input2_shift, output_offset,
-                 output_multiplier, output_shift, output_activation_min,
-                 output_activation_max, output_data);
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
 }
-inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
-                int input1_shift, const int16* input2_data,
-                const Dims<4>& input2_dims, int input2_shift,
-                int16 output_activation_min, int16 output_activation_max,
-                int16* output_data, const Dims<4>& output_dims) {
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int16* input1_data,
+                const RuntimeShape& input2_shape, const int16* input2_data,
+                const RuntimeShape& output_shape, int16* output_data) {
   gemmlowp::ScopedProfilingLabel label("Add/Int16");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-
-  const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
-
-  TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0);
-  TFLITE_DCHECK_GE(input1_shift, 0);
-  TFLITE_DCHECK_GE(input2_shift, 0);
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+
+  const int input1_shift = params.input1_shift;
+  const int flat_size =
+      MatchingFlatSize(output_shape, input1_shape, input2_shape);
+  const int16 output_activation_min = params.quantized_activation_min;
+  const int16 output_activation_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+  TFLITE_DCHECK_LE(input1_shift, 0);
+  TFLITE_DCHECK_LE(params.input2_shift, 0);
   const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
   const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
-  const int input_shift = input1_shift == 0 ? input2_shift : input1_shift;
+  const int input_right_shift =
+      input1_shift == 0 ? -params.input2_shift : -input1_shift;
 
   for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
     using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
 
     F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
-    F0 scaled_input =
-        F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift));
+    F0 scaled_input = F0::FromRaw(
+        gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
     F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
     const int16 raw_output = result.raw();
     const int16 clamped_output = std::min(
@@ -2707,195 +2586,59 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
   }
 }
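A plain-integer sketch of what the gemmlowp F0 path computes (illustrative, and simplified: gemmlowp's RoundingDivideByPOT also rounds negative inputs correctly): both operands are Q0.15 fixed point in [-1, 1), the operand with the smaller scale is rounding-right-shifted into the other's scale, and the two are added with saturation.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int16_t SaturatingAddQ15(int16_t a, int16_t b) {
  const int32_t sum = static_cast<int32_t>(a) + b;  // widen to avoid overflow
  return static_cast<int16_t>(std::min(32767, std::max(-32768, sum)));
}

int16_t RoundingShiftRight(int16_t x, int exponent) {
  const int16_t round = 1 << (exponent - 1);  // assumes exponent >= 1, x >= 0
  return static_cast<int16_t>((x + round) >> exponent);
}

int main() {
  const int16_t ready = 20000;       // raw Q0.15 value already in output scale
  const int16_t needs_shift = 30000; // raw Q0.15 value needing >> 2 to match
  const int16_t scaled = RoundingShiftRight(needs_shift, 2);  // 7500
  printf("%d\n", SaturatingAddQ15(ready, scaled));            // 27500
  return 0;
}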
-inline void Add(const int32* input1_data, const Dims<4>& input1_dims,
-                const int32* input2_data, const Dims<4>& input2_dims,
-                int32 output_activation_min, int32 output_activation_max,
-                int32* output_data, const Dims<4>& output_dims) {
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int32* input1_data,
+                const RuntimeShape& input2_shape, const int32* input2_data,
+                const RuntimeShape& output_shape, int32* output_data) {
   gemmlowp::ScopedProfilingLabel label("Add/int32");
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] + input2_data[i], output_activation_min,
-        output_activation_max);
-  }
-}
-
-template <FusedActivationFunctionType Ac>
-inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
-                int input1_shift, const int16* input2_data,
-                const Dims<4>& input2_dims, int input2_shift,
-                int16 output_activation_min, int16 output_activation_max,
-                int16* output_data, const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, -32768);
-    TFLITE_DCHECK_EQ(output_activation_max, 32767);
-  }
-
-  Add(input1_data, input1_dims, input1_shift, input2_data, input2_dims,
-      input2_shift, output_activation_min, output_activation_max, output_data,
-      output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-void Add(const int32* input1_data, const Dims<4>& input1_dims,
-         const int32* input2_data, const Dims<4>& input2_dims,
-         int32* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Add/int32");
-  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
-
-  auto input1_map = MapAsVector(input1_data, input1_dims);
-  auto input2_map = MapAsVector(input2_data, input2_dims);
-  auto output_map = MapAsVector(output_data, output_dims);
-  if (AreSameDims(input1_dims, input2_dims)) {
+  auto input1_map = MapAsVector(input1_data, input1_shape);
+  auto input2_map = MapAsVector(input2_data, input2_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
+  if (input1_shape == input2_shape) {
     output_map.array() = input1_map.array() + input2_map.array();
-  } else if (FlatSize(input2_dims) == 1) {
+  } else if (input2_shape.FlatSize() == 1) {
     auto scalar = input2_data[0];
     output_map.array() = input1_map.array() + scalar;
-  } else if (FlatSize(input1_dims) == 1) {
+  } else if (input1_shape.FlatSize() == 1) {
     auto scalar = input1_data[0];
     output_map.array() = scalar + input2_map.array();
   } else {
     // Should not come here.
     TFLITE_DCHECK(false);
   }
+  output_map = output_map.cwiseMax(params.quantized_activation_min);
+  output_map = output_map.cwiseMin(params.quantized_activation_max);
 }
-// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
-template <typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
-                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
-  }
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T* output_data, const Dims<4>& output_dims) {
-  T output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
-               output_activation_min, output_activation_max, output_data,
-               output_dims);
-}
-
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
-                         const Dims<4>& input1_dims, int32 input1_offset,
-                         int32 input1_multiplier, int input1_shift,
-                         const uint8* input2_data, const Dims<4>& input2_dims,
-                         int32 input2_offset, int32 input2_multiplier,
-                         int input2_shift, int32 output_offset,
-                         int32 output_multiplier, int output_shift,
-                         int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastAddGeneric/8bit");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          const int32 input1_val =
-              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
-          const int32 input2_val =
-              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-          const int32 shifted_input1_val = input1_val * (1 << left_shift);
-          const int32 shifted_input2_val = input2_val * (1 << left_shift);
-          const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input1_val, input1_multiplier,
-                  kReverseShift * input1_shift);
-          const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input2_val, input2_multiplier,
-                  kReverseShift * input2_shift);
-          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-          const int32 raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  raw_sum, output_multiplier, kReverseShift * output_shift) +
-              output_offset;
-          const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, raw_output));
-          output_data[Offset(output_dims, c, x, y, b)] =
-              static_cast<uint8>(clamped_output);
-        }
-      }
-    }
-  }
-}
-
-inline void BroadcastAddFivefold(
-    int y0, int y1, int y2, int y3, int y4, int left_shift,
-    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
-    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
-    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
-    int input2_shift, int32 output_offset, int32 output_multiplier,
-    int output_shift, int32 output_activation_min, int32 output_activation_max,
-    uint8* output_data, const Dims<4>& output_dims) {
+inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
+                                 const RuntimeShape& unswitched_input1_shape,
+                                 const uint8* unswitched_input1_data,
+                                 const RuntimeShape& unswitched_input2_shape,
+                                 const uint8* unswitched_input2_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit");
+  ArithmeticParams switched_params = unswitched_params;
+  switched_params.input1_offset = unswitched_params.input2_offset;
+  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
+  switched_params.input1_shift = unswitched_params.input2_shift;
+  switched_params.input2_offset = unswitched_params.input1_offset;
+  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
+  switched_params.input2_shift = unswitched_params.input1_shift;
+
+  const bool use_unswitched =
+      unswitched_params.broadcast_category ==
+      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+  const ArithmeticParams& params =
+      use_unswitched ? unswitched_params : switched_params;
+  const uint8* input1_data =
+      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+  const uint8* input2_data =
+      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
   // Fivefold nested loops. The second input resets its position for each
   // iteration of the second loop. The first input resets its position at the
   // beginning of the fourth loop. The innermost loop is an elementwise add of
@@ -2903,82 +2646,29 @@ inline void BroadcastAddFivefold(
   uint8* output_data_ptr = output_data;
   const uint8* input1_data_ptr = input1_data;
   const uint8* input2_data_reset = input2_data;
-  for (int i4 = 0; i4 < y4; ++i4) {
+  int y0 = params.broadcast_shape[0];
+  int y1 = params.broadcast_shape[1];
+  int y2 = params.broadcast_shape[2];
+  int y3 = params.broadcast_shape[3];
+  int y4 = params.broadcast_shape[4];
+  for (int i0 = 0; i0 < y0; ++i0) {
     const uint8* input2_data_ptr;
-    for (int i3 = 0; i3 < y3; ++i3) {
+    for (int i1 = 0; i1 < y1; ++i1) {
       input2_data_ptr = input2_data_reset;
       for (int i2 = 0; i2 < y2; ++i2) {
-        for (int i1 = 0; i1 < y1; ++i1) {
-          AddElementwise(
-              y0, left_shift, input1_data_ptr, input1_offset, input1_multiplier,
-              input1_shift, input2_data_ptr, input2_offset, input2_multiplier,
-              input2_shift, output_offset, output_multiplier, output_shift,
-              output_activation_min, output_activation_max, output_data_ptr);
-          input2_data_ptr += y0;
-          output_data_ptr += y0;
+        for (int i3 = 0; i3 < y3; ++i3) {
+          AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+                         output_data_ptr);
+          input2_data_ptr += y4;
+          output_data_ptr += y4;
         }
-        input1_data_ptr += y0;
+        input1_data_ptr += y4;
       }
     }
     input2_data_reset = input2_data_ptr;
   }
 }
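The pointer walk is easier to follow outside the diff. The sketch below replays it for hypothetical shapes (2, 1, 4) + (1, 3, 4) broadcasting to (2, 3, 4); in fivefold terms y0=1, y1=2, y2=1, y3=3, y4=4, so input1 holds y0*y1*y2*y4 values (constant along y3), input2 holds y0*y2*y3*y4 values (constant along y1), and the innermost y4-long run is a plain element-wise add.

#include <cstdio>

int main() {
  const int y0 = 1, y1 = 2, y2 = 1, y3 = 3, y4 = 4;
  float input1[8], input2[12], output[24];
  for (int i = 0; i < 8; ++i) input1[i] = i;
  for (int i = 0; i < 12; ++i) input2[i] = 100 * i;

  float* out = output;
  const float* in1 = input1;
  const float* in2_reset = input2;
  for (int i0 = 0; i0 < y0; ++i0) {
    const float* in2 = in2_reset;
    for (int i1 = 0; i1 < y1; ++i1) {
      in2 = in2_reset;  // second input rewinds on every i1 iteration
      for (int i2 = 0; i2 < y2; ++i2) {
        for (int i3 = 0; i3 < y3; ++i3) {
          for (int i = 0; i < y4; ++i) out[i] = in1[i] + in2[i];  // inner add
          in2 += y4;
          out += y4;
        }
        in1 += y4;  // first input advances once per i2 iteration
      }
    }
    in2_reset = in2;  // commit the second input's position once per i0
  }
  // output[12..15] = input1[4..7] + input2[0..3]
  printf("%.0f %.0f\n", output[12], output[23]);  // prints: 4 1107
  return 0;
}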
-template <FusedActivationFunctionType Ac>
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
-                         const Dims<4>& input1_dims, int32 input1_offset,
-                         int32 input1_multiplier, int input1_shift,
-                         const uint8* input2_data, const Dims<4>& input2_dims,
-                         int32 input2_offset, int32 input2_multiplier,
-                         int input2_shift, int32 output_offset,
-                         int32 output_multiplier, int output_shift,
-                         int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
-               input1_multiplier, input1_shift, input2_data, input2_dims,
-               input2_offset, input2_multiplier, input2_shift, output_offset,
-               output_multiplier, output_shift, output_activation_min,
-               output_activation_max, output_data, output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-inline void BroadcastAddFivefold(
-    int y0, int y1, int y2, int y3, int y4, int left_shift,
-    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
-    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
-    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
-    int input2_shift, int32 output_offset, int32 output_multiplier,
-    int output_shift, int32 output_activation_min, int32 output_activation_max,
-    uint8* output_data, const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims,
-                       input1_offset, input1_multiplier, input1_shift,
-                       input2_data, input2_dims, input2_offset,
-                       input2_multiplier, input2_shift, output_offset,
-                       output_multiplier, output_shift, output_activation_min,
-                       output_activation_max, output_data, output_dims);
-}
-
 inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
@@ -3052,6 +2742,20 @@ void Mul(const float* input1_data, const Dims<4>& input1_dims,
                       output_activation_max, output_data, output_dims);
 }
 
+inline void Mul(const int32* input1_data, const Dims<4>& input1_dims,
+                const int32* input2_data, const Dims<4>& input2_dims,
+                int32 output_activation_min, int32 output_activation_max,
+                int32* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Mul/int32");
+
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] * input2_data[i], output_activation_min,
+        output_activation_max);
+  }
+}
+
 template <FusedActivationFunctionType Ac>
 void Mul(const int32* input1_data, const Dims<4>& input1_dims,
          const int32* input2_data, const Dims<4>& input2_dims,
@@ -3289,122 +2993,78 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
 }
 
 // TODO(aselle): This is not actually optimized yet.
-inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Sub");
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+inline void SubNonBroadcast(const ArithmeticParams& params,
+                            const RuntimeShape& input1_shape,
+                            const float* input1_data,
+                            const RuntimeShape& input2_shape,
+                            const float* input2_data,
+                            const RuntimeShape& output_shape,
+                            float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("SubNonBroadcast");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
   for (int i = 0; i < flat_size; ++i) {
     output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], output_activation_min,
-        output_activation_max);
+        input1_data[i] - input2_data[i], params.float_activation_min,
+        params.float_activation_max);
   }
 }
 
-// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
-template <typename T>
-void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastSub");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
-                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+inline void SubWithActivation(const ArithmeticParams& params,
+                              const RuntimeShape& input1_shape,
+                              const int32* input1_data,
+                              const RuntimeShape& input2_shape,
+                              const int32* input2_data,
+                              const RuntimeShape& output_shape,
+                              int32* output_data) {
+  gemmlowp::ScopedProfilingLabel label("SubWithActivation/int32");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, input2_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], params.quantized_activation_min,
+        params.quantized_activation_max);
   }
 }
 
-inline void BroadcastSub(int left_shift, const uint8* input1_data,
-                         const Dims<4>& input1_dims, int32 input1_offset,
-                         int32 input1_multiplier, int input1_shift,
-                         const uint8* input2_data, const Dims<4>& input2_dims,
-                         int32 input2_offset, int32 input2_multiplier,
-                         int input2_shift, int32 output_offset,
-                         int32 output_multiplier, int output_shift,
-                         int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit");
+inline void SubWithActivation(const ArithmeticParams& params,
+                              const RuntimeShape& input1_shape,
+                              const float* input1_data,
+                              const RuntimeShape& input2_shape,
+                              const float* input2_data,
+                              const RuntimeShape& output_shape,
+                              float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("SubWithActivation/float");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, input2_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], params.float_activation_min,
+        params.float_activation_max);
+  }
+}
 
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+template <typename T>
+void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
+         const T* input1_data, const RuntimeShape& input2_shape,
+         const T* input2_data, const RuntimeShape& output_shape,
+         T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Sub");
 
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          const int32 input1_val =
-              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
-          const int32 input2_val =
-              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-          const int32 shifted_input1_val = input1_val * (1 << left_shift);
-          const int32 shifted_input2_val = input2_val * (1 << left_shift);
-          const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input1_val, input1_multiplier,
-                  kReverseShift * input1_shift);
-          const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input2_val, input2_multiplier,
-                  kReverseShift * input2_shift);
-          const int32 raw_sub = scaled_input1_val - scaled_input2_val;
-          const int32 raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  raw_sub, output_multiplier, kReverseShift * output_shift) +
-              output_offset;
-          const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, raw_output));
-          output_data[Offset(output_dims, c, x, y, b)] =
-              static_cast<uint8>(clamped_output);
-        }
-      }
-    }
+  auto input1_map = MapAsVector(input1_data, input1_shape);
+  auto input2_map = MapAsVector(input2_data, input2_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
+  if (input1_shape == input2_shape) {
+    output_map.array() = input1_map.array() - input2_map.array();
+  } else if (input1_shape.FlatSize() == 1) {
+    auto scalar = input1_data[0];
+    output_map.array() = scalar - input2_map.array();
+  } else if (input2_shape.FlatSize() == 1) {
+    auto scalar = input2_data[0];
+    output_map.array() = input1_map.array() - scalar;
+  } else {
+    BroadcastSub4DSlow(params, input1_shape, input1_data, input2_shape,
+                       input2_data, output_shape, output_data);
   }
 }
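A usage sketch for the new templated Sub (hypothetical helper, and it assumes RuntimeShape accepts an initializer list): a flat-size-1 operand takes the Eigen scalar fast path, while mismatched shapes fall back to reference_ops::BroadcastSub4DSlow. Note the activation bounds in params are only consulted on that fallback, not on the fast paths.

#include <limits>
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"

// Subtract one scalar from a length-n buffer.
void SubtractScalar(const float* input, int n, float scalar, float* output) {
  tflite::ArithmeticParams params;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();
  const tflite::RuntimeShape input_shape({1, 1, 1, n});
  const tflite::RuntimeShape scalar_shape({1, 1, 1, 1});
  tflite::optimized_ops::Sub(params, input_shape, input, scalar_shape,
                             &scalar, input_shape, output);
}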
@@ -3770,21 +3430,20 @@ inline int NodeOffset(int b, int h, int w, int height, int width) {
   return (b * height + h) * width + w;
 }
 
-inline void AveragePool(const float* input_data,
-                        const RuntimeShape& input_shape, int stride_width,
-                        int stride_height, int pad_width, int pad_height,
-                        int kwidth, int kheight, float output_activation_min,
-                        float output_activation_max, float* output_data,
-                        const RuntimeShape& output_shape) {
+inline void AveragePool(const PoolParams& params,
+                        const RuntimeShape& input_shape,
+                        const float* input_data,
+                        const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("AveragePool");
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
   const int input_height = input_shape.Dims(1);
   const int input_width = input_shape.Dims(2);
   const int output_height = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
 
   // TODO(benoitjacob) make this a proper reference impl without Eigen!
   const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
@@ -3799,12 +3458,15 @@ inline void AveragePool(const float* input_data,
       for (int w = 0; w < input_width; ++w) {
         // (h_start, h_end) * (w_start, w_end) is the range that the input
         // vector projects to.
-        int hpad = h + pad_height;
-        int wpad = w + pad_width;
-        int h_start =
-            (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
+        int hpad = h + params.padding_values.height;
+        int wpad = w + params.padding_values.width;
+        int h_start = (hpad < params.filter_height)
+                          ? 0
+                          : (hpad - params.filter_height) / stride_height + 1;
         int h_end = std::min(hpad / stride_height + 1, output_height);
-        int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
+        int w_start = (wpad < params.filter_width)
+                          ? 0
+                          : (wpad - params.filter_width) / stride_width + 1;
         int w_end = std::min(wpad / stride_width + 1, output_width);
         // compute elementwise sum
         for (int ph = h_start; ph < h_end; ++ph) {
@@ -3822,29 +3484,21 @@ inline void AveragePool(const float* input_data,
   TFLITE_DCHECK_GT(out_count.minCoeff(), 0);
   out_mat.array().rowwise() /= out_count.transpose().array();
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < output_height; ++y) {
-      for (int x = 0; x < output_width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_shape, b, y, x, c)] =
-              ActivationFunctionWithMinMax(
-                  output_data[Offset(output_shape, b, y, x, c)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(output_data[i],
+                                                  params.float_activation_min,
+                                                  params.float_activation_max);
   }
 }
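For pooling callers, a hedged migration sketch (hypothetical helper, not part of the commit; the PoolParams fields used are exactly the ones this patch reads — stride_*, filter_*, padding_values.*, and the activation bounds):

#include "tensorflow/contrib/lite/kernels/internal/types.h"

tflite::PoolParams MakePoolParams(int stride_width, int stride_height,
                                  int pad_width, int pad_height,
                                  int filter_width, int filter_height,
                                  float activation_min, float activation_max) {
  tflite::PoolParams params;
  params.stride_width = stride_width;
  params.stride_height = stride_height;
  params.padding_values.width = pad_width;    // left/top padding in pixels
  params.padding_values.height = pad_height;
  params.filter_width = filter_width;
  params.filter_height = filter_height;
  params.float_activation_min = activation_min;
  params.float_activation_max = activation_max;
  return params;
}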
-inline void AveragePool(const uint8* input_data,
-                        const RuntimeShape& input_shape, int stride_width,
-                        int stride_height, int pad_width, int pad_height,
-                        int filter_width, int filter_height,
-                        int32 output_activation_min,
-                        int32 output_activation_max, uint8* output_data,
-                        const RuntimeShape& output_shape) {
+inline void AveragePool(const PoolParams& params,
+                        const RuntimeShape& input_shape,
+                        const uint8* input_data,
+                        const RuntimeShape& output_shape, uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("AveragePool/8bit");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
@@ -3853,17 +3507,21 @@ inline void AveragePool(const uint8* input_data,
   const int input_width = input_shape.Dims(2);
   const int output_height = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
   for (int batch = 0; batch < batches; ++batch) {
     for (int out_y = 0; out_y < output_height; ++out_y) {
       for (int out_x = 0; out_x < output_width; ++out_x) {
-        const int in_x_origin = (out_x * stride_width) - pad_width;
-        const int in_y_origin = (out_y * stride_height) - pad_height;
+        const int in_x_origin =
+            (out_x * stride_width) - params.padding_values.width;
+        const int in_y_origin =
+            (out_y * stride_height) - params.padding_values.height;
         const int filter_x_start = std::max(0, -in_x_origin);
         const int filter_x_end =
-            std::min(filter_width, input_width - in_x_origin);
+            std::min(params.filter_width, input_width - in_x_origin);
         const int filter_y_start = std::max(0, -in_y_origin);
         const int filter_y_end =
-            std::min(filter_height, input_height - in_y_origin);
+            std::min(params.filter_height, input_height - in_y_origin);
         const int filter_count =
             (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
         // 1280 required by Inception v3
@@ -3911,18 +3569,18 @@ inline void AveragePool(const uint8* input_data,
             output_data + Offset(output_shape, batch, out_y, out_x, 0);
         int channel = 0;
 #ifdef USE_NEON
-#define AVGPOOL_DIVIDING_BY(FILTER_COUNT)                              \
-  if (filter_count == FILTER_COUNT) {                                  \
-    for (; channel <= depth - 8; channel += 8) {                       \
-      uint16 buf[8];                                                   \
-      for (int i = 0; i < 8; i++) {                                    \
-        buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
-      }                                                                \
-      uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));                     \
-      buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max));          \
-      buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min));          \
-      vst1_u8(output_ptr + channel, buf8);                             \
-    }                                                                  \
+#define AVGPOOL_DIVIDING_BY(FILTER_COUNT)                               \
+  if (filter_count == FILTER_COUNT) {                                   \
+    for (; channel <= depth - 8; channel += 8) {                        \
+      uint16 buf[8];                                                    \
+      for (int i = 0; i < 8; i++) {                                     \
+        buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT;  \
+      }                                                                 \
+      uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));                      \
+      buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \
+      buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \
+      vst1_u8(output_ptr + channel, buf8);                              \
+    }                                                                   \
   }
         AVGPOOL_DIVIDING_BY(9)
         AVGPOOL_DIVIDING_BY(15)
@@ -3933,15 +3591,15 @@ inline void AveragePool(const uint8* input_data,
            buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
           }
           uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
-          buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max));
-          buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min));
+          buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
+          buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
           vst1_u8(output_ptr + channel, buf8);
         }
 #endif
         for (; channel < depth; ++channel) {
           uint16 a = (acc[channel] + filter_count / 2) / filter_count;
-          a = std::max<uint16>(a, output_activation_min);
-          a = std::min<uint16>(a, output_activation_max);
+          a = std::max<uint16>(a, params.quantized_activation_min);
+          a = std::min<uint16>(a, params.quantized_activation_max);
           output_ptr[channel] = static_cast<uint8>(a);
         }
       }
@@ -3949,20 +3607,19 @@
     }
   }
 }
-inline void MaxPool(const float* input_data, const RuntimeShape& input_shape,
-                    int stride_width, int stride_height, int pad_width,
-                    int pad_height, int kwidth, int kheight,
-                    float output_activation_min, float output_activation_max,
-                    float* output_data, const RuntimeShape& output_shape) {
+inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
+                    const float* input_data, const RuntimeShape& output_shape,
+                    float* output_data) {
   gemmlowp::ScopedProfilingLabel label("MaxPool");
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
   const int input_height = input_shape.Dims(1);
   const int input_width = input_shape.Dims(2);
   const int output_height = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
 
   const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
   auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
@@ -3973,12 +3630,15 @@ inline void MaxPool(const float* input_data, const RuntimeShape& input_shape,
       for (int w = 0; w < input_width; ++w) {
         // (h_start, h_end) * (w_start, w_end) is the range that the input
         // vector projects to.
-        int hpad = h + pad_height;
-        int wpad = w + pad_width;
-        int h_start =
-            (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
+        int hpad = h + params.padding_values.height;
+        int wpad = w + params.padding_values.width;
+        int h_start = (hpad < params.filter_height)
+                          ? 0
+                          : (hpad - params.filter_height) / stride_height + 1;
         int h_end = std::min(hpad / stride_height + 1, output_height);
-        int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
+        int w_start = (wpad < params.filter_width)
+                          ? 0
+                          : (wpad - params.filter_width) / stride_width + 1;
         int w_end = std::min(wpad / stride_width + 1, output_width);
         // compute elementwise sum
         for (int ph = h_start; ph < h_end; ++ph) {
@@ -3993,28 +3653,20 @@ inline void MaxPool(const float* input_data, const RuntimeShape& input_shape,
       }
     }
   }
-
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < output_height; ++y) {
-      for (int x = 0; x < output_width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_shape, b, y, x, c)] =
-              ActivationFunctionWithMinMax(
-                  output_data[Offset(output_shape, b, y, x, c)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(output_data[i],
+                                                  params.float_activation_min,
+                                                  params.float_activation_max);
   }
 }
 
-inline void MaxPool(const uint8* input_data, const RuntimeShape& input_shape,
-                    int stride_width, int stride_height, int pad_width,
-                    int pad_height, int filter_width, int filter_height,
-                    int32 output_activation_min, int32 output_activation_max,
-                    uint8* output_data, const RuntimeShape& output_shape) {
+inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
+                    const uint8* input_data, const RuntimeShape& output_shape,
+                    uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("MaxPool/8bit");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
@@ -4023,17 +3675,21 @@ inline void MaxPool(const uint8* input_data, const RuntimeShape& input_shape,
   const int input_width = input_shape.Dims(2);
   const int output_height = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
   for (int batch = 0; batch < batches; ++batch) {
     for (int out_y = 0; out_y < output_height; ++out_y) {
       for (int out_x = 0; out_x < output_width; ++out_x) {
-        const int in_x_origin = (out_x * stride_width) - pad_width;
-        const int in_y_origin = (out_y * stride_height) - pad_height;
+        const int in_x_origin =
+            (out_x * stride_width) - params.padding_values.width;
+        const int in_y_origin =
+            (out_y * stride_height) - params.padding_values.height;
         const int filter_x_start = std::max(0, -in_x_origin);
         const int filter_x_end =
-            std::min(filter_width, input_width - in_x_origin);
+            std::min(params.filter_width, input_width - in_x_origin);
         const int filter_y_start = std::max(0, -in_y_origin);
         const int filter_y_end =
-            std::min(filter_height, input_height - in_y_origin);
+            std::min(params.filter_height, input_height - in_y_origin);
         // 2048 required by Inception v3
         static constexpr int kAccBufferMaxSize = 2048;
         TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
@@ -4076,21 +3732,21 @@ inline void MaxPool(const uint8* input_data, const RuntimeShape& input_shape,
 #ifdef USE_NEON
         for (; channel <= depth - 16; channel += 16) {
           uint8x16_t a = vld1q_u8(acc + channel);
-          a = vminq_u8(a, vdupq_n_u8(output_activation_max));
-          a = vmaxq_u8(a, vdupq_n_u8(output_activation_min));
+          a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max));
+          a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min));
          vst1q_u8(output_ptr + channel, a);
         }
         for (; channel <= depth - 8; channel += 8) {
           uint8x8_t a = vld1_u8(acc + channel);
-          a = vmin_u8(a, vdup_n_u8(output_activation_max));
-          a = vmax_u8(a, vdup_n_u8(output_activation_min));
+          a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max));
+          a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min));
           vst1_u8(output_ptr + channel, a);
         }
 #endif
         for (; channel < depth; ++channel) {
           uint8 a = acc[channel];
-          a = std::max<uint8>(a, output_activation_min);
-          a = std::min<uint8>(a, output_activation_max);
+          a = std::max<uint8>(a, params.quantized_activation_min);
+          a = std::min<uint8>(a, params.quantized_activation_max);
           output_ptr[channel] = static_cast<uint8>(a);
         }
       }
@@ -4098,11 +3754,9 @@ inline void MaxPool(const uint8* input_data, const RuntimeShape& input_shape,
   }
 }
 
-inline void L2Pool(const float* input_data, const RuntimeShape& input_shape,
-                   int stride_width, int stride_height, int pad_width,
-                   int pad_height, int filter_width, int filter_height,
-                   float output_activation_min, float output_activation_max,
-                   float* output_data, const RuntimeShape& output_shape) {
+inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
+                   const float* input_data, const RuntimeShape& output_shape,
+                   float* output_data) {
   gemmlowp::ScopedProfilingLabel label("L2Pool");
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
@@ -4111,6 +3765,8 @@ inline void L2Pool(const float* input_data, const RuntimeShape& input_shape,
   const int input_width = input_shape.Dims(2);
   const int output_height = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
   // Actually carry out L2 Pool. Code is written in forward mode: we go through
   // the input values once, and write to all the pooled regions that it maps to.
   const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
@@ -4125,15 +3781,17 @@ inline void L2Pool(const float* input_data, const RuntimeShape& input_shape,
      for (int w = 0; w < input_width; ++w) {
         // (h_start, h_end) * (w_start, w_end) is the range that the input
         // vector projects to.
-        const int hpad = h + pad_height;
-        const int wpad = w + pad_width;
-        const int h_start = (hpad < filter_height)
-                                ? 0
-                                : (hpad - filter_height) / stride_height + 1;
+        const int hpad = h + params.padding_values.height;
+        const int wpad = w + params.padding_values.width;
+        const int h_start =
+            (hpad < params.filter_height)
+                ? 0
+                : (hpad - params.filter_height) / stride_height + 1;
         const int h_end = std::min(hpad / stride_height + 1, output_height);
-        const int w_start = (wpad < filter_width)
-                                ? 0
-                                : (wpad - filter_width) / stride_width + 1;
+        const int w_start =
+            (wpad < params.filter_width)
+                ? 0
+                : (wpad - params.filter_width) / stride_width + 1;
         const int w_end = std::min(wpad / stride_width + 1, output_width);
         // pre-compute square
         const int in_offset = w + input_width * (h + input_height * b);
@@ -4154,6 +3812,13 @@ inline void L2Pool(const float* input_data, const RuntimeShape& input_shape,
   out_count = out_count.array().inverse();
   out_mat =
       (out_mat.array().rowwise() * out_count.transpose().array()).cwiseSqrt();
+
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(output_data[i],
+                                                  params.float_activation_min,
+                                                  params.float_activation_max);
+  }
 }
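Note this hunk also gives L2Pool the fused-activation clamp that AveragePool and MaxPool already applied; before this change the L2Pool output was written unclamped (the diff only adds clamping lines, it removes none). Per output element the math is the root of the mean of squares over the window, as in this illustrative scalar sketch (not the production Eigen path):

#include <algorithm>
#include <cmath>

float L2PoolWindow(const float* values, int count, float activation_min,
                   float activation_max) {
  float sum_squares = 0.0f;
  for (int i = 0; i < count; ++i) sum_squares += values[i] * values[i];
  const float result = std::sqrt(sum_squares / count);  // RMS of the window
  return std::min(activation_max, std::max(activation_min, result));
}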
 
 inline void LocalResponseNormalization(const float* input_data,
@@ -5842,63 +5507,6 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
 }
 
 template <typename T>
-void GenericBroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
-                         const T* input2_data, const Dims<4>& input2_dims,
-                         T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("GenericBroadcastSub");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-void Sub(const T* input1_data, const Dims<4>& input1_dims,
-         const T* input2_data, const Dims<4>& input2_dims, T* output_data,
-         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Sub");
-
-  auto input1_map = MapAsVector(input1_data, input1_dims);
-  auto input2_map = MapAsVector(input2_data, input2_dims);
-  auto output_map = MapAsVector(output_data, output_dims);
-  if (AreSameDims(input1_dims, input2_dims)) {
-    output_map.array() = input1_map.array() - input2_map.array();
-  } else if (FlatSize(input1_dims) == 1) {
-    auto scalar = input1_data[0];
-    output_map.array() = scalar - input2_map.array();
-  } else if (FlatSize(input2_dims) == 1) {
-    auto scalar = input2_data[0];
-    output_map.array() = input1_map.array() - scalar;
-  } else {
-    GenericBroadcastSub(input1_data, input1_dims, input2_data, input2_dims,
-                        output_data, output_dims);
-  }
-}
-
-template <typename T>
 void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
                        const Dims<4>& output_dims) {