Diffstat (limited to 'tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h')
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h  476
1 files changed, 346 insertions, 130 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index b870789772..2d172315da 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -47,6 +47,7 @@ using reference_ops::BroadcastGreater;
using reference_ops::BroadcastGreaterEqual;
using reference_ops::BroadcastLess;
using reference_ops::BroadcastLessEqual;
+using reference_ops::BroadcastMul4DSlow;
using reference_ops::BroadcastSub4DSlow;
using reference_ops::Concatenation;
using reference_ops::DepthConcatenation;
@@ -75,6 +76,11 @@ using reference_ops::Transpose;
// Used mainly to convert from old-style shifts (right) to new-style (left).
static constexpr int kReverseShift = -1;
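A minimal sketch of how this constant is used (not part of the diff): legacy entry points in this file pass a positive right shift, while the *SmallerThanOneExp helpers used below expect a left-shift exponent, so legacy call sites negate the value when forwarding.

  // Hypothetical example: an old-style output_shift of 4 becomes -4.
  const int new_style_shift = kReverseShift * output_shift;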
+inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) {
+ return RuntimeShape(
+ {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
+}
+
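A minimal illustration of the conversion above (values invented): Dims<4> stores its extents fastest-varying first, so DimsToShape reverses them into the batch-major order used by RuntimeShape.

  tflite::Dims<4> dims;
  dims.sizes[0] = 8;   // depth (smallest stride)
  dims.sizes[1] = 16;  // width
  dims.sizes[2] = 12;  // height
  dims.sizes[3] = 2;   // batch
  const RuntimeShape shape = DimsToShape(dims);
  // shape is {2, 12, 16, 8}; shape.Dims(0) is the batch, shape.Dims(3) the depth.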
// Make a local VectorMap typedef allowing to map a float array
// as a Eigen vector expression. The std::conditional here is to
// construct the suitable Eigen type for the constness of the
@@ -1978,12 +1984,12 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
int32 input_offset, const uint8* filter_data,
const Dims<4>& filter_dims, int32 filter_offset,
const int32* bias_data, const Dims<4>& bias_dims,
- int stride_width, int stride_height, int pad_width,
- int pad_height, int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims, uint8* im2col_data,
- const Dims<4>& im2col_dims,
+ int stride_width, int stride_height, int dilation_width_factor,
+ int dilation_height_factor, int pad_width, int pad_height,
+ int32 output_offset, int32 output_multiplier, int output_shift,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims,
+ uint8* im2col_data, const Dims<4>& im2col_dims,
gemmlowp::GemmContext* gemm_context) {
gemmlowp::ScopedProfilingLabel label("Conv/8bit");
@@ -1995,9 +2001,22 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
const Dims<4>* gemm_input_dims = nullptr;
const int filter_width = ArraySize(filter_dims, 1);
const int filter_height = ArraySize(filter_dims, 2);
+ const bool need_dilated_im2col =
+ dilation_width_factor != 1 || dilation_height_factor != 1;
const bool need_im2col = stride_width != 1 || stride_height != 1 ||
filter_width != 1 || filter_height != 1;
- if (need_im2col) {
+ if (need_dilated_im2col) {
+ TFLITE_DCHECK(im2col_data);
+ const int input_zero_point = -input_offset;
+ TFLITE_DCHECK_GE(input_zero_point, 0);
+ TFLITE_DCHECK_LE(input_zero_point, 255);
+ DilatedIm2col(input_data, input_dims, filter_dims, stride_width,
+ stride_height, dilation_width_factor, dilation_height_factor,
+ pad_width, pad_height, output_dims, input_zero_point,
+ im2col_data);
+ gemm_input_data = im2col_data;
+ gemm_input_dims = &im2col_dims;
+ } else if (need_im2col) {
TFLITE_DCHECK(im2col_data);
const int input_zero_point = -input_offset;
TFLITE_DCHECK_GE(input_zero_point, 0);
@@ -2053,6 +2072,24 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
input_offset, output_pipeline);
}
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims, uint8* im2col_data,
+ const Dims<4>& im2col_dims,
+ gemmlowp::GemmContext* gemm_context) {
+ Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+ filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1,
+ pad_width, pad_height, output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max, output_data, output_dims,
+ im2col_data, im2col_dims, gemm_context);
+}
+
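For reference, the new dilation parameters follow standard convolution arithmetic (this sketch is illustrative, not code from this header); the non-dilated overload above simply forwards with both factors set to 1.

  // A dilation factor d spreads the filter taps apart without adding taps.
  inline int EffectiveFilterSize(int filter_size, int dilation_factor) {
    return dilation_factor * (filter_size - 1) + 1;
  }
  inline int ConvOutputSize(int input_size, int filter_size, int stride,
                            int pad, int dilation_factor) {
    const int effective = EffectiveFilterSize(filter_size, dilation_factor);
    return (input_size + 2 * pad - effective) / stride + 1;
  }
  // Example: a 3x3 filter with dilation 2 covers a 5x5 window, so a 32x32
  // input with stride 1 and no padding produces a 28x28 output.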
// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
@@ -2904,68 +2941,130 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
output_dims);
}
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
- int32 input1_offset, const uint8* input2_data,
- const Dims<4>& input2_dims, int32 input2_offset,
- int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
+// Element-wise mul that can often be used for inner loop of broadcast Mul as
+// well as the non-broadcast Mul.
+inline void MulElementwise(int size, const ArithmeticParams& params,
+ const uint8* input1_data, const uint8* input2_data,
+ uint8* output_data) {
+ for (int i = 0; i < size; ++i) {
+ const int32 input1_val = params.input1_offset + input1_data[i];
+ const int32 input2_val = params.input2_offset + input2_data[i];
+ const int32 unclamped_result =
+ params.output_offset +
+ MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
+ const int32 clamped_output =
+ std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, unclamped_result));
+ output_data[i] = static_cast<uint8>(clamped_output);
+ }
+}
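A worked example of the requantization step above, with invented numbers:

  // Suppose input1_scale * input2_scale / output_scale == 0.25. That real
  // multiplier is represented as output_multiplier = 1 << 30 (0.5 in Q31)
  // together with output_shift = -1. For offset-adjusted inputs 10 and 20:
  //   product                 : 10 * 20 = 200
  //   Q31 high multiply       : (200 * (1 << 30)) >> 31 = 100
  //   rounding right shift 1  : 50      // i.e. 200 * 0.25
  // The result is then offset by output_offset and clamped to
  // [quantized_activation_min, quantized_activation_max].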
- NdArrayDesc<4> desc1;
- NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+// Broadcast mul that can often be used for inner loop of broadcast Mul.
+inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
+ const uint8 broadcast_value,
+ const uint8* input2_data, uint8* output_data) {
+ const int32 input1_val = params.input1_offset + broadcast_value;
- // In Tensorflow, the dimensions are canonically named (batch_number, row,
- // col, channel), with extents (batches, height, width, depth), with the
- // trailing dimension changing most rapidly (channels has the smallest stride,
- // typically 1 element).
- //
- // In generated C code, we store arrays with the dimensions reversed. The
- // first dimension has smallest stride.
- //
- // We name our variables by their Tensorflow convention, but generate C code
- // nesting loops such that the innermost loop has the smallest stride for the
- // best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- const int32 input1_val =
- input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
- const int32 input2_val =
- input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
- const int32 unclamped_result =
- output_offset + MultiplyByQuantizedMultiplierSmallerThanOneExp(
- input1_val * input2_val, output_multiplier,
- kReverseShift * output_shift);
- const int32 clamped_output =
- std::min(output_activation_max,
- std::max(output_activation_min, unclamped_result));
- output_data[Offset(output_dims, c, x, y, b)] =
- static_cast<uint8>(clamped_output);
+ for (int i = 0; i < size; ++i) {
+ const int32 input2_val = params.input2_offset + input2_data[i];
+ const int32 unclamped_result =
+ params.output_offset +
+ MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
+ const int32 clamped_output =
+ std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, unclamped_result));
+ output_data[i] = static_cast<uint8>(clamped_output);
+ }
+}
+
+inline void Mul(const ArithmeticParams& params,
+ const RuntimeShape& input1_shape, const uint8* input1_data,
+ const RuntimeShape& input2_shape, const uint8* input2_data,
+ const RuntimeShape& output_shape, uint8* output_data) {
+ TFLITE_DCHECK_LE(params.quantized_activation_min,
+ params.quantized_activation_max);
+ gemmlowp::ScopedProfilingLabel label("Mul/8bit");
+ const int flat_size =
+ MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+ MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
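A hedged sketch of how a caller might fill ArithmeticParams for the uint8 Mul above. The zero points, scales, shapes and buffers are hypothetical locals; QuantizeMultiplierSmallerThanOneExp is the quantization_util.h helper, but treat the exact wiring as an assumption rather than the kernel's actual setup code.

  tflite::ArithmeticParams params;
  params.input1_offset = -input1_zero_point;  // offsets are negated zero points
  params.input2_offset = -input2_zero_point;
  params.output_offset = output_zero_point;
  const double real_multiplier = input1_scale * input2_scale / output_scale;
  QuantizeMultiplierSmallerThanOneExp(real_multiplier, &params.output_multiplier,
                                      &params.output_shift);
  params.quantized_activation_min = 0;    // no fused activation
  params.quantized_activation_max = 255;
  Mul(params, input1_shape, input1_data, input2_shape, input2_data,
      output_shape, output_data);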
+inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
+ const RuntimeShape& unswitched_input1_shape,
+ const uint8* unswitched_input1_data,
+ const RuntimeShape& unswitched_input2_shape,
+ const uint8* unswitched_input2_data,
+ const RuntimeShape& output_shape,
+ uint8* output_data) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastMulFivefold/8bit");
+
+ ArithmeticParams switched_params = unswitched_params;
+ switched_params.input1_offset = unswitched_params.input2_offset;
+ switched_params.input2_offset = unswitched_params.input1_offset;
+
+ const bool use_unswitched =
+ unswitched_params.broadcast_category ==
+ tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+ const ArithmeticParams& params =
+ use_unswitched ? unswitched_params : switched_params;
+ const uint8* input1_data =
+ use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+ const uint8* input2_data =
+ use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+ // Fivefold nested loops. The second input resets its position for each
+ // iteration of the second loop. The first input resets its position at the
+ // beginning of the fourth loop. The innermost loop is an elementwise Mul of
+ // sections of the arrays.
+ uint8* output_data_ptr = output_data;
+ const uint8* input1_data_ptr = input1_data;
+ const uint8* input2_data_reset = input2_data;
+ int y0 = params.broadcast_shape[0];
+ int y1 = params.broadcast_shape[1];
+ int y2 = params.broadcast_shape[2];
+ int y3 = params.broadcast_shape[3];
+ int y4 = params.broadcast_shape[4];
+ if (y4 > 1) {
+ for (int i0 = 0; i0 < y0; ++i0) {
+ const uint8* input2_data_ptr;
+ for (int i1 = 0; i1 < y1; ++i1) {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2) {
+ for (int i3 = 0; i3 < y3; ++i3) {
+ MulElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+ output_data_ptr);
+ input2_data_ptr += y4;
+ output_data_ptr += y4;
+ }
+ input1_data_ptr += y4;
}
}
+ input2_data_reset = input2_data_ptr;
+ }
+ } else {
+ for (int i0 = 0; i0 < y0; ++i0) {
+ const uint8* input2_data_ptr;
+ for (int i1 = 0; i1 < y1; ++i1) {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2) {
+ MulSimpleBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
+ output_data_ptr);
+ input2_data_ptr += y3;
+ output_data_ptr += y3;
+ ++input1_data_ptr;
+ }
+ }
+ input2_data_reset = input2_data_ptr;
}
}
}
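For context, a hypothetical dispatch in the calling Mul kernel (not shown in this diff): the fivefold path is only valid when the operands were classified into one of the "fast" broadcast categories; otherwise a general 4D broadcast is needed. The enumerator names mirror BroadcastableOpCategory in types.h, but the surrounding code is illustrative.

  switch (params.broadcast_category) {
    case tflite::BroadcastableOpCategory::kNonBroadcast:
      Mul(params, input1_shape, input1_data, input2_shape, input2_data,
          output_shape, output_data);
      break;
    case tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast:
    case tflite::BroadcastableOpCategory::kSecondInputBroadcastsFast:
      BroadcastMulFivefold(params, input1_shape, input1_data, input2_shape,
                           input2_data, output_shape, output_data);
      break;
    default:
      // Fall back to the general but slower reference_ops::BroadcastMul4DSlow.
      break;
  }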
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
- int32 input1_offset, const uint8* input2_data,
- const Dims<4>& input2_dims, int32 input2_offset,
- int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
- input2_dims, input2_offset, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_data, output_dims);
-}
-
// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
@@ -5383,31 +5482,53 @@ void TypedMemset(void* ptr, T value, size_t num) {
}
}
-template <typename T>
-inline void PadV2(const T* input_data, const Dims<4>& input_dims,
- const std::vector<int>& left_paddings,
- const std::vector<int>& right_paddings, T* output_data,
- const Dims<4>& output_dims, const T pad_value) {
+// There are two versions of pad: Pad and PadV2. In PadV2 there is a second
+// scalar input that provides the padding value. Therefore pad_value_ptr can be
+// equivalent to a simple input1_data. For Pad, it should point to a zero
+// value.
+//
+// Note that two typenames are required, so that T=P=int32 is considered a
+// specialization distinct from P=int32.
+template <typename T, typename P>
+inline void PadImpl(const tflite::PadParams& op_params,
+ const RuntimeShape& input_shape, const T* input_data,
+ const P* pad_value_ptr, const RuntimeShape& output_shape,
+ T* output_data) {
gemmlowp::ScopedProfilingLabel label("Pad");
- TFLITE_DCHECK_EQ(left_paddings.size(), 4);
- TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+ RuntimeShape ext_input_shape = RuntimeShape::ExtendedShape(4, input_shape);
+ RuntimeShape ext_output_shape = RuntimeShape::ExtendedShape(4, output_shape);
+ TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
+ TFLITE_DCHECK_LE(op_params.right_padding_count, 4);
+
+ // Runtime calls are currently fixed at 4 dimensions. Copy inputs so
+ // we can pad them to 4 dims (yes, we are "padding the padding").
+ std::vector<int> left_padding_copy(4, 0);
+ for (int i = 0; i < op_params.left_padding_count; ++i) {
+ left_padding_copy[i] = op_params.left_padding[i];
+ }
+ std::vector<int> right_padding_copy(4, 0);
+ for (int i = 0; i < op_params.right_padding_count; ++i) {
+ right_padding_copy[i] = op_params.right_padding[i];
+ }
- const int output_batch = ArraySize(output_dims, 3);
- const int output_height = ArraySize(output_dims, 2);
- const int output_width = ArraySize(output_dims, 1);
- const int output_depth = ArraySize(output_dims, 0);
+ const int output_batch = ext_output_shape.Dims(0);
+ const int output_height = ext_output_shape.Dims(1);
+ const int output_width = ext_output_shape.Dims(2);
+ const int output_depth = ext_output_shape.Dims(3);
- const int left_b_padding = left_paddings[3];
- const int left_h_padding = left_paddings[2];
- const int left_w_padding = left_paddings[1];
- const int left_d_padding = left_paddings[0];
+ const int left_b_padding = left_padding_copy[0];
+ const int left_h_padding = left_padding_copy[1];
+ const int left_w_padding = left_padding_copy[2];
+ const int left_d_padding = left_padding_copy[3];
- const int right_b_padding = right_paddings[3];
- const int right_h_padding = right_paddings[2];
- const int right_w_padding = right_paddings[1];
- const int right_d_padding = right_paddings[0];
+ const int right_b_padding = right_padding_copy[0];
+ const int right_h_padding = right_padding_copy[1];
+ const int right_w_padding = right_padding_copy[2];
+ const int right_d_padding = right_padding_copy[3];
- const int input_depth = ArraySize(input_dims, 0);
+ const int input_depth = ext_input_shape.Dims(3);
+ // const T pad_value = ExtractFloatOrInt<T>(op_params.pad_value);
+ const T pad_value = *pad_value_ptr;
if (left_b_padding != 0) {
TypedMemset<T>(
@@ -5417,61 +5538,113 @@ inline void PadV2(const T* input_data, const Dims<4>& input_dims,
for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
++out_b) {
if (left_h_padding != 0) {
- TypedMemset<T>(output_data + Offset(output_dims, 0, 0, 0, out_b),
+ TypedMemset<T>(output_data + Offset(ext_output_shape, out_b, 0, 0, 0),
pad_value, left_h_padding * output_width * output_depth);
}
for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
++out_h) {
if (left_w_padding != 0) {
- TypedMemset<T>(output_data + Offset(output_dims, 0, 0, out_h, out_b),
- pad_value, left_w_padding * output_depth);
+ TypedMemset<T>(
+ output_data + Offset(ext_output_shape, out_b, out_h, 0, 0),
+ pad_value, left_w_padding * output_depth);
}
for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
++out_w) {
if (left_d_padding != 0) {
TypedMemset<T>(
- output_data + Offset(output_dims, 0, out_w, out_h, out_b),
+ output_data + Offset(ext_output_shape, out_b, out_h, out_w, 0),
pad_value, left_d_padding);
}
T* out = output_data +
- Offset(output_dims, left_d_padding, out_w, out_h, out_b);
- const T* in =
- input_data + Offset(input_dims, 0, out_w - left_w_padding,
- out_h - left_h_padding, out_b - left_b_padding);
+ Offset(ext_output_shape, out_b, out_h, out_w, left_d_padding);
+ const T* in = input_data +
+ Offset(ext_input_shape, out_b - left_b_padding,
+ out_h - left_h_padding, out_w - left_w_padding, 0);
memcpy(out, in, input_depth * sizeof(T));
if (right_d_padding != 0) {
TypedMemset<T>(
- output_data + Offset(output_dims, output_depth - right_d_padding,
- out_w, out_h, out_b),
+ output_data + Offset(ext_output_shape, out_b, out_h, out_w,
+ output_depth - right_d_padding),
pad_value, right_d_padding);
}
}
if (right_w_padding != 0) {
- TypedMemset<T>(
- output_data + Offset(output_dims, 0, output_width - right_w_padding,
- out_h, out_b),
- pad_value, right_w_padding * output_depth);
+ TypedMemset<T>(output_data + Offset(ext_output_shape, out_b, out_h,
+ output_width - right_w_padding, 0),
+ pad_value, right_w_padding * output_depth);
}
}
if (right_h_padding != 0) {
TypedMemset<T>(
- output_data +
- Offset(output_dims, 0, 0, output_height - right_h_padding, out_b),
+ output_data + Offset(ext_output_shape, out_b,
+ output_height - right_h_padding, 0, 0),
pad_value, right_h_padding * output_width * output_depth);
}
}
if (right_b_padding != 0) {
TypedMemset<T>(
output_data +
- Offset(output_dims, 0, 0, 0, output_batch - right_b_padding),
+ Offset(ext_output_shape, output_batch - right_b_padding, 0, 0, 0),
pad_value,
right_b_padding * output_height * output_width * output_depth);
}
}
-// Legacy Pad() method that casts an int32_t to T before padding.
+template <typename T, typename P>
+inline void Pad(const tflite::PadParams& op_params,
+ const RuntimeShape& input_shape, const T* input_data,
+ const P* pad_value_ptr, const RuntimeShape& output_shape,
+ T* output_data) {
+ PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+ output_data);
+}
+
+// The second (pad-value) input can be int32 when, say, the first is uint8.
+template <typename T>
+inline void Pad(const tflite::PadParams& op_params,
+ const RuntimeShape& input_shape, const T* input_data,
+ const int32* pad_value_ptr, const RuntimeShape& output_shape,
+ T* output_data) {
+ const T converted_pad_value = static_cast<T>(*pad_value_ptr);
+ PadImpl(op_params, input_shape, input_data, &converted_pad_value,
+ output_shape, output_data);
+}
+
+// This version avoids conflicting template matching.
+template <>
+inline void Pad(const tflite::PadParams& op_params,
+ const RuntimeShape& input_shape, const int32* input_data,
+ const int32* pad_value_ptr, const RuntimeShape& output_shape,
+ int32* output_data) {
+ PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+ output_data);
+}
+
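A minimal usage sketch for the PadParams-based entry point above (shapes, buffers and values are invented): pad one pixel of zeros around the height and width of a uint8 NHWC tensor.

  tflite::PadParams op_params;
  op_params.left_padding_count = 4;
  op_params.right_padding_count = 4;
  const int left[4] = {0, 1, 1, 0};   // batch, height, width, depth
  const int right[4] = {0, 1, 1, 0};
  for (int i = 0; i < 4; ++i) {
    op_params.left_padding[i] = left[i];
    op_params.right_padding[i] = right[i];
  }
  const uint8 pad_value = 0;
  Pad(op_params, RuntimeShape({1, 4, 4, 3}), input_data, &pad_value,
      RuntimeShape({1, 6, 6, 3}), output_data);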
+// Legacy signature, function covered both Pad and PadV2.
+template <typename T>
+inline void PadV2(const T* input_data, const Dims<4>& input_dims,
+ const std::vector<int>& left_paddings,
+ const std::vector<int>& right_paddings, T* output_data,
+ const Dims<4>& output_dims, const T pad_value) {
+ TFLITE_DCHECK_EQ(left_paddings.size(), 4);
+ TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+ tflite::PadParams op_params;
+ op_params.left_padding_count = 4;
+ op_params.right_padding_count = 4;
+ for (int i = 0; i < 4; ++i) {
+ op_params.left_padding[i] = left_paddings[3 - i];
+ op_params.right_padding[i] = right_paddings[3 - i];
+ }
+ // SetFloatOrInt(pad_value, &op_params.pad_value);
+ const T pad_value_copy = pad_value;
+
+ Pad(op_params, DimsToShape(input_dims), input_data, &pad_value_copy,
+ DimsToShape(output_dims), output_data);
+}
+
+// Old Pad that calls legacy PadV2.
template <typename T>
inline void Pad(const T* input_data, const Dims<4>& input_dims,
const std::vector<int>& left_paddings,
@@ -5482,34 +5655,45 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
output_dims, converted_pad_value);
}
+// Old Pad that only padded with 0.
template <typename T>
inline void Pad(const T* input_data, const Dims<4>& input_dims,
const std::vector<int>& left_paddings,
const std::vector<int>& right_paddings, T* output_data,
const Dims<4>& output_dims) {
- Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
- output_dims, 0);
+ const T pad_value = static_cast<T>(0);
+ PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data,
+ output_dims, pad_value);
}
template <typename T>
-inline void Slice(const T* input_data, const Dims<4>& input_dims,
- const std::vector<int>& begin, const std::vector<int>& size,
- T* output_data, const Dims<4>& output_dims) {
- // TODO(dkalenichenko): This op only supports 4D tensors.
- TFLITE_DCHECK_EQ(begin.size(), 4);
- TFLITE_DCHECK_EQ(size.size(), 4);
- const int start_b = begin[3];
- const int stop_b =
- size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
- const int start_h = begin[2];
- const int stop_h =
- size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
- const int start_w = begin[1];
- const int stop_w =
- size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
- const int start_d = begin[0];
- const int stop_d =
- size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
+inline void Slice(const tflite::SliceParams& op_params,
+ const RuntimeShape& input_shape, const T* input_data,
+ const RuntimeShape& output_shape, T* output_data) {
+ gemmlowp::ScopedProfilingLabel label("Slice");
+ RuntimeShape ext_shape = RuntimeShape::ExtendedShape(4, input_shape);
+ // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
+ TFLITE_DCHECK_LE(op_params.begin_count, 4);
+ TFLITE_DCHECK_LE(op_params.size_count, 4);
+ const int begin_count = op_params.begin_count;
+ const int size_count = op_params.size_count;
+ // We front-pad the begin and size vectors.
+ const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
+ const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1)
+ ? ext_shape.Dims(0) - start_b
+ : start_b + op_params.size[0];
+ const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
+ const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
+ ? ext_shape.Dims(1) - start_h
+ : start_h + op_params.size[size_count - 3];
+ const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
+ const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
+ ? ext_shape.Dims(2) - start_w
+ : start_w + op_params.size[size_count - 2];
+ const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
+ const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
+ ? ext_shape.Dims(3) - start_d
+ : start_d + op_params.size[size_count - 1];
T* out_ptr = output_data;
for (int in_b = start_b; in_b < stop_b; ++in_b) {
@@ -5517,7 +5701,7 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
for (int in_w = start_w; in_w < stop_w; ++in_w) {
const int len = stop_d - start_d;
memcpy(out_ptr,
- input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
+ input_data + Offset(ext_shape, in_b, in_h, in_w, start_d),
len * sizeof(T));
out_ptr += len;
}
@@ -5526,28 +5710,60 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
}
template <typename T>
-void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
- const T* input2_data, T* output_data,
- const Dims<4>& output_dims) {
+inline void Slice(const T* input_data, const Dims<4>& input_dims,
+ const std::vector<int>& begin, const std::vector<int>& size,
+ T* output_data, const Dims<4>& output_dims) {
+ tflite::SliceParams op_params;
+ op_params.begin_count = 4;
+ op_params.size_count = 4;
+ for (int i = 0; i < 4; ++i) {
+ op_params.begin[i] = begin[3 - i];
+ op_params.size[i] = size[3 - i];
+ }
+
+ Slice(op_params, DimsToShape(input_dims), input_data,
+ DimsToShape(output_dims), output_data);
+}
+
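A worked example of the front-padding behavior in the new Slice above (shapes, buffers and values are invented): begin/size entries align with the trailing dimensions of the input, missing leading entries default to taking the whole dimension, and a size of -1 with a begin of 0 takes everything in that dimension.

  tflite::SliceParams op_params;
  op_params.begin_count = 3;
  op_params.size_count = 3;
  op_params.begin[0] = 1;  op_params.size[0] = 1;   // dim 0: element 1 only
  op_params.begin[1] = 0;  op_params.size[1] = -1;  // dim 1: everything
  op_params.begin[2] = 0;  op_params.size[2] = 2;   // dim 2: first two entries
  // Internally the shape is front-extended to {1, 2, 3, 4}; the output shape
  // is {1, 3, 2}.
  Slice(op_params, RuntimeShape({2, 3, 4}), input_data,
        RuntimeShape({1, 3, 2}), output_data);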
+template <typename T>
+void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
+ const T* input2_data, const RuntimeShape& output_shape,
+ T* output_data) {
gemmlowp::ScopedProfilingLabel label("TensorFlowMinimum");
- auto input1_map = MapAsVector(input1_data, input1_dims);
- auto output_map = MapAsVector(output_data, output_dims);
+ auto input1_map = MapAsVector(input1_data, input1_shape);
+ auto output_map = MapAsVector(output_data, output_shape);
auto min_value = input2_data[0];
output_map.array() = input1_map.array().min(min_value);
}
template <typename T>
-void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
- const T* input2_data, T* output_data,
- const Dims<4>& output_dims) {
+void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
+ const T* input2_data, const RuntimeShape& output_shape,
+ T* output_data) {
gemmlowp::ScopedProfilingLabel label("TensorFlowMaximum");
- auto input1_map = MapAsVector(input1_data, input1_dims);
- auto output_map = MapAsVector(output_data, output_dims);
+ auto input1_map = MapAsVector(input1_data, input1_shape);
+ auto output_map = MapAsVector(output_data, output_shape);
auto max_value = input2_data[0];
output_map.array() = input1_map.array().max(max_value);
}
template <typename T>
+void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, T* output_data,
+ const Dims<4>& output_dims) {
+ Minimum(DimsToShape(input1_dims), input1_data, input2_data,
+ DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, T* output_data,
+ const Dims<4>& output_dims) {
+ Maximum(DimsToShape(input1_dims), input1_data, input2_data,
+ DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
void TransposeIm2col(const T* input_data, const Dims<4>& input_dims,
const Dims<4>& filter_dims, int stride_width,
int stride_height, int pad_width, int pad_height,