diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2018-09-17 08:21:43 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-09-17 08:26:16 -0700 |
commit | e0d6830999a6e7c92f047e6e89c3aba20911cc8c (patch) | |
tree | 09ac1884b657de19a4dc4b2f796b8682ca803003 | |
parent | 055e5a0f71c83bab3f645d1c2e2cadeff5ff654f (diff) |
Convert more kernel signatures to use runtime shapes.
PiperOrigin-RevId: 213275003
4 files changed, 206 insertions, 94 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h index 70810ca784..f2d1319801 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h @@ -907,25 +907,40 @@ inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, } } -inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int depth_multiplier, - float output_activation_min, - float output_activation_max, float* output_data, - const Dims<4>& output_dims) { +inline void DepthwiseConv( + const DepthwiseParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& bias_shape, + const float* bias_data, const RuntimeShape& output_shape, + float* output_data) { gemmlowp::ScopedProfilingLabel label("DepthwiseConv"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int input_depth = ArraySize(input_dims, 0); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - TFLITE_DCHECK(output_depth == input_depth * depth_multiplier); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + // TODO(suharshs): Optimized implementation of dilation depthwise conv need to + // be implemented. + TFLITE_DCHECK_EQ(params.dilation_width_factor, 1); + TFLITE_DCHECK_EQ(params.dilation_height_factor, 1); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); static const int kAccBufferMaxSize = 2048; float acc_buffer[kAccBufferMaxSize]; @@ -990,6 +1005,10 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, row_accum_func = FloatDepthwiseConvAccumRowGeneric; } + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + // Now that we have determined row_accum_func, we can start work. float* output_ptr = output_data; for (int b = 0; b < batches; ++b) { @@ -1014,13 +1033,12 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { const int in_y = in_y_origin + filter_y; - row_accum_func(stride_width, input_depth, input_width, - input_data + in_y * input_dims.strides[2] + - b * input_dims.strides[3], - pad_width, depth_multiplier, filter_width, - filter_data + filter_y * filter_dims.strides[2], - out_x_buffer_start, out_x_buffer_end, output_depth, - acc_buffer); + row_accum_func( + stride_width, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); } // Finished accumulating. Now store to destination. const int num_output_values = output_depth * num_output_pixels; @@ -1067,6 +1085,8 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, } } +// TODO(b/80418076): Move to legacy ops file, update invocations. +// Legacy. inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, const float* filter_data, const Dims<4>& filter_dims, const float* bias_data, const Dims<4>& bias_dims, @@ -1078,15 +1098,43 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, const Dims<4>& output_dims) { // TODO(suharshs): Optimized implementation of dilation depthwise conv need to // be implemented. - TFLITE_DCHECK(dilation_width_factor == 1); - TFLITE_DCHECK(dilation_height_factor == 1); + TFLITE_DCHECK_EQ(dilation_width_factor, 1); + TFLITE_DCHECK_EQ(dilation_height_factor, 1); + tflite::DepthwiseParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + op_params.depth_multiplier = depth_multiplier; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + + DepthwiseConv(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), + bias_data, DimsToShape(output_dims), output_data); +} + +// TODO(b/80418076): Move to legacy ops file, update invocations. +inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int depth_multiplier, + float output_activation_min, + float output_activation_max, float* output_data, + const Dims<4>& output_dims) { DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data, - bias_dims, stride_width, stride_height, pad_width, pad_height, - depth_multiplier, output_activation_min, output_activation_max, - output_data, output_dims); + bias_dims, stride_width, stride_height, 1, 1, pad_width, + pad_height, depth_multiplier, output_activation_min, + output_activation_max, output_data, output_dims); } +// TODO(b/80418076): Move to legacy ops file, update invocations. // legacy, for compatibility with old checked-in code template <FusedActivationFunctionType Ac> void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, @@ -1103,6 +1151,7 @@ void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, output_data, output_dims); } +// TODO(b/80418076): Move to legacy ops file, update invocations. // legacy, for compatibility with old checked-in code template <FusedActivationFunctionType Ac> void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h index f707279600..ccb9d1654f 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h @@ -1669,33 +1669,50 @@ inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, } } -inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int depth_multiplier, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { +inline void DepthwiseConv( + const DepthwiseParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + uint8* output_data) { gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit"); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + const int32 input_offset = params.input_offset; + const int32 filter_offset = params.weights_offset; + const int32 output_offset = params.output_offset; + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + // TODO(suharshs): Optimized implementation of dilation depthwise conv need to + // be implemented. + TFLITE_DCHECK_EQ(params.dilation_width_factor, 1); + TFLITE_DCHECK_EQ(params.dilation_height_factor, 1); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int input_depth = ArraySize(input_dims, 0); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); #ifdef USE_NEON const bool shift_left = (output_shift <= 0); const int32 multiplier_power_of_two = shift_left ? (1 << -output_shift) : 1; #endif - TFLITE_DCHECK(output_depth == input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on // Jetson TX-2. This compiler does not support the offsetof() macro. @@ -1703,14 +1720,11 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, // Call kernel optimized for depthwise convolutions using 3x3 filters if // parameters are supported. if (Fast3x3FilterKernelSupported( - input_dims, filter_dims, stride_width, stride_height, pad_width, - pad_height, depth_multiplier, output_dims, output_shift)) { - DepthwiseConv3x3Filter(input_data, input_dims, input_offset, filter_data, - filter_dims, filter_offset, bias_data, bias_dims, - stride_width, stride_height, pad_width, pad_height, - depth_multiplier, output_offset, output_multiplier, - output_shift, output_activation_min, - output_activation_max, output_data, output_dims); + input_shape, filter_shape, stride_width, stride_height, pad_width, + pad_height, depth_multiplier, output_shape, output_shift)) { + DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, + output_data); return; } #endif @@ -1785,6 +1799,10 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, #undef TFMINI_USE_DEPTHWISECONV_KERNEL + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + // Now that we have determined row_accum_func, we can start work. uint8* output_ptr = output_data; for (int b = 0; b < batches; ++b) { @@ -1811,10 +1829,9 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, const int in_y = in_y_origin + filter_y; row_accum_func( stride_width, input_depth, input_width, - input_data + in_y * input_dims.strides[2] + - b * input_dims.strides[3], + input_data + in_y * input_height_stride + b * input_batch_stride, input_offset, pad_width, depth_multiplier, filter_width, - filter_data + filter_y * filter_dims.strides[2], filter_offset, + filter_data + filter_y * filter_height_stride, filter_offset, out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); } // Finished accumulating int32 values. Now need to convert them to @@ -1964,6 +1981,8 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, } } +// TODO(b/80418076): Move to legacy ops file, update invocations. +// Legacy. inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, int32 input_offset, const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset, @@ -1975,19 +1994,48 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, int output_shift, int32 output_activation_min, int32 output_activation_max, uint8* output_data, const Dims<4>& output_dims) { - // TODO(suharshs): Optimized implementation of dilation depthwise is not - // supported yet. - TFLITE_DCHECK(dilation_width_factor == 1); - TFLITE_DCHECK(dilation_height_factor == 1); + tflite::DepthwiseParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + op_params.depth_multiplier = depth_multiplier; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = output_shift; + + DepthwiseConv(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), + bias_data, DimsToShape(output_dims), output_data); +} +inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int depth_multiplier, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims, filter_offset, bias_data, bias_dims, stride_width, - stride_height, pad_width, pad_height, depth_multiplier, + stride_height, 1, 1, pad_width, pad_height, depth_multiplier, output_offset, output_multiplier, output_shift, output_activation_min, output_activation_max, output_data, output_dims); } +// TODO(b/80418076): Move to legacy ops file, update invocations. // Legacy, for compatibility with old checked-in code. template <FusedActivationFunctionType Ac> void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, @@ -2011,6 +2059,7 @@ void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, output_dims); } +// TODO(b/80418076): Move to legacy ops file, update invocations. // Legacy, for compatibility with old checked-in code. template <FusedActivationFunctionType Ac> void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h index 0ce64f8c70..9fed53cafb 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h @@ -3175,16 +3175,17 @@ inline void DepthwiseConvHandlePadding(const uint8* input_data, } inline bool Fast3x3FilterKernelSupported( - const Dims<4>& input_dims, const Dims<4>& filter_dims, int32 stride_width, - int32 stride_height, int32 pad_width, int32 pad_height, - int32 depth_multiplier, const Dims<4>& output_dims, int32 output_shift) { - const int32 input_height = ArraySize(input_dims, 2); - const int32 input_width = ArraySize(input_dims, 1); - const int32 input_depth = ArraySize(input_dims, 0); - const int32 filter_height = ArraySize(filter_dims, 2); - const int32 filter_width = ArraySize(filter_dims, 1); - const int32 output_height = ArraySize(output_dims, 2); - const int32 output_width = ArraySize(output_dims, 1); + const RuntimeShape& input_shape, const RuntimeShape& filter_shape, + int32 stride_width, int32 stride_height, int32 pad_width, int32 pad_height, + int32 depth_multiplier, const RuntimeShape& output_shape, + int32 output_shift) { + const int32 input_height = input_shape.Dims(1); + const int32 input_width = input_shape.Dims(2); + const int32 input_depth = input_shape.Dims(3); + const int32 filter_height = filter_shape.Dims(1); + const int32 filter_width = filter_shape.Dims(2); + const int32 output_height = output_shape.Dims(1); + const int32 output_width = output_shape.Dims(2); bool supported = filter_width == 3 && filter_height == 3 && depth_multiplier == 1 && @@ -3234,26 +3235,37 @@ inline bool Fast3x3FilterKernelSupported( } inline void DepthwiseConv3x3Filter( - const uint8* input_data, const Dims<4>& input_dims, int32 input_offset, - const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, int32 stride_width, - int32 stride_height, int32 pad_width, int32 pad_height, - int32 depth_multiplier, int32 output_offset, int32 output_multiplier, - int32 output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { + const DepthwiseParams& rt_params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + uint8* output_data) { gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); DepthwiseConvParams params; - params.input_depth = ArraySize(input_dims, 0); - params.input_width = ArraySize(input_dims, 1); - params.input_height = ArraySize(input_dims, 2); + + const int32 stride_width = rt_params.stride_width; + const int32 stride_height = rt_params.stride_height; + const int32 pad_width = rt_params.padding_values.width; + const int32 pad_height = rt_params.padding_values.height; + const int32 depth_multiplier = rt_params.depth_multiplier; + const int32 output_activation_min = rt_params.quantized_activation_min; + const int32 output_activation_max = rt_params.quantized_activation_max; + const int32 input_offset = rt_params.input_offset; + const int32 filter_offset = rt_params.weights_offset; + const int32 output_offset = rt_params.output_offset; + const int32 output_multiplier = rt_params.output_multiplier; + const int32 output_shift = rt_params.output_shift; + + params.input_depth = input_shape.Dims(3); + params.input_width = input_shape.Dims(2); + params.input_height = input_shape.Dims(1); params.input_row_size = params.input_depth * params.input_width; params.input_offset = input_offset; params.stride_width = stride_width; params.stride_height = stride_height; - params.output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - params.output_width = ArraySize(output_dims, 1); - params.output_height = ArraySize(output_dims, 2); + params.output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + params.output_width = output_shape.Dims(2); + params.output_height = output_shape.Dims(1); params.output_row_size = params.output_depth * params.output_width; params.output_offset = output_offset; params.filter_offset = filter_offset; @@ -3262,8 +3274,8 @@ inline void DepthwiseConv3x3Filter( params.output_activation_min = output_activation_min; params.output_activation_max = output_activation_max; - const int32 filter_height = ArraySize(filter_dims, 2); - const int32 filter_width = ArraySize(filter_dims, 1); + const int32 filter_height = filter_shape.Dims(1); + const int32 filter_width = filter_shape.Dims(2); params.filter_row_size = params.output_depth * filter_width; // Algorithm assumes below constraints. It is optimized for depth @@ -3279,7 +3291,7 @@ inline void DepthwiseConv3x3Filter( TFLITE_DCHECK(pad_width == 0 || pad_width == 1); TFLITE_DCHECK(pad_width == pad_height); - const int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int32 batches = MatchingDim(input_shape, 0, output_shape, 0); const int64_t input_batch_size = params.input_row_size * params.input_height; const int64_t output_batch_size = params.output_row_size * params.output_height; diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h index f6636acc58..ac4626bc30 100644 --- a/tensorflow/contrib/lite/kernels/internal/types.h +++ b/tensorflow/contrib/lite/kernels/internal/types.h @@ -772,6 +772,8 @@ struct DepthwiseParams { PaddingValues padding_values; int16 stride_width; int16 stride_height; + int16 dilation_width_factor; + int16 dilation_height_factor; int16 depth_multiplier; // uint8 inference params. // TODO(b/65838351): Use smaller types if appropriate. |