diff options
author | 2017-08-23 16:58:50 -0700 | |
---|---|---|
committer | 2017-08-23 17:02:42 -0700 | |
commit | 1f41602a82cb68fc7bc7e51cf9590a87ee5baf4d (patch) | |
tree | 329ba37a6761ae9506d94088b9c8d3c2e90d5803 /tensorflow/contrib/fused_conv/kernels | |
parent | 2272987f13be76105fcd24dd38cf768c2d4fec0d (diff) |
Add int8 version of fused_conv2d_bias_activation operator for the forward phase,
and support side_input and scaling parameters in float and int8 versions.
PiperOrigin-RevId: 166276461
Diffstat (limited to 'tensorflow/contrib/fused_conv/kernels')
3 files changed, 454 insertions, 311 deletions
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc index dc0701b234..fcdf03b596 100644 --- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc +++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc @@ -31,8 +31,8 @@ limitations under the License. #include "tensorflow/core/kernels/conv_2d.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/util/padding.h" -#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/use_cudnn.h" #if GOOGLE_CUDA @@ -40,38 +40,72 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/util/activation_mode.h" #endif // GOOGLE_CUDA + namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; -template <typename Device, typename T> -struct LaunchConvOp; +template <typename T> +struct RawType { + using type = T; +}; -template <typename Device, typename T> +template <> +struct RawType<qint8> { + using type = int8; +}; + +// T is the element type of the conv_input, filter and side_input tensors. +// BiasType is the element type of the bias tensor, which can be different. +// ScaleType is the type used for conv_input_scale, side_input_scale. +template <typename Device, typename T, typename BiasType, typename ScaleType> class FusedConv2DBiasActivationOp : public OpKernel { public: explicit FusedConv2DBiasActivationOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; - OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); - OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + string data_format_str, filter_format_str; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str)); + OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_), errors::InvalidArgument("Invalid data format")); + OP_REQUIRES_OK(context, + context->GetAttr("filter_format", &filter_format_str)); OP_REQUIRES(context, - (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW), - errors::InvalidArgument("Current implementation only supports " - "NHWC and NCHW data formats.")); - OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); - OP_REQUIRES(context, strides_.size() == 4, + FilterFormatFromString(filter_format_str, &filter_format_), + errors::InvalidArgument("Invalid filter format")); + + std::vector<int32> strides; + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides)); + OP_REQUIRES(context, strides.size() == 4, errors::InvalidArgument("Sliding window strides field must " "specify 4 dimensions")); + + stride_rows_ = GetTensorDim(strides, data_format_, 'H'); + stride_cols_ = GetTensorDim(strides, data_format_, 'W'); + OP_REQUIRES( + context, + (GetTensorDim(strides, data_format_, 'N') == 1 && + GetTensorDim(strides, data_format_, 'C') == 1), + errors::InvalidArgument("Convolutional strides are not supported in " + "the batch or depth dimensions.")); + + // Note: Only NCHW_VECT_C format is supported for int8. + // This is because it is expected to be the fastest, and our previous tests + // found cudnn 6 does not fully support the other formats for int8 mode. OP_REQUIRES( context, - (GetTensorDim(strides_, data_format_, 'N') == 1 && - GetTensorDim(strides_, data_format_, 'C') == 1), - errors::InvalidArgument("Current implementation does not yet support " - "strides in the batch and depth dimensions.")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + (std::is_same<T, qint8>::value == (data_format_ == FORMAT_NCHW_VECT_C)), + errors::InvalidArgument( + "qint8 should be used with data_format NCHW_VECT_C.")); + + OP_REQUIRES(context, + (std::is_same<T, qint8>::value == + (filter_format_ == FORMAT_OIHW_VECT_I)), + errors::InvalidArgument( + "qint8 should be used with filter_format OIHW_VECT_I.")); + + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_type_)); + eigen_padding_type_ = BrainPadding2EigenPadding(padding_type_); string activation_mode_str; OP_REQUIRES_OK(context, context->GetAttr("activation_mode", &activation_mode_str)); @@ -79,130 +113,111 @@ class FusedConv2DBiasActivationOp : public OpKernel { &activation_mode_)); OP_REQUIRES(context, activation_mode_ == ActivationMode::RELU, errors::InvalidArgument("Current implementation only supports " - "relu as the activation mode.")); + "RELU as the activation function.")); cudnn_use_autotune_ = CudnnUseAutotune(); + float conv_input_scale_flt, side_input_scale_flt; + OP_REQUIRES_OK(context, + context->GetAttr("conv_input_scale", &conv_input_scale_flt)); + OP_REQUIRES_OK(context, + context->GetAttr("side_input_scale", &side_input_scale_flt)); + conv_input_scale_ = conv_input_scale_flt; + side_input_scale_ = side_input_scale_flt; + } + + Status CheckShape(const Tensor& tensor, const string& tensor_name) { + const int num_dims = tensor.dims(); + for (int i = 0; i < num_dims; i++) { + if (!FastBoundsCheck(tensor.dim_size(i), + std::numeric_limits<int32>::max())) { + return errors::InvalidArgument(tensor_name, " dimension ", i, + " too large"); + } + } + // If there is a 5th dimension it is the VECT_C or VECT_I dimension. + if (num_dims == 5 && tensor.dim_size(4) != 4) { + return errors::InvalidArgument("The last dimension of ", tensor_name, + " must be of size 4 for qint8."); + } + return Status::OK(); } void Compute(OpKernelContext* context) override { - // Input tensor is one of the following shapes: - // [ batch, in_rows, in_cols, in_depth ] (for NHWC data format) - // [ batch, in_depth, in_rows, in_cols ] (for NCHW data format) - const Tensor& input = context->input(0); + // The conv_input tensor is one of the following formats: + // NHWC, NCHW, NCHW_VECT_C. + const Tensor& conv_input = context->input(0); + OP_REQUIRES_OK(context, CheckShape(conv_input, "conv_input")); - // Input filter is of the following dimensions: - // [ filter_rows, filter_cols, in_depth, out_depth ] + // The filter tensor is one of the following formats: + // HWIO, OIHW, OIHW_VECT_I. const Tensor& filter = context->input(1); + OP_REQUIRES_OK(context, CheckShape(filter, "filter")); - // Input bias is a 1-D tensor the size of the last - // dimension of Output tensor + // Input bias is a 1-D tensor, with size matching output depth. const Tensor& bias = context->input(2); + OP_REQUIRES_OK(context, CheckShape(bias, "conv_input")); - // For 2D convolution, there should be 4 dimensions. - OP_REQUIRES(context, input.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", - input.shape().DebugString())); - OP_REQUIRES(context, filter.dims() == 4, - errors::InvalidArgument("filter must be 4-dimensional: ", - filter.shape().DebugString())); - - // Bias should be a 1-D tensor. - OP_REQUIRES(context, bias.dims() == 1, - errors::InvalidArgument("bias must be 1-dimensional: ", - bias.shape().DebugString())); - - for (int i = 0; i < 4; i++) { - OP_REQUIRES(context, - FastBoundsCheck(filter.dim_size(i), - std::numeric_limits<int32>::max()), - errors::InvalidArgument("filter dimension too large")); - OP_REQUIRES( - context, - FastBoundsCheck(input.dim_size(i), std::numeric_limits<int32>::max()), - errors::InvalidArgument("input dimension too large")); + // If side_input_scale != 0, then side_input is not ignored and + // has the same type and dimensions as the output. + const Tensor& side_input = context->input(3); + if (side_input_scale_ != 0) { + OP_REQUIRES_OK(context, CheckShape(side_input, "side_input")); } - // The last dimension for input is in_depth. It must be the same as the - // filter's in_depth. - const int64 in_depth = GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES(context, in_depth == filter.dim_size(2), - errors::InvalidArgument( - "input and filter must have the same depth: ", in_depth, - " vs ", filter.dim_size(2))); - - // The last dimension for filter is out_depth. - const int32 out_depth = static_cast<int32>(filter.dim_size(3)); - - // The second dimension for input is rows/height. - // The first dimension for filter is rows/height. - const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); - const int32 input_rows = static_cast<int32>(input_rows_raw); - const int32 filter_rows = static_cast<int32>(filter.dim_size(0)); - - // The third dimension for input is columns/width. - // The second dimension for filter is columns/width. - const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); - const int32 input_cols = static_cast<int32>(input_cols_raw); - const int32 filter_cols = static_cast<int32>(filter.dim_size(1)); - - // The first dimension for input is batch. - const int64 batch_raw = GetTensorDim(input, data_format_, 'N'); - const int32 batch = static_cast<int32>(batch_raw); - - // For now we take the stride from the second and third dimensions only (we - // do not support striding on the batch or depth dimension). - const int32 stride_rows = - static_cast<int32>(GetTensorDim(strides_, data_format_, 'H')); - const int32 stride_cols = - static_cast<int32>(GetTensorDim(strides_, data_format_, 'W')); - const int32 bias_size = static_cast<int32>(bias.dim_size(0)); - - int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; - OP_REQUIRES_OK(context, - GetWindowedOutputSize(input_rows, filter_rows, stride_rows, - padding_, &out_rows, &pad_rows)); - OP_REQUIRES_OK(context, - GetWindowedOutputSize(input_cols, filter_cols, stride_cols, - padding_, &out_cols, &pad_cols)); - // Output tensor is of the following dimensions: - // [ in_batch, out_rows, out_cols, out_depth ] - TensorShape out_shape = - ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth); + // TODO(pauldonnelly): Switch to a more efficient mechanism to access + // dimension indexes and per-dimension attributes. + const int32 filter_rows = GetFilterDim(filter, filter_format_, 'H'); + const int32 filter_cols = GetFilterDim(filter, filter_format_, 'W'); + const int32 output_depth = GetFilterDim(filter, filter_format_, 'O'); + + const int32 batch_size = GetTensorDim(conv_input, data_format_, 'N'); + const int32 conv_input_rows = GetTensorDim(conv_input, data_format_, 'H'); + const int32 conv_input_cols = GetTensorDim(conv_input, data_format_, 'W'); + + int64 output_rows = 0, output_cols = 0, pad_rows = 0, pad_cols = 0; + OP_REQUIRES_OK(context, GetWindowedOutputSize(conv_input_rows, filter_rows, + stride_rows_, padding_type_, + &output_rows, &pad_rows)); + OP_REQUIRES_OK(context, GetWindowedOutputSize(conv_input_cols, filter_cols, + stride_cols_, padding_type_, + &output_cols, &pad_cols)); + // Initialize the output tensor shape according to data_format_ + TensorShape output_shape = ShapeFromFormat( + data_format_, batch_size, output_rows, output_cols, output_depth); Tensor* output = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); - - // Bias size should be the same as the size of the channel dimension of - // output. - OP_REQUIRES(context, bias_size == out_depth, - errors::InvalidArgument( - "bias size should equal the channel " - "dimension size of output. bias shape: ", - bias.shape().DebugString() + - ", output shape: " + output->shape().DebugString())); + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); - VLOG(2) << "FusedConv2DBiasActivation: in_depth = " << in_depth - << ", input_cols = " << input_cols + VLOG(2) << "FusedConv2DBiasActivation: conv_input_cols = " + << conv_input_cols << ", conv_input_rows = " << conv_input_rows << ", filter_cols = " << filter_cols - << ", input_rows = " << input_rows << ", filter_rows = " << filter_rows - << ", stride_rows = " << stride_rows - << ", stride_cols = " << stride_cols - << ", bias_size = " << bias_size << ", out_depth = " << out_depth; + << ", stride_cols = " << stride_cols_ + << ", stride_rows = " << stride_rows_ + << ", output_depth = " << output_depth + << ", output_cols = " << output_cols + << ", output_rows = " << output_rows + << ", output_shape.num_elements = " << output_shape.num_elements(); // If there is nothing to compute, return. - if (out_shape.num_elements() == 0) { + if (output_shape.num_elements() == 0) { return; } - launcher_.launch(context, cudnn_use_autotune_, input, filter, stride_rows, - stride_cols, bias, activation_mode_, - BrainPadding2EigenPadding(padding_), data_format_, output); + + launcher_.launch(context, cudnn_use_autotune_, conv_input, + conv_input_scale_, filter, stride_rows_, stride_cols_, + eigen_padding_type_, side_input, side_input_scale_, bias, + activation_mode_, data_format_, filter_format_, output); } private: - std::vector<int32> strides_; - Padding padding_; + int32 stride_rows_, stride_cols_; + Padding padding_type_; + Eigen::PaddingType eigen_padding_type_; ActivationMode activation_mode_; TensorFormat data_format_; - LaunchFusedConv2DBiasActivationOp<Device, T> launcher_; + FilterTensorFormat filter_format_; + ScaleType conv_input_scale_; + ScaleType side_input_scale_; + LaunchFusedConv2DBiasActivationOp<Device, T, BiasType, ScaleType> launcher_; bool cudnn_use_autotune_; TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DBiasActivationOp); @@ -211,67 +226,71 @@ class FusedConv2DBiasActivationOp : public OpKernel { #if GOOGLE_CUDA namespace dnn = ::perftools::gputools::dnn; -dnn::ActivationMode BrainActivationMode2CudnnActivationMode( - ActivationMode activation_mode) { - switch (activation_mode) { - case ActivationMode::SIGMOID: - return dnn::ActivationMode::kSigmoid; - case ActivationMode::RELU: - return dnn::ActivationMode::kRelu; - case ActivationMode::RELUX: - return dnn::ActivationMode::kReluX; - case ActivationMode::RELU6: - return dnn::ActivationMode::kRelu6; - case ActivationMode::TANH: - return dnn::ActivationMode::kTanh; - case ActivationMode::BANDPASS: - return dnn::ActivationMode::kBandPass; - } - // Prevent compiler warning about missing return - return dnn::ActivationMode::kRelu; -} - // A dummy type to group forward convolution autotune results together. struct ConvBiasActivationAutoTuneGroup { static string name() { return "ConvBiasActivation"; } }; -typedef AutoTuneSingleton<ConvBiasActivationAutoTuneGroup, ConvParameters, - perftools::gputools::dnn::AlgorithmConfig> +typedef AutoTuneSingleton<ConvBiasActivationAutoTuneGroup, FusedConvParameters, + dnn::AlgorithmConfig> AutoTuneConvBiasActivation; -template <typename T> -void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch( - OpKernelContext* ctx, bool cudnn_use_autotune, const Tensor& input_param, - const Tensor& filter, int32 row_stride, int32 col_stride, - const Tensor& bias, const ActivationMode& activation_mode, - const Eigen::PaddingType& padding, TensorFormat data_format, - Tensor* output) { - using perftools::gputools::dnn::AlgorithmConfig; - using perftools::gputools::dnn::AlgorithmType; - using perftools::gputools::dnn::ProfileResult; - using perftools::gputools::dnn::kDefaultAlgorithm; +// Allocates 'transformed_tensor' and transforms 'nhwc_tensor' into it +// using the specified 'batch_size', 'rows', 'cols', and 'depth' dimensions. +template <typename T, size_t NDIMS> +Status TransformNHWCToNCHW(OpKernelContext* ctx, const Tensor& nhwc_tensor, + int batch_size, int rows, int cols, int depth, + Tensor* transformed_tensor, const Tensor** result) { + TensorShape nchw_shape = + ShapeFromFormat(FORMAT_NCHW, batch_size, rows, cols, depth); + if (depth > 1) { + TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, nchw_shape, + transformed_tensor)); + functor::NHWCToNCHW<GPUDevice, T, NDIMS>()( + ctx->eigen_device<GPUDevice>(), nhwc_tensor.tensor<T, NDIMS>(), + transformed_tensor->tensor<T, NDIMS>()); + } else { + // If depth <= 1, then just reshape. + CHECK(transformed_tensor->CopyFrom(nhwc_tensor, nchw_shape)); + } + *result = transformed_tensor; + return Status::OK(); +} + +template <typename T, typename BiasType, typename ScaleType> +void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>:: + launch(OpKernelContext* ctx, bool cudnn_use_autotune, + const Tensor& conv_input_param, ScaleType conv_input_scale, + const Tensor& filter_param, int32 row_stride, int32 col_stride, + const Eigen::PaddingType& padding, const Tensor& side_input_param, + ScaleType side_input_scale, const Tensor& bias, + ActivationMode activation_mode, TensorFormat data_format, + FilterTensorFormat filter_format, Tensor* output_param) { auto* stream = ctx->op_device_context()->stream(); OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available.")); - Tensor input = input_param; - - perftools::gputools::dnn::ActivationMode cudnn_activation_mode = - BrainActivationMode2CudnnActivationMode(activation_mode); - // TODO(yangzihao): refactor all the complicated/duplicated code in regular // conv ops to a shared conv utility. - int32 padding_rows = 0; - int32 padding_cols = 0; - const int64 in_batch = GetTensorDim(input, data_format, 'N'); - int64 in_rows = GetTensorDim(input, data_format, 'H'); - int64 in_cols = GetTensorDim(input, data_format, 'W'); - const int64 in_depths = GetTensorDim(input, data_format, 'C'); - const int64 out_batch = GetTensorDim(*output, data_format, 'N'); - const int64 out_rows = GetTensorDim(*output, data_format, 'H'); - const int64 out_cols = GetTensorDim(*output, data_format, 'W'); - const int64 out_depths = GetTensorDim(*output, data_format, 'C'); - const int64 patch_rows = filter.dim_size(0); - const int64 patch_cols = filter.dim_size(1); + + // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I here. + constexpr int rank = std::is_same<T, qint8>::value ? 5 : 4; + constexpr int vect = std::is_same<T, qint8>::value ? 4 : 1; + + const int batch_size = GetTensorDim(conv_input_param, data_format, 'N'); + int conv_input_rows = GetTensorDim(conv_input_param, data_format, 'H'); + int conv_input_cols = GetTensorDim(conv_input_param, data_format, 'W'); + + const int conv_input_depth = + GetTensorDim(conv_input_param, data_format, 'C') * vect; + const int output_rows = GetTensorDim(*output_param, data_format, 'H'); + const int output_cols = GetTensorDim(*output_param, data_format, 'W'); + const int output_depth = GetFilterDim(filter_param, filter_format, 'O'); + const int filter_rows = GetFilterDim(filter_param, filter_format, 'H'); + const int filter_cols = GetFilterDim(filter_param, filter_format, 'W'); + + int padding_rows = 0; + int padding_cols = 0; + const Tensor* conv_input = &conv_input_param; + Tensor maybe_padded_conv_input; if (padding == Eigen::PADDING_SAME) { // Total padding on rows and cols is // Pr = (R' - 1) * S + Kr - R @@ -281,114 +300,146 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch( // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top // and Pc - Pc/2 on the bottom. When Pr or Pc is odd, this means // we pad more on the right and bottom than on the top and left. - padding_rows = - std::max<int32>(0, (out_rows - 1) * row_stride + patch_rows - in_rows); - padding_cols = - std::max<int32>(0, (out_cols - 1) * col_stride + patch_cols - in_cols); - const int rows_parity = padding_rows & 1; - const int cols_parity = padding_cols & 1; - if ((rows_parity | cols_parity) != 0) { + padding_rows = std::max<int>( + 0, (output_rows - 1) * row_stride + filter_rows - conv_input_rows); + padding_cols = std::max<int>( + 0, (output_cols - 1) * col_stride + filter_cols - conv_input_cols); + const int padding_rows_parity = padding_rows & 1; + const int padding_cols_parity = padding_cols & 1; + if ((padding_rows_parity | padding_cols_parity) != 0) { Tensor transformed_input; - int64 new_in_rows = in_rows + rows_parity; - int64 new_in_cols = in_cols + cols_parity; + const int new_conv_input_rows = conv_input_rows + padding_rows_parity; + const int new_conv_input_cols = conv_input_cols + padding_cols_parity; + OP_REQUIRES_OK( - ctx, - ctx->allocate_temp(DataTypeToEnum<T>::value, - ShapeFromFormat(data_format, in_batch, new_in_rows, - new_in_cols, in_depths), - &transformed_input)); - - functor::PadInput<GPUDevice, T, int, 4>()( - ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()), - {{0, 0}}, {{rows_parity, cols_parity}}, - To32Bit(transformed_input.tensor<T, 4>()), data_format); - - input = transformed_input; - in_rows = new_in_rows; - in_cols = new_in_cols; + ctx, ctx->allocate_temp( + DataTypeToEnum<T>::value, + ShapeFromFormat(data_format, batch_size, new_conv_input_rows, + new_conv_input_cols, conv_input_depth), + &maybe_padded_conv_input)); + + functor::PadInput<GPUDevice, T, int, rank>()( + ctx->eigen_device<GPUDevice>(), + To32Bit(conv_input_param.tensor<T, rank>()), {{0, 0}}, + {{padding_rows_parity, padding_cols_parity}}, + To32Bit(maybe_padded_conv_input.tensor<T, rank>()), data_format); + + conv_input = &maybe_padded_conv_input; + conv_input_rows = new_conv_input_rows; + conv_input_cols = new_conv_input_cols; } } - if (data_format == FORMAT_NHWC) { - // Convert the input tensor from NHWC to NCHW. - TensorShape nchw_shape = - ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths); - if (in_depths > 1) { - Tensor transformed_input; - OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, - nchw_shape, &transformed_input)); - functor::NHWCToNCHW<GPUDevice, T, 4>()( - ctx->eigen_device<GPUDevice>(), - const_cast<const Tensor&>(input).tensor<T, 4>(), - transformed_input.tensor<T, 4>()); - input = transformed_input; - } else { - // If depth <= 1, then just reshape. - CHECK(input.CopyFrom(input, nchw_shape)); + Tensor maybe_transformed_conv_input, maybe_transformed_side_input; + Tensor maybe_transformed_output; + const Tensor* side_input = &side_input_param; + Tensor* output = output_param; + + // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I here. + if (!std::is_same<T, qint8>::value && (data_format == FORMAT_NHWC)) { + OP_REQUIRES_OK(ctx, (TransformNHWCToNCHW<T, rank>( + ctx, *conv_input, batch_size, conv_input_rows, + conv_input_cols, conv_input_depth, + &maybe_transformed_conv_input, &conv_input))); + if (side_input_scale != 0) { + OP_REQUIRES_OK( + ctx, (TransformNHWCToNCHW<T, rank>( + ctx, side_input_param, batch_size, output_rows, output_cols, + output_depth, &maybe_transformed_side_input, &side_input))); + } + if (output_depth > 1) { + // Allocate a tensor for the NCHW output of the kernel and point output + // to it. Afterwards, we will transform it to NHWC while copying back to + // 'output_param'. + TensorShape nchw_shape = ShapeFromFormat( + FORMAT_NCHW, batch_size, output_rows, output_cols, output_depth); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<T>::value, nchw_shape, + &maybe_transformed_output)); + output = &maybe_transformed_output; } } - CHECK(padding_rows >= 0 && padding_cols >= 0) - << "Negative row or col paddings: (" << padding_rows << ", " - << padding_cols << ")"; - perftools::gputools::dnn::BatchDescriptor input_desc; - input_desc.set_count(in_batch) - .set_feature_map_count(in_depths) - .set_height(in_rows) - .set_width(in_cols) - .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); - perftools::gputools::dnn::BatchDescriptor output_desc; - output_desc.set_count(out_batch) - .set_height(out_rows) - .set_width(out_cols) - .set_feature_map_count(out_depths) - .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); - perftools::gputools::dnn::FilterDescriptor filter_desc; - filter_desc.set_input_filter_height(filter.dim_size(0)) - .set_input_filter_width(filter.dim_size(1)) - .set_input_feature_map_count(filter.dim_size(2)) - .set_output_feature_map_count(filter.dim_size(3)); - perftools::gputools::dnn::ConvolutionDescriptor conv_desc; + // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I here. + constexpr auto data_layout = std::is_same<T, qint8>::value + ? dnn::DataLayout::kBatchDepthYX4 + : dnn::DataLayout::kBatchDepthYX; + constexpr auto filter_layout = std::is_same<T, qint8>::value + ? dnn::FilterLayout::kOutputInputYX4 + : dnn::FilterLayout::kOutputInputYX; + + dnn::BatchDescriptor conv_input_desc; + conv_input_desc.set_count(batch_size) + .set_feature_map_count(conv_input_depth) + .set_height(conv_input_rows) + .set_width(conv_input_cols) + .set_layout(data_layout); + dnn::FilterDescriptor filter_desc; + filter_desc.set_input_filter_height(filter_rows) + .set_input_filter_width(filter_cols) + .set_input_feature_map_count(conv_input_depth) + .set_output_feature_map_count(output_depth) + .set_layout(filter_layout); + dnn::BatchDescriptor side_input_desc; + side_input_desc.set_count(batch_size) + .set_height(output_rows) + .set_width(output_cols) + .set_feature_map_count(output_depth) + .set_layout(data_layout); + dnn::BatchDescriptor bias_desc; + bias_desc.set_count(1) + .set_height(1) + .set_width(1) + .set_feature_map_count(output_depth) + .set_layout(dnn::DataLayout::kBatchDepthYX); + dnn::BatchDescriptor output_desc; + output_desc.set_count(batch_size) + .set_height(output_rows) + .set_width(output_cols) + .set_feature_map_count(output_depth) + .set_layout(data_layout); + dnn::ConvolutionDescriptor conv_desc; conv_desc.set_vertical_filter_stride(row_stride) .set_horizontal_filter_stride(col_stride) .set_zero_padding_height(padding_rows / 2) .set_zero_padding_width(padding_cols / 2); - // Shuffles a filter tensor from: - // [<spatial_dims>, in, out] - // to: - // [out, in, <spatial_dims>] - // TODO(yangzihao): Support a data layout tag for the filter weights, and only - // do the transform if the weights are not already in the correct layout. - Tensor transformed_filter; - OP_REQUIRES_OK(ctx, ctx->allocate_temp( - DataTypeToEnum<T>::value, - TensorShape({filter.dim_size(3), filter.dim_size(2), - filter.dim_size(0), filter.dim_size(1)}), - &transformed_filter)); - - functor::TransformFilter<GPUDevice, T, int, 4>()( - ctx->eigen_device<GPUDevice>(), To32Bit(filter.tensor<T, 4>()), - To32Bit(transformed_filter.tensor<T, 4>())); - - Tensor transformed_output; - OP_REQUIRES_OK( - ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, - ShapeFromFormat(FORMAT_NCHW, out_batch, out_rows, - out_cols, out_depths), - &transformed_output)); - - auto input_ptr = AsDeviceMemory(input.template flat<T>().data(), - input.template flat<T>().size()); + Tensor maybe_transformed_filter; + const Tensor* filter; + if (std::is_same<T, qint8>::value) { + // We have already checked filter is OIHW_VECT_I in the constructor. + filter = &filter_param; + } else if (filter_format == FORMAT_HWIO) { + // Shuffle filter tensor from HWIO to OIHW: + OP_REQUIRES_OK(ctx, ctx->allocate_temp( + DataTypeToEnum<T>::value, + ShapeFromFilterFormat( + FORMAT_OIHW, filter_param.shape(), FORMAT_HWIO), + &maybe_transformed_filter)); + functor::TransformFilter<GPUDevice, T, int, 4>()( + ctx->eigen_device<GPUDevice>(), To32Bit(filter_param.tensor<T, 4>()), + To32Bit(maybe_transformed_filter.tensor<T, 4>())); + filter = &maybe_transformed_filter; + } + + auto conv_input_ptr = + AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>( + conv_input->template flat<T>().data()), + conv_input->template flat<T>().size()); auto filter_ptr = - AsDeviceMemory(transformed_filter.template flat<T>().data(), - transformed_filter.template flat<T>().size()); + AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>( + filter->template flat<T>().data()), + filter->template flat<T>().size()); + auto side_input_ptr = + AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>( + side_input->template flat<T>().data()), + side_input->template flat<T>().size()); auto output_ptr = - AsDeviceMemory(transformed_output.template flat<T>().data(), - transformed_output.template flat<T>().size()); - - auto bias_ptr = AsDeviceMemory(bias.template flat<T>().data(), - bias.template flat<T>().size()); + AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>( + output->template flat<T>().data()), + output->template flat<T>().size()); + auto bias_ptr = AsDeviceMemory(bias.template flat<BiasType>().data(), + bias.template flat<BiasType>().size()); static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit( // default value is in bytes despite the name of the environment variable @@ -396,38 +447,42 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch( ); int device_id = stream->parent()->device_ordinal(); - DataType dtype = input.dtype(); - ConvParameters conv_parameters = { - in_batch, - in_depths, - {{in_rows, in_cols}}, - out_depths, - {{patch_rows, patch_cols}}, + FusedConvParameters fused_conv_parameters = { + batch_size, + conv_input_depth, + {{conv_input_rows, conv_input_cols}}, + output_depth, + {{filter_rows, filter_cols}}, {{row_stride, col_stride}}, {{padding_rows, padding_cols}}, - dtype, + conv_input->dtype(), device_id, + (side_input_scale != 0), + activation_mode, }; - AlgorithmConfig algorithm_config; + dnn::AlgorithmConfig algorithm_config; if (cudnn_use_autotune && !AutoTuneConvBiasActivation::GetInstance()->Find( - conv_parameters, &algorithm_config)) { - std::vector<AlgorithmType> algorithms; + fused_conv_parameters, &algorithm_config)) { + std::vector<dnn::AlgorithmType> algorithms; CHECK(stream->parent()->GetConvolveAlgorithms( - conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms)); - ProfileResult best_result; - ProfileResult best_result_no_scratch; + fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), + &algorithms)); + dnn::ProfileResult best_result; + dnn::ProfileResult best_result_no_scratch; for (auto profile_algorithm : algorithms) { // TODO(zhengxq): profile each algorithm multiple times to better // accuracy. CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx); - ProfileResult profile_result; + dnn::ProfileResult profile_result; bool cudnn_launch_status = stream - ->ThenConvolveWithAlgorithm( - input_desc, input_ptr, filter_desc, filter_ptr, conv_desc, - bias_ptr, cudnn_activation_mode, output_desc, &output_ptr, - &scratch_allocator, AlgorithmConfig(profile_algorithm), + ->ThenFusedConvolveWithAlgorithm( + conv_input_desc, conv_input_ptr, conv_input_scale, + filter_desc, filter_ptr, conv_desc, side_input_ptr, + side_input_scale, bias_desc, bias_ptr, + dnn::ActivationMode::kRelu, output_desc, &output_ptr, + &scratch_allocator, dnn::AlgorithmConfig(profile_algorithm), &profile_result) .ok(); if (cudnn_launch_status) { @@ -454,42 +509,53 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch( algorithm_config.set_algorithm_no_scratch( best_result_no_scratch.algorithm()); } - AutoTuneConvBiasActivation::GetInstance()->Insert(conv_parameters, + AutoTuneConvBiasActivation::GetInstance()->Insert(fused_conv_parameters, algorithm_config); } CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx); bool cudnn_launch_status = stream - ->ThenConvolveWithAlgorithm( - input_desc, input_ptr, filter_desc, filter_ptr, conv_desc, - bias_ptr, cudnn_activation_mode, output_desc, &output_ptr, - &scratch_allocator, algorithm_config, + ->ThenFusedConvolveWithAlgorithm( + conv_input_desc, conv_input_ptr, conv_input_scale, filter_desc, + filter_ptr, conv_desc, side_input_ptr, side_input_scale, + bias_desc, bias_ptr, dnn::ActivationMode::kRelu, output_desc, + &output_ptr, &scratch_allocator, algorithm_config, /*output_profile_result=*/nullptr) .ok(); if (!cudnn_launch_status) { - ctx->SetStatus(errors::Internal( - "cuDNN launch failure : input shape(", input.shape().DebugString(), - ") filter shape(", filter.shape().DebugString(), ")")); + ctx->SetStatus(errors::Internal("cuDNN launch failure : conv_input shape(", + conv_input->shape().DebugString(), + ") filter shape(", + filter->shape().DebugString(), ")")); } - // Convert the output tensor back from NCHW to NHWC. - if (data_format == FORMAT_NHWC) { + // Convert the output tensor back from NCHW to NHWC if necessary. + if (!std::is_same<T, qint8>::value && (data_format == FORMAT_NHWC) && + (output_depth > 1)) { functor::NCHWToNHWC<GPUDevice, T, 4>()( ctx->eigen_device<GPUDevice>(), - const_cast<const Tensor&>(transformed_output).tensor<T, 4>(), - output->tensor<T, 4>()); - } else { - *output = transformed_output; + const_cast<const Tensor*>(output)->tensor<T, 4>(), + output_param->tensor<T, 4>()); } } // Registration of the GPU implementations. -REGISTER_KERNEL_BUILDER(Name("FusedConv2DBiasActivation") - .Device(DEVICE_GPU) - .TypeConstraint<float>("T"), - FusedConv2DBiasActivationOp<GPUDevice, float>); + +REGISTER_KERNEL_BUILDER( + Name("FusedConv2DBiasActivation") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T") + .TypeConstraint<float>("Tbias"), + FusedConv2DBiasActivationOp<GPUDevice, float, float, float>); + +REGISTER_KERNEL_BUILDER( + Name("FusedConv2DBiasActivation") + .Device(DEVICE_GPU) + .TypeConstraint<qint8>("T") + .TypeConstraint<float>("Tbias"), + FusedConv2DBiasActivationOp<GPUDevice, qint8, float, float>); #endif // GOOGLE_CUDA diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h index d71b26cf1d..7534f5797c 100644 --- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h +++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h @@ -24,7 +24,7 @@ limitations under the License. #if GOOGLE_CUDA #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h" #include "tensorflow/core/platform/stream_executor.h" #endif // GOOGLE_CUDA @@ -33,27 +33,30 @@ namespace tensorflow { // Forward declaration. class OpKernelContext; -template <typename Device, typename T> +template <typename Device, typename T, typename BiasType, typename ScaleType> class LaunchFusedConv2DBiasActivationOp { public: void launch(OpKernelContext* ctx, bool cudnn_use_autotune, - const Tensor& input, const Tensor& filter, int row_stride, - int col_stride, const Tensor& bias, - const ActivationMode& activation_mode, - const Eigen::PaddingType& padding, TensorFormat data_format, - Tensor* output); + const Tensor& conv_input, ScaleType conv_input_scale, + const Tensor& filter, int32 row_stride, int32 col_stride, + const Eigen::PaddingType& padding, const Tensor& side_input, + ScaleType side_input_scale, const Tensor& bias, + ActivationMode activation_mode, TensorFormat data_format, + FilterTensorFormat filter_format, Tensor* output); }; #ifdef GOOGLE_CUDA -template <typename T> -class LaunchFusedConv2DBiasActivationOp<Eigen::GpuDevice, T> { +template <typename T, typename BiasType, typename ScaleType> +class LaunchFusedConv2DBiasActivationOp<Eigen::GpuDevice, T, BiasType, + ScaleType> { public: void launch(OpKernelContext* ctx, bool cudnn_use_autotune, - const Tensor& input, const Tensor& filter, int32 row_stride, - int32 col_stride, const Tensor& bias, - const ActivationMode& activation_mode, - const Eigen::PaddingType& padding, TensorFormat data_format, - Tensor* output); + const Tensor& conv_input, ScaleType conv_input_scale, + const Tensor& filter, int32 row_stride, int32 col_stride, + const Eigen::PaddingType& padding, const Tensor& side_input, + ScaleType side_input_scale, const Tensor& bias, + ActivationMode activation_mode, TensorFormat data_format, + FilterTensorFormat filter_format, Tensor* output); }; #endif // GOOGLE_CUDA diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h new file mode 100644 index 0000000000..dc43af1158 --- /dev/null +++ b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h @@ -0,0 +1,74 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_ +#define THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_ + +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/util/activation_mode.h" + +// TODO(pauldonnelly): Merge this file into core/kernels/conv_ops_gpu.h. + +namespace tensorflow { + +// Add additional parameters specific to fused convolutions. +class FusedConvParameters : public ConvParameters { + public: + FusedConvParameters(int64 batch, int64 in_depths, const SpatialArray& in, + int64 out_depths, const SpatialArray& filter, + const SpatialArray& stride, const SpatialArray& padding, + DataType dtype, int device_id, bool has_side_input, + ActivationMode activation_mode) + : ConvParameters(batch, in_depths, in, out_depths, filter, stride, + padding, dtype, device_id), + activation_mode_(activation_mode), + has_side_input_(has_side_input) { + hash_code_ = Hash64Combine(hash_code_, has_side_input); + hash_code_ = Hash64Combine(hash_code_, activation_mode); + } + + bool operator==(const FusedConvParameters& other) const { + return this->get_data_as_tuple() == other.get_data_as_tuple(); + } + + bool operator!=(const FusedConvParameters& other) const { + return !(*this == other); + } + + string ToString() const { + return strings::StrCat(ConvParameters::ToString(), ", ", has_side_input_, + ", ", activation_mode_, ", "); + } + + private: + using ParameterDataType = + std::tuple<ConvParameters::ParameterDataType, bool, ActivationMode>; + + ParameterDataType get_data_as_tuple() const { + return std::make_tuple(ConvParameters::get_data_as_tuple(), has_side_input_, + activation_mode_); + } + + ActivationMode activation_mode_; + bool has_side_input_; +}; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA + +#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_ |