diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2018-05-22 17:16:44 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-05-22 17:19:42 -0700 |
commit | 25ad31da87086a88d1d14ed5db8731bb9fc90787 (patch) | |
tree | 48ae94be34a6c769ee2859a0a95a0ddb150ee128 /tensorflow/stream_executor | |
parent | 09620a1fd3f28cc23f6627884927b6098717355e (diff) |
Add convolution with NHWC layout to stream executor.
PiperOrigin-RevId: 197650067
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_dnn.cc | 70 | ||||
-rw-r--r-- | tensorflow/stream_executor/dnn.h | 2 |
2 files changed, 7 insertions, 65 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 5ece80e551..c2c0c283b3 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -457,6 +457,9 @@ class ScopedFilterDescriptor { case dnn::FilterLayout::kOutputInputYX: format = CUDNN_TENSOR_NCHW; break; + case dnn::FilterLayout::kOutputYXInput: + format = CUDNN_TENSOR_NHWC; + break; case dnn::FilterLayout::kOutputInputYX4: format = CUDNN_TENSOR_NCHW_VECT_C; break; @@ -3046,53 +3049,6 @@ bool CudnnSupport::DoFusedConvolve( output_profile_result); } -namespace { -// NOTE(keveman): Temporary data layout transformation until cuDNN supports -// kBatchYXDepth for backward pass. This function allocates temporary memory, -// lays out the source data into the temporary but in the kBatchDepthXY -// layout, and returns the temporary memory. The caller is responsible for -// deallocating the temporary. Since the allocation is done using Stream's -// AllocateTemporaryMemory, a later BlockHostUntilDone could be used for -// deallocation. -// -// transform_scratch is populated with a legitimate temporary allocation iff -// the original output data needs to be transformed. -template <class T> -DeviceMemory<T> MaybeTransformLayout( - Stream* stream, const CudnnHandle& cudnn, - dnn::BatchDescriptor* output_descriptor, - DeviceMemory<T> backward_output_data, - std::unique_ptr<TemporaryDeviceMemory<T>>* transform_scratch) { - if (output_descriptor->layout() == dnn::DataLayout::kBatchDepthYX) { - return backward_output_data; - } - CHECK(output_descriptor->layout() == dnn::DataLayout::kBatchYXDepth); - *transform_scratch = - stream->AllocateTemporaryArray<T>(backward_output_data.ElementCount()) - .ConsumeValueOrDie(); - dnn::BatchDescriptor transformed_output_descriptor; - transformed_output_descriptor.CloneFrom(*output_descriptor); - transformed_output_descriptor.set_layout(dnn::DataLayout::kBatchDepthYX); - cudnnDataType_t cudnn_type = GetCudnnDataType<T>(); - ScopedTensorDescriptor orig_out_back_nd(*output_descriptor, cudnn_type); - ScopedTensorDescriptor transformed_out_back_nd(transformed_output_descriptor, - cudnn_type); - - float alpha = 1.0f; - float beta = 0.0f; - auto status = cudnnTransformTensor( - cudnn.handle(), &alpha, orig_out_back_nd.handle(), - backward_output_data.opaque(), &beta, transformed_out_back_nd.handle(), - (*transform_scratch)->mutable_device_memory()->opaque()); - - if (status != CUDNN_STATUS_SUCCESS) { - LOG(FATAL) << "Failed to transform the data layout."; - } - output_descriptor->set_layout(dnn::DataLayout::kBatchDepthYX); - return (*transform_scratch)->device_memory(); -} -} // namespace - bool CudnnSupport::DoTransformTensor(Stream* stream, const dnn::BatchDescriptor& input_desc, dnn::DataType input_type, @@ -3124,7 +3080,7 @@ template <class T> bool CudnnSupport::DoConvolveBackwardDataImpl( Stream* stream, const dnn::FilterDescriptor& filter_descriptor, const DeviceMemory<T>& filter_data, - const dnn::BatchDescriptor& output_descriptor_in, + const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T> backward_output_data, const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& input_descriptor, @@ -3145,14 +3101,6 @@ bool CudnnSupport::DoConvolveBackwardDataImpl( auto cudnn = cudnn_->GetHandle(parent_, stream); - // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass. - dnn::BatchDescriptor output_descriptor; - output_descriptor.CloneFrom(output_descriptor_in); - std::unique_ptr<TemporaryDeviceMemory<T>> transform_scratch; - backward_output_data = - MaybeTransformLayout(stream, cudnn, &output_descriptor, - backward_output_data, &transform_scratch); - ScopedTensorDescriptor out_back_nd(output_descriptor, cudnn_type); ScopedTensorDescriptor in_back_nd(input_descriptor, cudnn_type); ScopedFilterDescriptor filter(filter_descriptor, cudnn_type); @@ -3386,7 +3334,7 @@ template <class T> bool CudnnSupport::DoConvolveBackwardFilterImpl( Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory<T>& input_data, - const dnn::BatchDescriptor& output_descriptor_in, + const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T> backward_output_data, const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::FilterDescriptor& filter_descriptor, @@ -3407,14 +3355,6 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl( auto cudnn = cudnn_->GetHandle(parent_, stream); - // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass. - dnn::BatchDescriptor output_descriptor; - output_descriptor.CloneFrom(output_descriptor_in); - std::unique_ptr<TemporaryDeviceMemory<T>> transform_scratch; - backward_output_data = - MaybeTransformLayout(stream, cudnn, &output_descriptor, - backward_output_data, &transform_scratch); - ScopedTensorDescriptor out_back_nd(output_descriptor, cudnn_type); ScopedTensorDescriptor input_nd(input_descriptor, cudnn_type); ScopedFilterDescriptor filter(filter_descriptor, cudnn_type); diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index 38abc66079..3df5365c23 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -349,6 +349,8 @@ enum class FilterLayout : int64 { kOutputInputYX = 0, // cuDNN's default filter layout, laid out as: // (major) output feature maps >> input feature maps >> // rows >> columns (minor). + kOutputYXInput, // major to minor: + // (output features, row, columns, input features) kOutputInputYX4, // laid out the same as kOutputInputYX but each element is a // vector of 4 feature maps. kInputYXOutput, // Same as dist_belief's default filter layout. |