diff options
author | Yao Zhang <yaozhang@google.com> | 2016-09-15 20:44:49 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2016-09-15 21:47:12 -0700 |
commit | 04df8d868fab5df0002fa0ec2765dc2e0aeb68d6 (patch) | |
tree | ff4155ad9ced636cc1cca1451aa702805401a516 | |
parent | 4e96e274443805df8afad5cb48f654fbf1776a4a (diff) |
Add the interface in stream executor to call cuDNN batch normalization functions.
Change: 133345765
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_dnn.cc | 158 | ||||
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_dnn.h | 46 | ||||
-rw-r--r-- | tensorflow/stream_executor/dnn.h | 77 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.cc | 56 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.h | 21 |
5 files changed, 346 insertions, 12 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 55535f9ce5..6757690832 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -171,6 +171,9 @@ size_t cudnnGetVersion() { // clang-format off #define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnBatchNormalizationBackward) \ + __macro(cudnnBatchNormalizationForwardInference) \ + __macro(cudnnBatchNormalizationForwardTraining) \ __macro(cudnnGetConvolutionNdForwardOutputDim) \ __macro(cudnnGetConvolutionForwardAlgorithm) \ __macro(cudnnCreateTensorDescriptor) \ @@ -778,6 +781,16 @@ class ScopedActivationDescriptor { #endif namespace { +cudnnDataType_t ToCudnnDataType(dnn::DataType data_type) { + switch (data_type) { + case dnn::DataType::kFloat: + case dnn::DataType::kDouble: + case dnn::DataType::kHalf: + return static_cast<cudnnDataType_t>(data_type); + default: + LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type); + } +} #if CUDNN_VERSION >= 5000 @@ -815,17 +828,6 @@ cudnnRNNMode_t ToCudnnRnnMode(dnn::RnnMode rnn_mode) { } } -cudnnDataType_t ToCudnnDataType(dnn::DataType data_type) { - switch (data_type) { - case dnn::DataType::kFloat: - case dnn::DataType::kDouble: - case dnn::DataType::kHalf: - return static_cast<cudnnDataType_t>(data_type); - default: - LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type); - } -} - int CudnnDataTypeToByteSize(cudnnDataType_t data_type) { switch (data_type) { case CUDNN_DATA_FLOAT: @@ -2022,6 +2024,140 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms( return true; } +bool CudnnSupport::DoBatchNormalizationForward( + Stream* stream, const DeviceMemory<float>& x, + const DeviceMemory<float>& scale, const DeviceMemory<float>& offset, + const DeviceMemory<float>& estimated_mean, + const DeviceMemory<float>& estimated_variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const 
double epsilon, + DeviceMemory<float>* y, DeviceMemory<float>* batch_mean, + DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean, + DeviceMemory<float>* saved_inv_var, bool is_training, + std::function<const DeviceMemory<float>&()> var_to_inv_var, + std::function<void()> inv_var_to_var) { + return DoBatchNormalizationForwardImpl<float>( + stream, dnn::DataType::kFloat, x, scale, offset, estimated_mean, + estimated_variance, x_desc, scale_offset_desc, epsilon, y, batch_mean, + batch_var, saved_mean, saved_inv_var, is_training, + std::move(var_to_inv_var), std::move(inv_var_to_var)); +} + +template <class T> +bool CudnnSupport::DoBatchNormalizationForwardImpl( + Stream* stream, dnn::DataType data_type, const DeviceMemory<T>& x, + const DeviceMemory<T>& scale, const DeviceMemory<T>& offset, + const DeviceMemory<T>& estimated_mean, + const DeviceMemory<T>& estimated_variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<T>* y, DeviceMemory<T>* batch_mean, DeviceMemory<T>* batch_var, + DeviceMemory<T>* saved_mean, DeviceMemory<T>* saved_inv_var, + bool is_training, std::function<const DeviceMemory<T>&()> var_to_inv_var, + std::function<void()> inv_var_to_var) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + + ScopedTensorDescriptor x_descriptor{parent_, x_desc, + ToCudnnDataType(data_type)}; + ScopedTensorDescriptor scale_offset_descriptor{parent_, scale_offset_desc, + ToCudnnDataType(data_type)}; + cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; + float one = 1.0; + float zero = 0.0; + + if (is_training) { + stream->ThenMemZero(batch_mean, batch_mean->size()); + stream->ThenMemZero(batch_var, batch_var->size()); + status = 
dynload::cudnnBatchNormalizationForwardTraining( + parent_, ToHandle(dnn_handle_), mode, &one, &zero, + x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(), + scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(), 1.0, + batch_mean->opaque(), batch_var->opaque(), epsilon, + saved_mean->opaque(), saved_inv_var->opaque()); +#if CUDNN_VERSION < 5000 + CHECK(inv_var_to_var); + inv_var_to_var(); +#endif + } else { +#if CUDNN_VERSION < 5000 + CHECK(var_to_inv_var); + const void* maybe_inv_var = var_to_inv_var().opaque(); +#else + const void* maybe_inv_var = estimated_variance.opaque(); +#endif + status = dynload::cudnnBatchNormalizationForwardInference( + parent_, ToHandle(dnn_handle_), mode, &one, &zero, + x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(), + scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(), + estimated_mean.opaque(), maybe_inv_var, epsilon); + } + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to enqueue forward batch normalization on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoBatchNormalizationBackward( + Stream* stream, const DeviceMemory<float>& y_backprop, + const DeviceMemory<float>& x, const DeviceMemory<float>& scale, + const DeviceMemory<float>& mean, const DeviceMemory<float>& variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop, + DeviceMemory<float>* offset_backprop) { + return DoBatchNormalizationBackwardImpl( + stream, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean, variance, x_desc, + scale_offset_desc, epsilon, x_backprop, scale_backprop, offset_backprop); +} + +template <class T> +bool CudnnSupport::DoBatchNormalizationBackwardImpl( + Stream* stream, int cudnn_type, const DeviceMemory<T>& y_backprop, + const DeviceMemory<T>& x, const DeviceMemory<T>& scale, + const 
DeviceMemory<T>& mean, const DeviceMemory<T>& variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<T>* x_backprop, DeviceMemory<T>* scale_backprop, + DeviceMemory<T>* offset_backprop) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + + ScopedTensorDescriptor x_descriptor{parent_, x_desc, + static_cast<cudnnDataType_t>(cudnn_type)}; + ScopedTensorDescriptor scale_offset_descriptor{ + parent_, scale_offset_desc, static_cast<cudnnDataType_t>(cudnn_type)}; + cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; + float one = 1.0; + float zero = 0.0; + + status = dynload::cudnnBatchNormalizationBackward( + parent_, ToHandle(dnn_handle_), mode, &one, &zero, &one, &zero, + x_descriptor.handle(), x.opaque(), x_descriptor.handle(), + y_backprop.opaque(), x_descriptor.handle(), x_backprop->opaque(), + scale_offset_descriptor.handle(), scale.opaque(), + scale_backprop->opaque(), offset_backprop->opaque(), epsilon, + mean.opaque(), variance.opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to enqueue backward batch normalization on stream: " + << ToString(status); + return false; + } + return true; +} + bool CudnnSupport::DoConvolve( Stream* stream, const BatchDescriptor& batch_descriptor, const DeviceMemory<float>& input_data, diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h index f8bc0c493f..8101ebf258 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.h +++ b/tensorflow/stream_executor/cuda/cuda_dnn.h @@ -112,6 +112,28 @@ class CudnnSupport : public dnn::DnnSupport { bool GetConvolveBackwardFilterAlgorithms( std::vector<dnn::AlgorithmType>* out_algorithms) override; + bool 
DoBatchNormalizationForward( + Stream* stream, const DeviceMemory<float>& x, + const DeviceMemory<float>& scale, const DeviceMemory<float>& offset, + const DeviceMemory<float>& estimated_mean, + const DeviceMemory<float>& estimated_variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<float>* y, DeviceMemory<float>* batch_mean, + DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean, + DeviceMemory<float>* saved_inv_var, bool is_training, + std::function<const DeviceMemory<float>&()> var_to_inv_var, + std::function<void()> inv_var_to_var) override; + + bool DoBatchNormalizationBackward( + Stream* stream, const DeviceMemory<float>& y_backprop, + const DeviceMemory<float>& x, const DeviceMemory<float>& scale, + const DeviceMemory<float>& mean, const DeviceMemory<float>& variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop, + DeviceMemory<float>* offset_backprop) override; + bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory<float>& input_data, const dnn::FilterDescriptor& filter_descriptor, @@ -379,6 +401,30 @@ class CudnnSupport : public dnn::DnnSupport { EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_); template <class T> + bool DoBatchNormalizationForwardImpl( + Stream* stream, dnn::DataType data_type, const DeviceMemory<T>& x, + const DeviceMemory<T>& scale, const DeviceMemory<T>& offset, + const DeviceMemory<T>& estimated_mean, + const DeviceMemory<T>& estimated_variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<T>* y, DeviceMemory<T>* batch_mean, + DeviceMemory<T>* batch_var, DeviceMemory<T>* saved_mean, + DeviceMemory<T>* saved_inv_var, bool is_training, + std::function<const DeviceMemory<T>&()> var_to_inv_var, + 
std::function<void()> inv_var_to_var); + + template <class T> + bool DoBatchNormalizationBackwardImpl( + Stream* stream, int cudnn_type, const DeviceMemory<T>& y_backprop, + const DeviceMemory<T>& x, const DeviceMemory<T>& scale, + const DeviceMemory<T>& mean, const DeviceMemory<T>& variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<T>* x_backprop, DeviceMemory<T>* scale_backprop, + DeviceMemory<T>* offset_backprop); + + template <class T> bool DoConvolveImpl(Stream* stream, int cudnn_type, // Actually cudnnDataType_t. const dnn::BatchDescriptor& batch_descriptor, diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index 1c31178526..d83d3042d5 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -22,6 +22,7 @@ limitations under the License. #ifndef TENSORFLOW_STREAM_EXECUTOR_DNN_H_ #define TENSORFLOW_STREAM_EXECUTOR_DNN_H_ +#include <functional> #include <limits> #include <memory> @@ -775,7 +776,8 @@ enum class ElementwiseOperation { kAdd, kMultiply }; string ElementwiseOperationString(ElementwiseOperation op); // Suite of operations typically used for implementing Deep/Convolutional Neural -// Nets. +// Nets. Note: A false return value of an operation indicates the +// implementation is not available. class DnnSupport { public: DnnSupport() {} @@ -783,6 +785,79 @@ class DnnSupport { virtual port::Status Init() = 0; + // Performs a single-precision forward batch normalization operation onto + // the stream. + // + // Arguments: + // stream: borrowed pointer to the stream that the batch normalization + // operation should be enqueued onto. + // x: input data. + // scale: scaling parameters. + // offset: offset parameters. + // estimated_mean: population mean estimated during training. + // Used for inference only; empty for training. 
+ // estimated_variance: population variance estimated during training, + used for inference only; empty for training. + // x_desc: dimensions of the input data, which is the same as the dimensions + // of the output. + // scale_offset_desc: dimensions of scale and offset. + // epsilon: a small floating point number added to the variance of x. + // y: output data. + // batch_mean: batch mean, to be used to compute the running mean. + // batch_variance: batch variance, to be used to compute + // the running variance. + // reserve_space_1: saved mean, to be reused in the backward gradient + // computation. + // reserve_space_2: saved variance, to be reused in the backward gradient + // computation. + // is_training: Set to true for training, false for inference. + // var_to_inv_var: a function to convert the variance to inverted variance + // for cuDNN v4 forward inference. + // inv_var_to_var: a function to convert the inverted variance to + // variance for cuDNN v4 forward training, to be used for TensorFlow + // to calculate the running variance. + virtual bool DoBatchNormalizationForward( + Stream* stream, const DeviceMemory<float>& x, + const DeviceMemory<float>& scale, const DeviceMemory<float>& offset, + const DeviceMemory<float>& estimated_mean, + const DeviceMemory<float>& estimated_variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<float>* y, DeviceMemory<float>* batch_mean, + DeviceMemory<float>* batch_var, DeviceMemory<float>* reserve_space_1, + DeviceMemory<float>* reserve_space_2, bool is_training, + std::function<const DeviceMemory<float>&()> var_to_inv_var, + std::function<void()> inv_var_to_var) { + return false; + } + + // Performs a single-precision backward batch normalization gradient + // computation operation onto the stream. 
+ // + // Arguments: + // stream: borrowed pointer to the stream that the batch normalization + // gradient computation operation should be enqueued onto. + // y_backprop: gradient with regard to output y. + // x: input data. + // scale: scaling parameters. + // x_desc: dimensions of the input data, which is the same as the dimensions + // of the output. + // scale_offset_desc: dimensions of scale and offset. + // epsilon: a small floating point number added to the variance of x. + // x_backprop: gradient with respect to input x. + // scale_backprop: gradient with respect to scale. + // offset_backprop: gradient with respect to offset. + virtual bool DoBatchNormalizationBackward( + Stream* stream, const DeviceMemory<float>& y_backprop, + const DeviceMemory<float>& x, const DeviceMemory<float>& scale, + const DeviceMemory<float>& mean, const DeviceMemory<float>& variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop, + DeviceMemory<float>* offset_backprop) { + return false; + } + // Enqueues a single-precision convolution operation onto the stream. 
// // Arguments (all borrowed): diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index 8c0e45f1a6..512e882cad 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -277,6 +277,62 @@ Stream &Stream::ThenRecordEvent(Event *event) { return *this; } +Stream &Stream::ThenBatchNormalizationForward( + const DeviceMemory<float> &x, const DeviceMemory<float> &scale, + const DeviceMemory<float> &offset, + const DeviceMemory<float> &estimated_mean, + const DeviceMemory<float> &estimated_variance, + const dnn::BatchDescriptor &x_desc, + const dnn::BatchDescriptor &scale_offset_desc, const double epsilon, + DeviceMemory<float> *y, DeviceMemory<float> *batch_mean, + DeviceMemory<float> *batch_var, DeviceMemory<float> *saved_mean, + DeviceMemory<float> *saved_inv_var, bool is_training, + std::function<const DeviceMemory<float> &()> var_to_inv_var, + std::function<void()> inv_var_to_var) { + VLOG_CALL(PARAM(x), PARAM(scale), PARAM(offset), PARAM(x_desc), + PARAM(scale_offset_desc), PARAM(epsilon), PARAM(y)); + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoBatchNormalizationForward( + this, x, scale, offset, estimated_mean, estimated_variance, x_desc, + scale_offset_desc, epsilon, y, batch_mean, batch_var, saved_mean, + saved_inv_var, is_training, std::move(var_to_inv_var), + std::move(inv_var_to_var))); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenBatchNormalizationBackward( + const DeviceMemory<float> &y_backprop, const DeviceMemory<float> &x, + const DeviceMemory<float> &scale, const DeviceMemory<float> &mean, + const DeviceMemory<float> &variance, const dnn::BatchDescriptor &x_desc, + const dnn::BatchDescriptor &scale_offset_desc, const double epsilon, + DeviceMemory<float> *x_backprop, DeviceMemory<float> *scale_backprop, + 
DeviceMemory<float> *offset_backprop) { + VLOG_CALL(PARAM(y_backprop), PARAM(x), PARAM(scale), PARAM(x_desc), + PARAM(scale_offset_desc), PARAM(epsilon), PARAM(x_backprop), + PARAM(scale_backprop), PARAM(offset_backprop)); + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoBatchNormalizationBackward( + this, y_backprop, x, scale, mean, variance, x_desc, scale_offset_desc, + epsilon, x_backprop, scale_backprop, offset_backprop)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + Stream &Stream::ThenConvolveWithScratch( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory<Eigen::half> &input_data, diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index 61058528c2..0d16495a1d 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -215,6 +215,27 @@ class Stream { // // See DnnSupport::* for comments on the following methods. 
+ Stream &ThenBatchNormalizationForward( + const DeviceMemory<float> &x, const DeviceMemory<float> &scale, + const DeviceMemory<float> &offset, + const DeviceMemory<float> &estimated_mean, + const DeviceMemory<float> &estimated_variance, + const dnn::BatchDescriptor &x_desc, + const dnn::BatchDescriptor &scale_offset_desc, const double epsilon, + DeviceMemory<float> *y, DeviceMemory<float> *batch_mean, + DeviceMemory<float> *batch_var, DeviceMemory<float> *saved_mean, + DeviceMemory<float> *saved_inv_var, bool is_training, + std::function<const DeviceMemory<float> &()> var_to_inv_var, + std::function<void()> inv_var_to_var); + + Stream &ThenBatchNormalizationBackward( + const DeviceMemory<float> &y_backprop, const DeviceMemory<float> &x, + const DeviceMemory<float> &scale, const DeviceMemory<float> &mean, + const DeviceMemory<float> &variance, const dnn::BatchDescriptor &x_desc, + const dnn::BatchDescriptor &scale_offset_desc, const double epsilon, + DeviceMemory<float> *x_backprop, DeviceMemory<float> *scale_backprop, + DeviceMemory<float> *offset_backprop); + // TODO(leary) add double-precision version of this interface. Stream &ThenConvolve(const dnn::BatchDescriptor &input_descriptor, const DeviceMemory<float> &input_data, |