aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Yao Zhang <yaozhang@google.com>2016-09-15 20:44:49 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2016-09-15 21:47:12 -0700
commit04df8d868fab5df0002fa0ec2765dc2e0aeb68d6 (patch)
treeff4155ad9ced636cc1cca1451aa702805401a516
parent4e96e274443805df8afad5cb48f654fbf1776a4a (diff)
Add the interface in stream executor to call cuDNN batch normalization functions.
Change: 133345765
-rw-r--r--tensorflow/stream_executor/cuda/cuda_dnn.cc158
-rw-r--r--tensorflow/stream_executor/cuda/cuda_dnn.h46
-rw-r--r--tensorflow/stream_executor/dnn.h77
-rw-r--r--tensorflow/stream_executor/stream.cc56
-rw-r--r--tensorflow/stream_executor/stream.h21
5 files changed, 346 insertions, 12 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 55535f9ce5..6757690832 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -171,6 +171,9 @@ size_t cudnnGetVersion() {
// clang-format off
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
+ __macro(cudnnBatchNormalizationBackward) \
+ __macro(cudnnBatchNormalizationForwardInference) \
+ __macro(cudnnBatchNormalizationForwardTraining) \
__macro(cudnnGetConvolutionNdForwardOutputDim) \
__macro(cudnnGetConvolutionForwardAlgorithm) \
__macro(cudnnCreateTensorDescriptor) \
@@ -778,6 +781,16 @@ class ScopedActivationDescriptor {
#endif
namespace {
+cudnnDataType_t ToCudnnDataType(dnn::DataType data_type) {
+ switch (data_type) {
+ case dnn::DataType::kFloat:
+ case dnn::DataType::kDouble:
+ case dnn::DataType::kHalf:
+ return static_cast<cudnnDataType_t>(data_type);
+ default:
+ LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
+ }
+}
#if CUDNN_VERSION >= 5000
@@ -815,17 +828,6 @@ cudnnRNNMode_t ToCudnnRnnMode(dnn::RnnMode rnn_mode) {
}
}
-cudnnDataType_t ToCudnnDataType(dnn::DataType data_type) {
- switch (data_type) {
- case dnn::DataType::kFloat:
- case dnn::DataType::kDouble:
- case dnn::DataType::kHalf:
- return static_cast<cudnnDataType_t>(data_type);
- default:
- LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
- }
-}
-
int CudnnDataTypeToByteSize(cudnnDataType_t data_type) {
switch (data_type) {
case CUDNN_DATA_FLOAT:
@@ -2022,6 +2024,140 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
return true;
}
+bool CudnnSupport::DoBatchNormalizationForward(
+ Stream* stream, const DeviceMemory<float>& x,
+ const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+ const DeviceMemory<float>& estimated_mean,
+ const DeviceMemory<float>& estimated_variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<float>* y, DeviceMemory<float>* batch_mean,
+ DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean,
+ DeviceMemory<float>* saved_inv_var, bool is_training,
+ std::function<const DeviceMemory<float>&()> var_to_inv_var,
+ std::function<void()> inv_var_to_var) {
+ return DoBatchNormalizationForwardImpl<float>(
+ stream, dnn::DataType::kFloat, x, scale, offset, estimated_mean,
+ estimated_variance, x_desc, scale_offset_desc, epsilon, y, batch_mean,
+ batch_var, saved_mean, saved_inv_var, is_training,
+ std::move(var_to_inv_var), std::move(inv_var_to_var));
+}
+
+template <class T>
+bool CudnnSupport::DoBatchNormalizationForwardImpl(
+ Stream* stream, dnn::DataType data_type, const DeviceMemory<T>& x,
+ const DeviceMemory<T>& scale, const DeviceMemory<T>& offset,
+ const DeviceMemory<T>& estimated_mean,
+ const DeviceMemory<T>& estimated_variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<T>* y, DeviceMemory<T>* batch_mean, DeviceMemory<T>* batch_var,
+ DeviceMemory<T>* saved_mean, DeviceMemory<T>* saved_inv_var,
+ bool is_training, std::function<const DeviceMemory<T>&()> var_to_inv_var,
+ std::function<void()> inv_var_to_var) {
+ mutex_lock lock{dnn_handle_mutex_};
+ auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
+ AsCUDAStreamValue(stream));
+ if (status != CUDNN_STATUS_SUCCESS) {
+ LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
+ return false;
+ }
+
+ ScopedTensorDescriptor x_descriptor{parent_, x_desc,
+ ToCudnnDataType(data_type)};
+ ScopedTensorDescriptor scale_offset_descriptor{parent_, scale_offset_desc,
+ ToCudnnDataType(data_type)};
+ cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
+ float one = 1.0;
+ float zero = 0.0;
+
+ if (is_training) {
+ stream->ThenMemZero(batch_mean, batch_mean->size());
+ stream->ThenMemZero(batch_var, batch_var->size());
+ status = dynload::cudnnBatchNormalizationForwardTraining(
+ parent_, ToHandle(dnn_handle_), mode, &one, &zero,
+ x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
+ scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(), 1.0,
+ batch_mean->opaque(), batch_var->opaque(), epsilon,
+ saved_mean->opaque(), saved_inv_var->opaque());
+#if CUDNN_VERSION < 5000
+ CHECK(inv_var_to_var);
+ inv_var_to_var();
+#endif
+ } else {
+#if CUDNN_VERSION < 5000
+ CHECK(var_to_inv_var);
+ const void* maybe_inv_var = var_to_inv_var().opaque();
+#else
+ const void* maybe_inv_var = estimated_variance.opaque();
+#endif
+ status = dynload::cudnnBatchNormalizationForwardInference(
+ parent_, ToHandle(dnn_handle_), mode, &one, &zero,
+ x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
+ scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(),
+ estimated_mean.opaque(), maybe_inv_var, epsilon);
+ }
+ if (status != CUDNN_STATUS_SUCCESS) {
+ LOG(ERROR) << "failed to enqueue forward batch normalization on stream: "
+ << ToString(status);
+ return false;
+ }
+ return true;
+}
+
+bool CudnnSupport::DoBatchNormalizationBackward(
+ Stream* stream, const DeviceMemory<float>& y_backprop,
+ const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
+ const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
+ DeviceMemory<float>* offset_backprop) {
+ return DoBatchNormalizationBackwardImpl(
+ stream, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean, variance, x_desc,
+ scale_offset_desc, epsilon, x_backprop, scale_backprop, offset_backprop);
+}
+
+template <class T>
+bool CudnnSupport::DoBatchNormalizationBackwardImpl(
+ Stream* stream, int cudnn_type, const DeviceMemory<T>& y_backprop,
+ const DeviceMemory<T>& x, const DeviceMemory<T>& scale,
+ const DeviceMemory<T>& mean, const DeviceMemory<T>& variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<T>* x_backprop, DeviceMemory<T>* scale_backprop,
+ DeviceMemory<T>* offset_backprop) {
+ mutex_lock lock{dnn_handle_mutex_};
+ auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
+ AsCUDAStreamValue(stream));
+ if (status != CUDNN_STATUS_SUCCESS) {
+ LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
+ return false;
+ }
+
+ ScopedTensorDescriptor x_descriptor{parent_, x_desc,
+ static_cast<cudnnDataType_t>(cudnn_type)};
+ ScopedTensorDescriptor scale_offset_descriptor{
+ parent_, scale_offset_desc, static_cast<cudnnDataType_t>(cudnn_type)};
+ cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
+ float one = 1.0;
+ float zero = 0.0;
+
+ status = dynload::cudnnBatchNormalizationBackward(
+ parent_, ToHandle(dnn_handle_), mode, &one, &zero, &one, &zero,
+ x_descriptor.handle(), x.opaque(), x_descriptor.handle(),
+ y_backprop.opaque(), x_descriptor.handle(), x_backprop->opaque(),
+ scale_offset_descriptor.handle(), scale.opaque(),
+ scale_backprop->opaque(), offset_backprop->opaque(), epsilon,
+ mean.opaque(), variance.opaque());
+ if (status != CUDNN_STATUS_SUCCESS) {
+ LOG(ERROR) << "failed to enqueue backward batch normalization on stream: "
+ << ToString(status);
+ return false;
+ }
+ return true;
+}
+
bool CudnnSupport::DoConvolve(
Stream* stream, const BatchDescriptor& batch_descriptor,
const DeviceMemory<float>& input_data,
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index f8bc0c493f..8101ebf258 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -112,6 +112,28 @@ class CudnnSupport : public dnn::DnnSupport {
bool GetConvolveBackwardFilterAlgorithms(
std::vector<dnn::AlgorithmType>* out_algorithms) override;
+ bool DoBatchNormalizationForward(
+ Stream* stream, const DeviceMemory<float>& x,
+ const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+ const DeviceMemory<float>& estimated_mean,
+ const DeviceMemory<float>& estimated_variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<float>* y, DeviceMemory<float>* batch_mean,
+ DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean,
+ DeviceMemory<float>* saved_inv_var, bool is_training,
+ std::function<const DeviceMemory<float>&()> var_to_inv_var,
+ std::function<void()> inv_var_to_var) override;
+
+ bool DoBatchNormalizationBackward(
+ Stream* stream, const DeviceMemory<float>& y_backprop,
+ const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
+ const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
+ DeviceMemory<float>* offset_backprop) override;
+
bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor,
const DeviceMemory<float>& input_data,
const dnn::FilterDescriptor& filter_descriptor,
@@ -379,6 +401,30 @@ class CudnnSupport : public dnn::DnnSupport {
EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_);
template <class T>
+ bool DoBatchNormalizationForwardImpl(
+ Stream* stream, dnn::DataType data_type, const DeviceMemory<T>& x,
+ const DeviceMemory<T>& scale, const DeviceMemory<T>& offset,
+ const DeviceMemory<T>& estimated_mean,
+ const DeviceMemory<T>& estimated_variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<T>* y, DeviceMemory<T>* batch_mean,
+ DeviceMemory<T>* batch_var, DeviceMemory<T>* saved_mean,
+ DeviceMemory<T>* saved_inv_var, bool is_training,
+ std::function<const DeviceMemory<T>&()> var_to_inv_var,
+ std::function<void()> inv_var_to_var);
+
+ template <class T>
+ bool DoBatchNormalizationBackwardImpl(
+ Stream* stream, int cudnn_type, const DeviceMemory<T>& y_backprop,
+ const DeviceMemory<T>& x, const DeviceMemory<T>& scale,
+ const DeviceMemory<T>& mean, const DeviceMemory<T>& variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<T>* x_backprop, DeviceMemory<T>* scale_backprop,
+ DeviceMemory<T>* offset_backprop);
+
+ template <class T>
bool DoConvolveImpl(Stream* stream,
int cudnn_type, // Actually cudnnDataType_t.
const dnn::BatchDescriptor& batch_descriptor,
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 1c31178526..d83d3042d5 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -22,6 +22,7 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_DNN_H_
#define TENSORFLOW_STREAM_EXECUTOR_DNN_H_
+#include <functional>
#include <limits>
#include <memory>
@@ -775,7 +776,8 @@ enum class ElementwiseOperation { kAdd, kMultiply };
string ElementwiseOperationString(ElementwiseOperation op);
// Suite of operations typically used for implementing Deep/Convolutional Neural
-// Nets.
+// Nets. Note: A false return value of an operation indicates the
+// implementation is not available.
class DnnSupport {
public:
DnnSupport() {}
@@ -783,6 +785,79 @@ class DnnSupport {
virtual port::Status Init() = 0;
+ // Performs a single-precision forward batch normalization operation onto
+ // the stream.
+ //
+ // Arguments:
+ // stream: borrowed pointer to the stream that the batch normalization
+ // operation should be enqueued onto.
+ // x: input data.
+ // scale: scaling parameters.
+ // offset: offset parameters.
+ // estimated_mean: population mean estimated during training.
+ // Used for inference only; empty for training.
+  //  estimated_variance: population variance estimated during training,
+ // used for inference only; empty for training.
+ // x_desc: dimensions of the input data, which is the same as the dimensions
+ // of the output.
+ // scale_offset_desc: dimensions of scale and offset.
+ // epsilon: a small floating point number added to the variance of x.
+ // y: output data.
+ // batch_mean: batch mean, to be used to compute the running mean.
+ // batch_variance: batch variance, to be used to compute
+ // the running variance.
+ // reserve_space_1: saved mean, to be reused in the backward gradient
+ // computation.
+ // reserve_space_2: saved variance, to be reused in the backward gradient
+ // computation.
+ // is_training: Set to true for training, false for inference.
+ // var_to_inv_var: a function to convert the variance to inverted variance
+ // for cuDNN v4 forward inference.
+ // inv_var_to_var: a function to convert the inverted variance to
+ // variance for cuDNN v4 forward training, to be used for TensorFlow
+ // to calculate the running variance.
+ virtual bool DoBatchNormalizationForward(
+ Stream* stream, const DeviceMemory<float>& x,
+ const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+ const DeviceMemory<float>& estimated_mean,
+ const DeviceMemory<float>& estimated_variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<float>* y, DeviceMemory<float>* batch_mean,
+ DeviceMemory<float>* batch_var, DeviceMemory<float>* reserve_space_1,
+ DeviceMemory<float>* reserve_space_2, bool is_training,
+ std::function<const DeviceMemory<float>&()> var_to_inv_var,
+ std::function<void()> inv_var_to_var) {
+ return false;
+ }
+
+ // Performs a single-precision backward batch normalization gradient
+ // computation operation onto the stream.
+ //
+ // Arguments:
+ // stream: borrowed pointer to the stream that the batch normalization
+ // gradient computation operation should be enqueued onto.
+ // y_backprop: gradient with regard to output y.
+ // x: input data.
+ // scale: scaling parameters.
+ // x_desc: dimensions of the input data, which is the same as the dimensions
+ // of the output.
+ // scale_offset_desc: dimensions of scale and offset.
+ // epsilon: a small floating point number added to the variance of x.
+ // x_backprop: gradient with respect to input x.
+ // scale_backprop: gradient with respect to scale.
+ // offset_backprop: gradient with respect to offset.
+ virtual bool DoBatchNormalizationBackward(
+ Stream* stream, const DeviceMemory<float>& y_backprop,
+ const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
+ const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
+ DeviceMemory<float>* offset_backprop) {
+ return false;
+ }
+
// Enqueues a single-precision convolution operation onto the stream.
//
// Arguments (all borrowed):
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 8c0e45f1a6..512e882cad 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -277,6 +277,62 @@ Stream &Stream::ThenRecordEvent(Event *event) {
return *this;
}
+Stream &Stream::ThenBatchNormalizationForward(
+ const DeviceMemory<float> &x, const DeviceMemory<float> &scale,
+ const DeviceMemory<float> &offset,
+ const DeviceMemory<float> &estimated_mean,
+ const DeviceMemory<float> &estimated_variance,
+ const dnn::BatchDescriptor &x_desc,
+ const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
+ DeviceMemory<float> *y, DeviceMemory<float> *batch_mean,
+ DeviceMemory<float> *batch_var, DeviceMemory<float> *saved_mean,
+ DeviceMemory<float> *saved_inv_var, bool is_training,
+ std::function<const DeviceMemory<float> &()> var_to_inv_var,
+ std::function<void()> inv_var_to_var) {
+ VLOG_CALL(PARAM(x), PARAM(scale), PARAM(offset), PARAM(x_desc),
+ PARAM(scale_offset_desc), PARAM(epsilon), PARAM(y));
+ if (ok()) {
+ if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+ CheckError(dnn->DoBatchNormalizationForward(
+ this, x, scale, offset, estimated_mean, estimated_variance, x_desc,
+ scale_offset_desc, epsilon, y, batch_mean, batch_var, saved_mean,
+ saved_inv_var, is_training, std::move(var_to_inv_var),
+ std::move(inv_var_to_var)));
+ } else {
+ SetError();
+ LOG(WARNING)
+ << "attempting to perform DNN operation using StreamExecutor "
+ "without DNN support";
+ }
+ }
+ return *this;
+}
+
+Stream &Stream::ThenBatchNormalizationBackward(
+ const DeviceMemory<float> &y_backprop, const DeviceMemory<float> &x,
+ const DeviceMemory<float> &scale, const DeviceMemory<float> &mean,
+ const DeviceMemory<float> &variance, const dnn::BatchDescriptor &x_desc,
+ const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
+ DeviceMemory<float> *x_backprop, DeviceMemory<float> *scale_backprop,
+ DeviceMemory<float> *offset_backprop) {
+ VLOG_CALL(PARAM(y_backprop), PARAM(x), PARAM(scale), PARAM(x_desc),
+ PARAM(scale_offset_desc), PARAM(epsilon), PARAM(x_backprop),
+ PARAM(scale_backprop), PARAM(offset_backprop));
+ if (ok()) {
+ if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+ CheckError(dnn->DoBatchNormalizationBackward(
+ this, y_backprop, x, scale, mean, variance, x_desc, scale_offset_desc,
+ epsilon, x_backprop, scale_backprop, offset_backprop));
+ } else {
+ SetError();
+ LOG(WARNING)
+ << "attempting to perform DNN operation using StreamExecutor "
+ "without DNN support";
+ }
+ }
+ return *this;
+}
+
Stream &Stream::ThenConvolveWithScratch(
const dnn::BatchDescriptor &input_descriptor,
const DeviceMemory<Eigen::half> &input_data,
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 61058528c2..0d16495a1d 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -215,6 +215,27 @@ class Stream {
//
// See DnnSupport::* for comments on the following methods.
+ Stream &ThenBatchNormalizationForward(
+ const DeviceMemory<float> &x, const DeviceMemory<float> &scale,
+ const DeviceMemory<float> &offset,
+ const DeviceMemory<float> &estimated_mean,
+ const DeviceMemory<float> &estimated_variance,
+ const dnn::BatchDescriptor &x_desc,
+ const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
+ DeviceMemory<float> *y, DeviceMemory<float> *batch_mean,
+ DeviceMemory<float> *batch_var, DeviceMemory<float> *saved_mean,
+ DeviceMemory<float> *saved_inv_var, bool is_training,
+ std::function<const DeviceMemory<float> &()> var_to_inv_var,
+ std::function<void()> inv_var_to_var);
+
+ Stream &ThenBatchNormalizationBackward(
+ const DeviceMemory<float> &y_backprop, const DeviceMemory<float> &x,
+ const DeviceMemory<float> &scale, const DeviceMemory<float> &mean,
+ const DeviceMemory<float> &variance, const dnn::BatchDescriptor &x_desc,
+ const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
+ DeviceMemory<float> *x_backprop, DeviceMemory<float> *scale_backprop,
+ DeviceMemory<float> *offset_backprop);
+
// TODO(leary) add double-precision version of this interface.
Stream &ThenConvolve(const dnn::BatchDescriptor &input_descriptor,
const DeviceMemory<float> &input_data,