diff options
author | 2017-09-27 12:58:14 -0700 | |
---|---|---|
committer | 2017-09-27 13:04:57 -0700 | |
commit | 759690f026a1a08b3ac5cc84d8498c05c32b2a7d (patch) | |
tree | 9c7ba12fef51b97226f4e0a07b9aa0eff7fccff1 /tensorflow/stream_executor | |
parent | 20370104cd8adf4c3f9068dfe95bde54cccadfa5 (diff) |
Add float16 support to tf.nn.fused_batch_norm on the GPU.
Scale, offset, mean, and variance must still be float32 if the input is float16.
PiperOrigin-RevId: 170239448
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_dnn.cc | 90 | ||||
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_dnn.h | 56 | ||||
-rw-r--r-- | tensorflow/stream_executor/dnn.h | 32 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.cc | 51 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.h | 23 |
5 files changed, 209 insertions, 43 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 087ae556e7..fc205f61fa 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -2551,24 +2551,44 @@ bool CudnnSupport::DoBatchNormalizationForward( DeviceMemory<float>* saved_inv_var, bool is_training, std::function<const DeviceMemory<float>&()> var_to_inv_var, std::function<void()> inv_var_to_var) { - return DoBatchNormalizationForwardImpl<float>( - stream, dnn::DataType::kFloat, x, scale, offset, estimated_mean, - estimated_variance, x_desc, scale_offset_desc, epsilon, y, batch_mean, - batch_var, saved_mean, saved_inv_var, is_training, + return DoBatchNormalizationForwardImpl<float, float>( + stream, dnn::DataType::kFloat, dnn::DataType::kFloat, x, scale, offset, + estimated_mean, estimated_variance, x_desc, scale_offset_desc, epsilon, y, + batch_mean, batch_var, saved_mean, saved_inv_var, is_training, std::move(var_to_inv_var), std::move(inv_var_to_var)); } -template <class T> +bool CudnnSupport::DoBatchNormalizationForward( + Stream* stream, const DeviceMemory<Eigen::half>& x, + const DeviceMemory<float>& scale, const DeviceMemory<float>& offset, + const DeviceMemory<float>& estimated_mean, + const DeviceMemory<float>& estimated_variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean, + DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean, + DeviceMemory<float>* saved_inv_var, bool is_training, + std::function<const DeviceMemory<float>&()> var_to_inv_var, + std::function<void()> inv_var_to_var) { + return DoBatchNormalizationForwardImpl<Eigen::half, float>( + stream, dnn::DataType::kHalf, dnn::DataType::kFloat, x, scale, offset, + estimated_mean, estimated_variance, x_desc, scale_offset_desc, epsilon, y, + batch_mean, batch_var, saved_mean, saved_inv_var, is_training, + std::move(var_to_inv_var), std::move(inv_var_to_var)); +} + +template <class T, class U> bool CudnnSupport::DoBatchNormalizationForwardImpl( - Stream* stream, dnn::DataType data_type, const DeviceMemory<T>& x, - const DeviceMemory<T>& scale, const DeviceMemory<T>& offset, - const DeviceMemory<T>& estimated_mean, - const DeviceMemory<T>& estimated_variance, + Stream* stream, dnn::DataType input_data_type, + dnn::DataType scale_data_type, const DeviceMemory<T>& x, + const DeviceMemory<U>& scale, const DeviceMemory<U>& offset, + const DeviceMemory<U>& estimated_mean, + const DeviceMemory<U>& estimated_variance, const dnn::BatchDescriptor& x_desc, const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, - DeviceMemory<T>* y, DeviceMemory<T>* batch_mean, DeviceMemory<T>* batch_var, - DeviceMemory<T>* saved_mean, DeviceMemory<T>* saved_inv_var, - bool is_training, std::function<const DeviceMemory<T>&()> var_to_inv_var, + DeviceMemory<T>* y, DeviceMemory<U>* batch_mean, DeviceMemory<U>* batch_var, + DeviceMemory<U>* saved_mean, DeviceMemory<U>* saved_inv_var, + bool is_training, std::function<const DeviceMemory<U>&()> var_to_inv_var, std::function<void()> inv_var_to_var) { mutex_lock lock{dnn_handle_mutex_}; auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_), @@ -2579,9 +2599,9 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl( } ScopedTensorDescriptor x_descriptor{parent_, x_desc, - ToCudnnDataType(data_type)}; - ScopedTensorDescriptor scale_offset_descriptor{parent_, scale_offset_desc, - ToCudnnDataType(data_type)}; + ToCudnnDataType(input_data_type)}; + ScopedTensorDescriptor scale_offset_descriptor{ + parent_, scale_offset_desc, ToCudnnDataType(scale_data_type)}; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; float one = 1.0; float zero = 0.0; @@ -2629,19 +2649,34 @@ bool CudnnSupport::DoBatchNormalizationBackward( DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop, DeviceMemory<float>* offset_backprop) { return DoBatchNormalizationBackwardImpl( - stream, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean, variance, x_desc, - scale_offset_desc, epsilon, x_backprop, scale_backprop, offset_backprop); + stream, CUDNN_DATA_FLOAT, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean, + variance, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop, + offset_backprop); } -template <class T> -bool CudnnSupport::DoBatchNormalizationBackwardImpl( - Stream* stream, int cudnn_type, const DeviceMemory<T>& y_backprop, - const DeviceMemory<T>& x, const DeviceMemory<T>& scale, - const DeviceMemory<T>& mean, const DeviceMemory<T>& variance, +bool CudnnSupport::DoBatchNormalizationBackward( + Stream* stream, const DeviceMemory<Eigen::half>& y_backprop, + const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale, + const DeviceMemory<float>& mean, const DeviceMemory<float>& variance, const dnn::BatchDescriptor& x_desc, const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, - DeviceMemory<T>* x_backprop, DeviceMemory<T>* scale_backprop, - DeviceMemory<T>* offset_backprop) { + DeviceMemory<Eigen::half>* x_backprop, DeviceMemory<float>* scale_backprop, + DeviceMemory<float>* offset_backprop) { + return DoBatchNormalizationBackwardImpl( + stream, CUDNN_DATA_HALF, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean, + variance, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop, + offset_backprop); +} + +template <class T, class U> +bool CudnnSupport::DoBatchNormalizationBackwardImpl( + Stream* stream, int cudnn_input_type, int cudnn_scale_type, + const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x, + const DeviceMemory<U>& scale, const DeviceMemory<U>& mean, + const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop, + DeviceMemory<U>* offset_backprop) { mutex_lock lock{dnn_handle_mutex_}; auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_), AsCUDAStreamValue(stream)); @@ -2650,10 +2685,11 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl( return false; } - ScopedTensorDescriptor x_descriptor{parent_, x_desc, - static_cast<cudnnDataType_t>(cudnn_type)}; + ScopedTensorDescriptor x_descriptor{ + parent_, x_desc, static_cast<cudnnDataType_t>(cudnn_input_type)}; ScopedTensorDescriptor scale_offset_descriptor{ - parent_, scale_offset_desc, static_cast<cudnnDataType_t>(cudnn_type)}; + parent_, scale_offset_desc, + static_cast<cudnnDataType_t>(cudnn_scale_type)}; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; float one = 1.0; float zero = 0.0; diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h index eaf06e179f..beb2f7d050 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.h +++ b/tensorflow/stream_executor/cuda/cuda_dnn.h @@ -169,6 +169,19 @@ class CudnnSupport : public dnn::DnnSupport { std::function<const DeviceMemory<float>&()> var_to_inv_var, std::function<void()> inv_var_to_var) override; + bool DoBatchNormalizationForward( + Stream* stream, const DeviceMemory<Eigen::half>& x, + const DeviceMemory<float>& scale, const DeviceMemory<float>& offset, + const DeviceMemory<float>& estimated_mean, + const DeviceMemory<float>& estimated_variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean, + DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean, + DeviceMemory<float>* saved_inv_var, bool is_training, + std::function<const DeviceMemory<float>&()> var_to_inv_var, + std::function<void()> inv_var_to_var) override; + bool DoBatchNormalizationBackward( Stream* stream, const DeviceMemory<float>& y_backprop, const DeviceMemory<float>& x, const DeviceMemory<float>& scale, @@ -178,6 +191,16 @@ class CudnnSupport : public dnn::DnnSupport { DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop, DeviceMemory<float>* offset_backprop) override; + bool DoBatchNormalizationBackward( + Stream* stream, const DeviceMemory<Eigen::half>& y_backprop, + const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale, + const DeviceMemory<float>& mean, const DeviceMemory<float>& variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<Eigen::half>* x_backprop, + DeviceMemory<float>* scale_backprop, + DeviceMemory<float>* offset_backprop) override; + bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor, const DeviceMemory<float>& input_data, const dnn::FilterDescriptor& filter_descriptor, @@ -553,29 +576,30 @@ class CudnnSupport : public dnn::DnnSupport { std::unique_ptr<TemporaryDeviceMemory<T>>* transform_scratch) EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_); - template <class T> + template <class T, class U> bool DoBatchNormalizationForwardImpl( - Stream* stream, dnn::DataType data_type, const DeviceMemory<T>& x, - const DeviceMemory<T>& scale, const DeviceMemory<T>& offset, - const DeviceMemory<T>& estimated_mean, - const DeviceMemory<T>& estimated_variance, + Stream* stream, dnn::DataType input_data_type, + dnn::DataType scale_data_type, const DeviceMemory<T>& x, + const DeviceMemory<U>& scale, const DeviceMemory<U>& offset, + const DeviceMemory<U>& estimated_mean, + const DeviceMemory<U>& estimated_variance, const dnn::BatchDescriptor& x_desc, const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, - DeviceMemory<T>* y, DeviceMemory<T>* batch_mean, - DeviceMemory<T>* batch_var, DeviceMemory<T>* saved_mean, - DeviceMemory<T>* saved_inv_var, bool is_training, - std::function<const DeviceMemory<T>&()> var_to_inv_var, + DeviceMemory<T>* y, DeviceMemory<U>* batch_mean, + DeviceMemory<U>* batch_var, DeviceMemory<U>* saved_mean, + DeviceMemory<U>* saved_inv_var, bool is_training, + std::function<const DeviceMemory<U>&()> var_to_inv_var, std::function<void()> inv_var_to_var); - template <class T> + template <class T, class U> bool DoBatchNormalizationBackwardImpl( - Stream* stream, int cudnn_type, const DeviceMemory<T>& y_backprop, - const DeviceMemory<T>& x, const DeviceMemory<T>& scale, - const DeviceMemory<T>& mean, const DeviceMemory<T>& variance, - const dnn::BatchDescriptor& x_desc, + Stream* stream, int cudnn_input_type, int cudnn_scale_type, + const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x, + const DeviceMemory<U>& scale, const DeviceMemory<U>& mean, + const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc, const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, - DeviceMemory<T>* x_backprop, DeviceMemory<T>* scale_backprop, - DeviceMemory<T>* offset_backprop); + DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop, + DeviceMemory<U>* offset_backprop); template <class T> bool DoConvolveImpl(Stream* stream, diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index b11c6417be..4beb46090c 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -900,6 +900,23 @@ class DnnSupport { return false; } + // Performs a half-precision forwards batch normalization operation onto the + // stream. See DoBatchNormalizationForward above for argument details. + virtual bool DoBatchNormalizationForward( + Stream* stream, const DeviceMemory<Eigen::half>& x, + const DeviceMemory<float>& scale, const DeviceMemory<float>& offset, + const DeviceMemory<float>& estimated_mean, + const DeviceMemory<float>& estimated_variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean, + DeviceMemory<float>* batch_var, DeviceMemory<float>* reserve_space_1, + DeviceMemory<float>* reserve_space_2, bool is_training, + std::function<const DeviceMemory<float>&()> var_to_inv_var, + std::function<void()> inv_var_to_var) { + return false; + } + // Performs a single-precision backward batch normalization gradient // computation operation onto the stream. // @@ -927,6 +944,21 @@ class DnnSupport { return false; } + // Performs a half-precision backward batch normalization gradient computation + // operation onto the stream. See DoBatchNormalizationBackward above for + // argument details. + virtual bool DoBatchNormalizationBackward( + Stream* stream, const DeviceMemory<Eigen::half>& y_backprop, + const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale, + const DeviceMemory<float>& mean, const DeviceMemory<float>& variance, + const dnn::BatchDescriptor& x_desc, + const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, + DeviceMemory<Eigen::half>* x_backprop, + DeviceMemory<float>* scale_backprop, + DeviceMemory<float>* offset_backprop) { + return false; + } + // Enqueues a fused convolution operation onto the stream. // We provide several variants with different types for inputs, biases and // scaling parameters. diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index dc768e0273..6d756ab191 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -361,6 +361,57 @@ Stream &Stream::ThenBatchNormalizationBackward( return *this; } +Stream &Stream::ThenBatchNormalizationForward( + const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale, + const DeviceMemory<float> &offset, + const DeviceMemory<float> &estimated_mean, + const DeviceMemory<float> &estimated_variance, + const dnn::BatchDescriptor &x_desc, + const dnn::BatchDescriptor &scale_offset_desc, const double epsilon, + DeviceMemory<Eigen::half> *y, DeviceMemory<float> *batch_mean, + DeviceMemory<float> *batch_var, DeviceMemory<float> *saved_mean, + DeviceMemory<float> *saved_inv_var, bool is_training, + std::function<const DeviceMemory<float> &()> var_to_inv_var, + std::function<void()> inv_var_to_var) { + VLOG_CALL(PARAM(x), PARAM(scale), PARAM(offset), PARAM(x_desc), + PARAM(scale_offset_desc), PARAM(epsilon), PARAM(y)); + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoBatchNormalizationForward( + this, x, scale, offset, estimated_mean, estimated_variance, x_desc, + scale_offset_desc, epsilon, y, batch_mean, batch_var, saved_mean, + saved_inv_var, is_training, std::move(var_to_inv_var), + std::move(inv_var_to_var))); + } else { + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenBatchNormalizationBackward( + const DeviceMemory<Eigen::half> &y_backprop, + const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale, + const DeviceMemory<float> &mean, const DeviceMemory<float> &variance, + const dnn::BatchDescriptor &x_desc, + const dnn::BatchDescriptor &scale_offset_desc, const double epsilon, + DeviceMemory<Eigen::half> *x_backprop, DeviceMemory<float> *scale_backprop, + DeviceMemory<float> *offset_backprop) { + VLOG_CALL(PARAM(y_backprop), PARAM(x), PARAM(scale), PARAM(x_desc), + PARAM(scale_offset_desc), PARAM(epsilon), PARAM(x_backprop), + PARAM(scale_backprop), PARAM(offset_backprop)); + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoBatchNormalizationBackward( + this, y_backprop, x, scale, mean, variance, x_desc, scale_offset_desc, + epsilon, x_backprop, scale_backprop, offset_backprop)); + } else { + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + Stream &Stream::ThenFusedConvolveWithScratch( const dnn::BatchDescriptor &conv_input_descriptor, const DeviceMemory<int8> &conv_input_data, float conv_input_scale, diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index 98484eb850..a72ee804c1 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -239,6 +239,29 @@ class Stream { DeviceMemory<float> *x_backprop, DeviceMemory<float> *scale_backprop, DeviceMemory<float> *offset_backprop); + Stream &ThenBatchNormalizationForward( + const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale, + const DeviceMemory<float> &offset, + const DeviceMemory<float> &estimated_mean, + const DeviceMemory<float> &estimated_variance, + const dnn::BatchDescriptor &x_desc, + const dnn::BatchDescriptor &scale_offset_desc, const double epsilon, + DeviceMemory<Eigen::half> *y, DeviceMemory<float> *batch_mean, + DeviceMemory<float> *batch_var, DeviceMemory<float> *saved_mean, + DeviceMemory<float> *saved_inv_var, bool is_training, + std::function<const DeviceMemory<float> &()> var_to_inv_var, + std::function<void()> inv_var_to_var); + + Stream &ThenBatchNormalizationBackward( + const DeviceMemory<Eigen::half> &y_backprop, + const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale, + const DeviceMemory<float> &mean, const DeviceMemory<float> &variance, + const dnn::BatchDescriptor &x_desc, + const dnn::BatchDescriptor &scale_offset_desc, const double epsilon, + DeviceMemory<Eigen::half> *x_backprop, + DeviceMemory<float> *scale_backprop, + DeviceMemory<float> *offset_backprop); + // TODO(leary) add double-precision version of this interface. Stream &ThenFusedConvolve( const dnn::BatchDescriptor &conv_input_descriptor, |