aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor
diff options
context:
space:
mode:
authorGravatar Reed Wanderman-Milne <reedwm@google.com>2017-09-27 12:58:14 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-09-27 13:04:57 -0700
commit759690f026a1a08b3ac5cc84d8498c05c32b2a7d (patch)
tree9c7ba12fef51b97226f4e0a07b9aa0eff7fccff1 /tensorflow/stream_executor
parent20370104cd8adf4c3f9068dfe95bde54cccadfa5 (diff)
Add float16 support to tf.nn.fused_batch_norm on the GPU.
Scale, offset, mean, and variance must still be float32 if the input is float16. PiperOrigin-RevId: 170239448
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r--tensorflow/stream_executor/cuda/cuda_dnn.cc90
-rw-r--r--tensorflow/stream_executor/cuda/cuda_dnn.h56
-rw-r--r--tensorflow/stream_executor/dnn.h32
-rw-r--r--tensorflow/stream_executor/stream.cc51
-rw-r--r--tensorflow/stream_executor/stream.h23
5 files changed, 209 insertions, 43 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 087ae556e7..fc205f61fa 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -2551,24 +2551,44 @@ bool CudnnSupport::DoBatchNormalizationForward(
DeviceMemory<float>* saved_inv_var, bool is_training,
std::function<const DeviceMemory<float>&()> var_to_inv_var,
std::function<void()> inv_var_to_var) {
- return DoBatchNormalizationForwardImpl<float>(
- stream, dnn::DataType::kFloat, x, scale, offset, estimated_mean,
- estimated_variance, x_desc, scale_offset_desc, epsilon, y, batch_mean,
- batch_var, saved_mean, saved_inv_var, is_training,
+ return DoBatchNormalizationForwardImpl<float, float>(
+ stream, dnn::DataType::kFloat, dnn::DataType::kFloat, x, scale, offset,
+ estimated_mean, estimated_variance, x_desc, scale_offset_desc, epsilon, y,
+ batch_mean, batch_var, saved_mean, saved_inv_var, is_training,
std::move(var_to_inv_var), std::move(inv_var_to_var));
}
-template <class T>
+bool CudnnSupport::DoBatchNormalizationForward(
+ Stream* stream, const DeviceMemory<Eigen::half>& x,
+ const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+ const DeviceMemory<float>& estimated_mean,
+ const DeviceMemory<float>& estimated_variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean,
+ DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean,
+ DeviceMemory<float>* saved_inv_var, bool is_training,
+ std::function<const DeviceMemory<float>&()> var_to_inv_var,
+ std::function<void()> inv_var_to_var) {
+ return DoBatchNormalizationForwardImpl<Eigen::half, float>(
+ stream, dnn::DataType::kHalf, dnn::DataType::kFloat, x, scale, offset,
+ estimated_mean, estimated_variance, x_desc, scale_offset_desc, epsilon, y,
+ batch_mean, batch_var, saved_mean, saved_inv_var, is_training,
+ std::move(var_to_inv_var), std::move(inv_var_to_var));
+}
+
+template <class T, class U>
bool CudnnSupport::DoBatchNormalizationForwardImpl(
- Stream* stream, dnn::DataType data_type, const DeviceMemory<T>& x,
- const DeviceMemory<T>& scale, const DeviceMemory<T>& offset,
- const DeviceMemory<T>& estimated_mean,
- const DeviceMemory<T>& estimated_variance,
+ Stream* stream, dnn::DataType input_data_type,
+ dnn::DataType scale_data_type, const DeviceMemory<T>& x,
+ const DeviceMemory<U>& scale, const DeviceMemory<U>& offset,
+ const DeviceMemory<U>& estimated_mean,
+ const DeviceMemory<U>& estimated_variance,
const dnn::BatchDescriptor& x_desc,
const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
- DeviceMemory<T>* y, DeviceMemory<T>* batch_mean, DeviceMemory<T>* batch_var,
- DeviceMemory<T>* saved_mean, DeviceMemory<T>* saved_inv_var,
- bool is_training, std::function<const DeviceMemory<T>&()> var_to_inv_var,
+ DeviceMemory<T>* y, DeviceMemory<U>* batch_mean, DeviceMemory<U>* batch_var,
+ DeviceMemory<U>* saved_mean, DeviceMemory<U>* saved_inv_var,
+ bool is_training, std::function<const DeviceMemory<U>&()> var_to_inv_var,
std::function<void()> inv_var_to_var) {
mutex_lock lock{dnn_handle_mutex_};
auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
@@ -2579,9 +2599,9 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
}
ScopedTensorDescriptor x_descriptor{parent_, x_desc,
- ToCudnnDataType(data_type)};
- ScopedTensorDescriptor scale_offset_descriptor{parent_, scale_offset_desc,
- ToCudnnDataType(data_type)};
+ ToCudnnDataType(input_data_type)};
+ ScopedTensorDescriptor scale_offset_descriptor{
+ parent_, scale_offset_desc, ToCudnnDataType(scale_data_type)};
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
float one = 1.0;
float zero = 0.0;
@@ -2629,19 +2649,34 @@ bool CudnnSupport::DoBatchNormalizationBackward(
DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
DeviceMemory<float>* offset_backprop) {
return DoBatchNormalizationBackwardImpl(
- stream, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean, variance, x_desc,
- scale_offset_desc, epsilon, x_backprop, scale_backprop, offset_backprop);
+ stream, CUDNN_DATA_FLOAT, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean,
+ variance, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
+ offset_backprop);
}
-template <class T>
-bool CudnnSupport::DoBatchNormalizationBackwardImpl(
- Stream* stream, int cudnn_type, const DeviceMemory<T>& y_backprop,
- const DeviceMemory<T>& x, const DeviceMemory<T>& scale,
- const DeviceMemory<T>& mean, const DeviceMemory<T>& variance,
+bool CudnnSupport::DoBatchNormalizationBackward(
+ Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
+ const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
+ const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
const dnn::BatchDescriptor& x_desc,
const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
- DeviceMemory<T>* x_backprop, DeviceMemory<T>* scale_backprop,
- DeviceMemory<T>* offset_backprop) {
+ DeviceMemory<Eigen::half>* x_backprop, DeviceMemory<float>* scale_backprop,
+ DeviceMemory<float>* offset_backprop) {
+ return DoBatchNormalizationBackwardImpl(
+ stream, CUDNN_DATA_HALF, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean,
+ variance, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
+ offset_backprop);
+}
+
+template <class T, class U>
+bool CudnnSupport::DoBatchNormalizationBackwardImpl(
+ Stream* stream, int cudnn_input_type, int cudnn_scale_type,
+ const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
+ const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
+ const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
+ DeviceMemory<U>* offset_backprop) {
mutex_lock lock{dnn_handle_mutex_};
auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
AsCUDAStreamValue(stream));
@@ -2650,10 +2685,11 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
return false;
}
- ScopedTensorDescriptor x_descriptor{parent_, x_desc,
- static_cast<cudnnDataType_t>(cudnn_type)};
+ ScopedTensorDescriptor x_descriptor{
+ parent_, x_desc, static_cast<cudnnDataType_t>(cudnn_input_type)};
ScopedTensorDescriptor scale_offset_descriptor{
- parent_, scale_offset_desc, static_cast<cudnnDataType_t>(cudnn_type)};
+ parent_, scale_offset_desc,
+ static_cast<cudnnDataType_t>(cudnn_scale_type)};
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
float one = 1.0;
float zero = 0.0;
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index eaf06e179f..beb2f7d050 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -169,6 +169,19 @@ class CudnnSupport : public dnn::DnnSupport {
std::function<const DeviceMemory<float>&()> var_to_inv_var,
std::function<void()> inv_var_to_var) override;
+ bool DoBatchNormalizationForward(
+ Stream* stream, const DeviceMemory<Eigen::half>& x,
+ const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+ const DeviceMemory<float>& estimated_mean,
+ const DeviceMemory<float>& estimated_variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean,
+ DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean,
+ DeviceMemory<float>* saved_inv_var, bool is_training,
+ std::function<const DeviceMemory<float>&()> var_to_inv_var,
+ std::function<void()> inv_var_to_var) override;
+
bool DoBatchNormalizationBackward(
Stream* stream, const DeviceMemory<float>& y_backprop,
const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
@@ -178,6 +191,16 @@ class CudnnSupport : public dnn::DnnSupport {
DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
DeviceMemory<float>* offset_backprop) override;
+ bool DoBatchNormalizationBackward(
+ Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
+ const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
+ const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<Eigen::half>* x_backprop,
+ DeviceMemory<float>* scale_backprop,
+ DeviceMemory<float>* offset_backprop) override;
+
bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
const DeviceMemory<float>& input_data,
const dnn::FilterDescriptor& filter_descriptor,
@@ -553,29 +576,30 @@ class CudnnSupport : public dnn::DnnSupport {
std::unique_ptr<TemporaryDeviceMemory<T>>* transform_scratch)
EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_);
- template <class T>
+ template <class T, class U>
bool DoBatchNormalizationForwardImpl(
- Stream* stream, dnn::DataType data_type, const DeviceMemory<T>& x,
- const DeviceMemory<T>& scale, const DeviceMemory<T>& offset,
- const DeviceMemory<T>& estimated_mean,
- const DeviceMemory<T>& estimated_variance,
+ Stream* stream, dnn::DataType input_data_type,
+ dnn::DataType scale_data_type, const DeviceMemory<T>& x,
+ const DeviceMemory<U>& scale, const DeviceMemory<U>& offset,
+ const DeviceMemory<U>& estimated_mean,
+ const DeviceMemory<U>& estimated_variance,
const dnn::BatchDescriptor& x_desc,
const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
- DeviceMemory<T>* y, DeviceMemory<T>* batch_mean,
- DeviceMemory<T>* batch_var, DeviceMemory<T>* saved_mean,
- DeviceMemory<T>* saved_inv_var, bool is_training,
- std::function<const DeviceMemory<T>&()> var_to_inv_var,
+ DeviceMemory<T>* y, DeviceMemory<U>* batch_mean,
+ DeviceMemory<U>* batch_var, DeviceMemory<U>* saved_mean,
+ DeviceMemory<U>* saved_inv_var, bool is_training,
+ std::function<const DeviceMemory<U>&()> var_to_inv_var,
std::function<void()> inv_var_to_var);
- template <class T>
+ template <class T, class U>
bool DoBatchNormalizationBackwardImpl(
- Stream* stream, int cudnn_type, const DeviceMemory<T>& y_backprop,
- const DeviceMemory<T>& x, const DeviceMemory<T>& scale,
- const DeviceMemory<T>& mean, const DeviceMemory<T>& variance,
- const dnn::BatchDescriptor& x_desc,
+ Stream* stream, int cudnn_input_type, int cudnn_scale_type,
+ const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
+ const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
+ const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc,
const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
- DeviceMemory<T>* x_backprop, DeviceMemory<T>* scale_backprop,
- DeviceMemory<T>* offset_backprop);
+ DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
+ DeviceMemory<U>* offset_backprop);
template <class T>
bool DoConvolveImpl(Stream* stream,
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index b11c6417be..4beb46090c 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -900,6 +900,23 @@ class DnnSupport {
return false;
}
+ // Performs a half-precision forwards batch normalization operation onto the
+ // stream. See DoBatchNormalizationForward above for argument details.
+ virtual bool DoBatchNormalizationForward(
+ Stream* stream, const DeviceMemory<Eigen::half>& x,
+ const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+ const DeviceMemory<float>& estimated_mean,
+ const DeviceMemory<float>& estimated_variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean,
+ DeviceMemory<float>* batch_var, DeviceMemory<float>* reserve_space_1,
+ DeviceMemory<float>* reserve_space_2, bool is_training,
+ std::function<const DeviceMemory<float>&()> var_to_inv_var,
+ std::function<void()> inv_var_to_var) {
+ return false;
+ }
+
// Performs a single-precision backward batch normalization gradient
// computation operation onto the stream.
//
@@ -927,6 +944,21 @@ class DnnSupport {
return false;
}
+ // Performs a half-precision backward batch normalization gradient computation
+ // operation onto the stream. See DoBatchNormalizationBackward above for
+ // argument details.
+ virtual bool DoBatchNormalizationBackward(
+ Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
+ const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
+ const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+ const dnn::BatchDescriptor& x_desc,
+ const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+ DeviceMemory<Eigen::half>* x_backprop,
+ DeviceMemory<float>* scale_backprop,
+ DeviceMemory<float>* offset_backprop) {
+ return false;
+ }
+
// Enqueues a fused convolution operation onto the stream.
// We provide several variants with different types for inputs, biases and
// scaling parameters.
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index dc768e0273..6d756ab191 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -361,6 +361,57 @@ Stream &Stream::ThenBatchNormalizationBackward(
return *this;
}
+Stream &Stream::ThenBatchNormalizationForward(
+ const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale,
+ const DeviceMemory<float> &offset,
+ const DeviceMemory<float> &estimated_mean,
+ const DeviceMemory<float> &estimated_variance,
+ const dnn::BatchDescriptor &x_desc,
+ const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
+ DeviceMemory<Eigen::half> *y, DeviceMemory<float> *batch_mean,
+ DeviceMemory<float> *batch_var, DeviceMemory<float> *saved_mean,
+ DeviceMemory<float> *saved_inv_var, bool is_training,
+ std::function<const DeviceMemory<float> &()> var_to_inv_var,
+ std::function<void()> inv_var_to_var) {
+ VLOG_CALL(PARAM(x), PARAM(scale), PARAM(offset), PARAM(x_desc),
+ PARAM(scale_offset_desc), PARAM(epsilon), PARAM(y));
+ if (ok()) {
+ if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+ CheckError(dnn->DoBatchNormalizationForward(
+ this, x, scale, offset, estimated_mean, estimated_variance, x_desc,
+ scale_offset_desc, epsilon, y, batch_mean, batch_var, saved_mean,
+ saved_inv_var, is_training, std::move(var_to_inv_var),
+ std::move(inv_var_to_var)));
+ } else {
+ SetErrorAndLogNoDnnSupport();
+ }
+ }
+ return *this;
+}
+
+Stream &Stream::ThenBatchNormalizationBackward(
+ const DeviceMemory<Eigen::half> &y_backprop,
+ const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale,
+ const DeviceMemory<float> &mean, const DeviceMemory<float> &variance,
+ const dnn::BatchDescriptor &x_desc,
+ const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
+ DeviceMemory<Eigen::half> *x_backprop, DeviceMemory<float> *scale_backprop,
+ DeviceMemory<float> *offset_backprop) {
+ VLOG_CALL(PARAM(y_backprop), PARAM(x), PARAM(scale), PARAM(x_desc),
+ PARAM(scale_offset_desc), PARAM(epsilon), PARAM(x_backprop),
+ PARAM(scale_backprop), PARAM(offset_backprop));
+ if (ok()) {
+ if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+ CheckError(dnn->DoBatchNormalizationBackward(
+ this, y_backprop, x, scale, mean, variance, x_desc, scale_offset_desc,
+ epsilon, x_backprop, scale_backprop, offset_backprop));
+ } else {
+ SetErrorAndLogNoDnnSupport();
+ }
+ }
+ return *this;
+}
+
Stream &Stream::ThenFusedConvolveWithScratch(
const dnn::BatchDescriptor &conv_input_descriptor,
const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 98484eb850..a72ee804c1 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -239,6 +239,29 @@ class Stream {
DeviceMemory<float> *x_backprop, DeviceMemory<float> *scale_backprop,
DeviceMemory<float> *offset_backprop);
+ Stream &ThenBatchNormalizationForward(
+ const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale,
+ const DeviceMemory<float> &offset,
+ const DeviceMemory<float> &estimated_mean,
+ const DeviceMemory<float> &estimated_variance,
+ const dnn::BatchDescriptor &x_desc,
+ const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
+ DeviceMemory<Eigen::half> *y, DeviceMemory<float> *batch_mean,
+ DeviceMemory<float> *batch_var, DeviceMemory<float> *saved_mean,
+ DeviceMemory<float> *saved_inv_var, bool is_training,
+ std::function<const DeviceMemory<float> &()> var_to_inv_var,
+ std::function<void()> inv_var_to_var);
+
+ Stream &ThenBatchNormalizationBackward(
+ const DeviceMemory<Eigen::half> &y_backprop,
+ const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale,
+ const DeviceMemory<float> &mean, const DeviceMemory<float> &variance,
+ const dnn::BatchDescriptor &x_desc,
+ const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
+ DeviceMemory<Eigen::half> *x_backprop,
+ DeviceMemory<float> *scale_backprop,
+ DeviceMemory<float> *offset_backprop);
+
// TODO(leary) add double-precision version of this interface.
Stream &ThenFusedConvolve(
const dnn::BatchDescriptor &conv_input_descriptor,