aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2018-05-18 02:37:52 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-05-18 02:41:48 -0700
commit4b052cc98201a9f07ff9e451913a8adfbb74ab11 (patch)
tree3b4877629509ac2311a967395d7dddfb9a69c394 /tensorflow/stream_executor
parent609b2ce3fe8ebecf4031670b8c2186468369b0ba (diff)
Dropping support for cuDNN < 6.
Enable CUDNN_FFT_TILING_FORWARD for cuDNN >= 7. PiperOrigin-RevId: 197118212
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r--tensorflow/stream_executor/cuda/cuda_dnn.cc245
1 files changed, 29 insertions, 216 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 7ace7fd303..d82d36c691 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -53,6 +53,8 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin);
namespace {
+static_assert(CUDNN_VERSION >= 6000, "cuDNN needs to be version 6.0 or higher");
+
// Converts (via narrowing) a type T value to a type U, and checks that the
// value has no value change due to the conversion.
template <typename WideT, typename NarrowT>
@@ -93,7 +95,6 @@ string ToString(cudnnStatus_t status) {
}
}
-#if CUDNN_VERSION >= 6000
string ToString(libraryPropertyType type) {
switch (type) {
case MAJOR_VERSION:
@@ -107,7 +108,6 @@ string ToString(libraryPropertyType type) {
"<unknown libraryPropertyType: ", static_cast<int>(type), ">");
}
}
-#endif
template <typename T>
cudnnDataType_t GetCudnnDataType();
@@ -213,12 +213,8 @@ cudnnConvolutionFwdAlgo_t ToConvForwardAlgo(dnn::AlgorithmDesc algorithm) {
case CUDNN_CONVOLUTION_FWD_ALGO_DIRECT:
case CUDNN_CONVOLUTION_FWD_ALGO_FFT:
case CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
-#if CUDNN_VERSION >= 5000
case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
-#endif
-#if CUDNN_VERSION >= 5100
case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
-#endif
return algo;
default:
LOG(FATAL) << "Unsupported Cudnn convolution forward algorithm: "
@@ -235,12 +231,8 @@ cudnnConvolutionBwdDataAlgo_t ToConvBackwardDataAlgo(
case CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
-#if CUDNN_VERSION >= 5000
case CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
-#endif
-#if CUDNN_VERSION >= 5100
case CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED:
-#endif
return algo;
default:
LOG(FATAL)
@@ -258,11 +250,12 @@ cudnnConvolutionBwdFilterAlgo_t ToConvBackwardFilterAlgo(
case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
-#if CUDNN_VERSION >= 5100
// Based on cudnn.h, the following is not implemented.
// case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD:
case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED:
-#endif
+ // Produces incorrect results for some shapes. Disabled for now, see
+ // NVIDIA bug 2072856. TODO(csigg): Only disable for subset of shapes.
+ // case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING:
return algo;
default:
LOG(FATAL)
@@ -271,7 +264,6 @@ cudnnConvolutionBwdFilterAlgo_t ToConvBackwardFilterAlgo(
}
}
-#if CUDNN_VERSION >= 6000
port::Status GetCudnnProperty(libraryPropertyType type, int* value) {
cudnnStatus_t status = cudnnGetProperty(type, value);
if (status != CUDNN_STATUS_SUCCESS) {
@@ -300,19 +292,11 @@ cudnnRNNAlgo_t ToCudnnRNNAlgo(const dnn::AlgorithmDesc& algorithm) {
}
}
}
-#endif
port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
-#if CUDNN_VERSION >= 6000
TF_RETURN_IF_ERROR(GetCudnnProperty(MAJOR_VERSION, &version->major_version));
TF_RETURN_IF_ERROR(GetCudnnProperty(MINOR_VERSION, &version->minor_version));
TF_RETURN_IF_ERROR(GetCudnnProperty(PATCH_LEVEL, &version->patch_level));
-#else
- size_t loaded_version = ::cudnnGetVersion();
- version->major_version = loaded_version / 1000;
- version->minor_version = (loaded_version / 100) % 10;
- version->patch_level = loaded_version % 100;
-#endif
return port::Status::OK();
}
@@ -418,7 +402,6 @@ class ScopedTensorDescriptor {
<< " to cudnn tensor descriptor: " << ToString(status);
}
} break;
-#if CUDNN_VERSION >= 6000
case dnn::DataLayout::kBatchDepthYX4: {
status = cudnnSetTensor4dDescriptor(
handle_, CUDNN_TENSOR_NCHW_VECT_C, elem_type,
@@ -430,7 +413,6 @@ class ScopedTensorDescriptor {
<< " to cudnn tensor descriptor: " << ToString(status);
}
} break;
-#endif
default:
LOG(FATAL) << "Unsupported tensor format "
<< DataLayoutString(batch_descriptor.layout());
@@ -466,7 +448,6 @@ class ScopedFilterDescriptor {
<< ToString(status);
}
-#if CUDNN_VERSION >= 5000
// TODO(b/23032134): Even if the filter layout is not supported,
// cudnnSetFilter4DDescriptor_v4 will return CUDNN_STATUS_SUCCESS because it
// does not take layout as an input. Maybe force cuDNN by giving wrong
@@ -476,17 +457,14 @@ class ScopedFilterDescriptor {
case dnn::FilterLayout::kOutputInputYX:
format = CUDNN_TENSOR_NCHW;
break;
-#if CUDNN_VERSION >= 6000
case dnn::FilterLayout::kOutputInputYX4:
format = CUDNN_TENSOR_NCHW_VECT_C;
break;
-#endif
default:
LOG(FATAL) << "Unsupported filter format "
<< FilterLayoutString(filter_descriptor.layout());
break;
}
-#endif
std::vector<int> dims(2 + filter_descriptor.ndims());
dims[0] = filter_descriptor.output_feature_map_count();
@@ -494,11 +472,8 @@ class ScopedFilterDescriptor {
const auto& spatial_dims = filter_descriptor.input_filter_dims();
std::copy(spatial_dims.begin(), spatial_dims.end(), dims.begin() + 2);
- status = cudnnSetFilterNdDescriptor(handle_, elem_type,
-#if CUDNN_VERSION >= 5000
- format,
-#endif
- dims.size(), dims.data());
+ status = cudnnSetFilterNdDescriptor(handle_, elem_type, format, dims.size(),
+ dims.data());
if (status != CUDNN_STATUS_SUCCESS) {
LOG(FATAL) << "could not set cudnn filter descriptor: "
<< ToString(status);
@@ -692,10 +667,8 @@ class ScopedPoolingDescriptor {
(pooling_descriptor.mode() == dnn::PoolingMode::kMaximum
? CUDNN_POOLING_MAX
: CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
-#if CUDNN_VERSION >= 5000
- propagate_nans ? CUDNN_PROPAGATE_NAN : CUDNN_NOT_PROPAGATE_NAN,
-#endif
- nd, shape.data(), padding.data(), strides.data());
+ propagate_nans ? CUDNN_PROPAGATE_NAN : CUDNN_NOT_PROPAGATE_NAN, nd,
+ shape.data(), padding.data(), strides.data());
if (status != CUDNN_STATUS_SUCCESS) {
LOG(FATAL) << "could not set cudnn pooling descriptor: "
<< ToString(status);
@@ -771,7 +744,6 @@ class ScopedNormalizeDescriptor {
SE_DISALLOW_COPY_AND_ASSIGN(ScopedNormalizeDescriptor);
};
-#if CUDNN_VERSION >= 5000
// Turns a ActivationDescriptor structure into a cudnn activation
// descriptor handle within a scope.
class ScopedActivationDescriptor {
@@ -834,7 +806,6 @@ class ScopedActivationDescriptor {
SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivationDescriptor);
};
-#endif
cudnnDataType_t ToCudnnDataType(
dnn::DataType data_type,
@@ -844,18 +815,14 @@ cudnnDataType_t ToCudnnDataType(
case dnn::DataType::kDouble:
case dnn::DataType::kHalf:
return static_cast<cudnnDataType_t>(data_type);
-#if CUDNN_VERSION >= 6000
case dnn::DataType::kInt8:
return data_layout == dnn::DataLayout::kBatchDepthYX4 ? CUDNN_DATA_INT8x4
: CUDNN_DATA_INT8;
-#endif
default:
LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
}
}
-#if CUDNN_VERSION >= 5000
-
cudnnRNNInputMode_t ToCudnnRnnInputMode(dnn::RnnInputMode input_mode) {
switch (input_mode) {
case dnn::RnnInputMode::kRnnLinearSkip:
@@ -903,15 +870,11 @@ int CudnnDataTypeToByteSize(cudnnDataType_t data_type) {
}
}
-#endif // CUDNN_VERSION
-
template <typename Base>
class MixinBase : public Base {};
template <>
class MixinBase<void> {};
-#if CUDNN_VERSION >= 5000
-
#define CUDNN_RETURN_IF_FAIL(STATUS, ...) \
if (!SE_PREDICT_TRUE((STATUS) == CUDNN_STATUS_SUCCESS)) { \
string error_msg = port::StrCat(ToString(STATUS), " ", __VA_ARGS__); \
@@ -1042,9 +1005,7 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
hidden_size_(hidden_size),
input_size_(input_size),
batch_size_(batch_size),
-#if CUDNN_VERSION >= 6000
rnn_plan_(nullptr),
-#endif
input_mode_(input_mode),
direction_mode_(direction_mode),
rnn_mode_(rnn_mode),
@@ -1062,7 +1023,6 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
// Create the RNN handle
cudnnStatus_t status = cudnnCreateRNNDescriptor(&rnn_desc_);
CUDNN_RETURN_IF_FAIL(status, "Unable to create RNN descriptor");
-#if CUDNN_VERSION >= 6000
// TODO: allow the user to choose an algorithm.
rnn_algo_ = ToCudnnRNNAlgo(algorithm_config_.algorithm());
status = cudnnSetRNNDescriptor_v6(
@@ -1084,16 +1044,6 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
status = cudnnSetPersistentRNNPlan(rnn_desc_, rnn_plan_);
CUDNN_RETURN_IF_FAIL(status, "Unable to update persistent RNN plan.");
}
-#else
- CHECK(algorithm_config_.is_default())
- << "Non-default algorithm not supported for CUDA version < 6.0";
- status = cudnnSetRNNDescriptor(
- /*rnnDesc=*/rnn_desc_, /*hiddenSize=*/hidden_size,
- /*numLayers=*/num_layers, /*dropoutDesc=*/dropout_handle(),
- /*inputMode=*/input_mode, /*direction=*/direction_mode,
- /*mode=*/rnn_mode, /*dataType=*/compute_type);
- CUDNN_RETURN_IF_FAIL(status, "Unable to update RNN descriptor");
-#endif
// Create the params handle.
cudnn_params_desc_.reset(new CudnnRnnParamsDescriptor(cudnn, *this));
@@ -1106,12 +1056,10 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
~CudnnRnnDescriptor() override {
if (rnn_desc_) {
cudnnStatus_t status;
-#if CUDNN_VERSION >= 6000
if (rnn_algo_ == CUDNN_RNN_ALGO_PERSIST_DYNAMIC && rnn_plan_) {
status = cudnnDestroyPersistentRNNPlan(rnn_plan_);
CUDNN_RETURN_IF_FAIL(status, "Unable to destroy persistent RNN plan.");
}
-#endif
status = cudnnDestroyRNNDescriptor(rnn_desc_);
CUDNN_RETURN_IF_FAIL(status, "Unable to destroy RNN descriptor");
}
@@ -1172,10 +1120,8 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
// batch_size_ is set to -1 when not using CUDNN_RNN_ALGO_PERSIST_DYNAMIC
// algorithm.
int batch_size_;
-#if CUDNN_VERSION >= 6000
cudnnRNNAlgo_t rnn_algo_;
cudnnPersistentRNNPlan_t rnn_plan_;
-#endif
cudnnRNNInputMode_t input_mode_;
cudnnDirectionMode_t direction_mode_;
cudnnRNNMode_t rnn_mode_;
@@ -1806,8 +1752,6 @@ bool CudnnSupport::DoRnnBackwardImpl(
return true;
}
-#endif // CUDNN_VERSION
-
port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
CudnnSupport::createRnnDescriptor(
int num_layers, int hidden_size, int input_size, int batch_size,
@@ -1815,7 +1759,6 @@ CudnnSupport::createRnnDescriptor(
dnn::RnnMode rnn_mode, dnn::DataType data_type,
const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
ScratchAllocator* state_allocator) {
-#if CUDNN_VERSION >= 5000
// Setting up a cudnnRNNDescriptor requires a cuDNN handle, but because it's
// not enqueueing anything into a stream, we pass in the null stream.
auto cudnn = cudnn_->GetHandle(parent_, /*stream=*/nullptr);
@@ -1830,20 +1773,12 @@ CudnnSupport::createRnnDescriptor(
}
return port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>(
std::move(rnn_desc));
-#else
- string error_msg =
- port::StrCat("createRnnDescriptor needs at least Cudnn 5.0 to work. ",
- "Current Cudnn version: ", CUDNN_VERSION, ". ");
- LOG(ERROR) << error_msg;
- return port::Status(port::error::UNIMPLEMENTED, error_msg);
-#endif // CUDNN_VERSION
}
port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
CudnnSupport::createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
int data_size,
dnn::DataType data_type) {
-#if CUDNN_VERSION >= 5000
std::unique_ptr<CudnnRnnSequenceTensorDescriptor> seq_desc(
new CudnnRnnSequenceTensorDescriptor(parent_, seq_length, batch_size,
data_size,
@@ -1853,20 +1788,12 @@ CudnnSupport::createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
}
return port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>(
std::move(seq_desc));
-#else
- string error_msg = port::StrCat(
- "createRnnSequenceTensorDescriptor needs at least Cudnn 5.0 to work. ",
- "Current Cudnn version: ", CUDNN_VERSION, ". ");
- LOG(ERROR) << error_msg;
- return port::Status(port::error::UNIMPLEMENTED, error_msg);
-#endif // CUDNN_VERSION
}
port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
CudnnSupport::createRnnStateTensorDescriptor(int num_layer, int batch_size,
int data_size,
dnn::DataType data_type) {
-#if CUDNN_VERSION >= 5000
std::unique_ptr<CudnnRnnStateTensorDescriptor> state_desc(
new CudnnRnnStateTensorDescriptor(parent_, num_layer, batch_size,
data_size, ToCudnnDataType(data_type)));
@@ -1875,13 +1802,6 @@ CudnnSupport::createRnnStateTensorDescriptor(int num_layer, int batch_size,
}
return port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>(
std::move(state_desc));
-#else
- string error_msg = port::StrCat(
- "createRnnStateTensorDescriptor needs at least Cudnn 5.0 to work. ",
- "Current Cudnn version: ", CUDNN_VERSION, ". ");
- LOG(ERROR) << error_msg;
- return port::Status(port::error::UNIMPLEMENTED, error_msg);
-#endif // CUDNN_VERSION
}
bool CudnnSupport::DoRnnForward(
@@ -1902,7 +1822,6 @@ bool CudnnSupport::DoRnnForward(
ScratchAllocator* reserve_space_allocator,
ScratchAllocator* workspace_allocator,
dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
const CudnnRnnDescriptor& cudnn_rnn_desc =
static_cast<const CudnnRnnDescriptor&>(rnn_desc);
const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -1924,9 +1843,6 @@ bool CudnnSupport::DoRnnForward(
output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
output_c_data, is_training, reserve_space_allocator, workspace_allocator,
output_profile_result);
-#else
- return false;
-#endif // CUDNN_VERSION
}
bool CudnnSupport::DoRnnForward(
@@ -1946,7 +1862,6 @@ bool CudnnSupport::DoRnnForward(
ScratchAllocator* reserve_space_allocator,
ScratchAllocator* workspace_allocator,
dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
const CudnnRnnDescriptor& cudnn_rnn_desc =
static_cast<const CudnnRnnDescriptor&>(rnn_desc);
const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -1968,9 +1883,6 @@ bool CudnnSupport::DoRnnForward(
output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
output_c_data, is_training, reserve_space_allocator, workspace_allocator,
output_profile_result);
-#else
- return false;
-#endif // CUDNN_VERSION
}
bool CudnnSupport::DoRnnForward(
@@ -1991,7 +1903,6 @@ bool CudnnSupport::DoRnnForward(
ScratchAllocator* reserve_space_allocator,
ScratchAllocator* workspace_allocator,
dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
const CudnnRnnDescriptor& cudnn_rnn_desc =
static_cast<const CudnnRnnDescriptor&>(rnn_desc);
const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -2013,9 +1924,6 @@ bool CudnnSupport::DoRnnForward(
output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
output_c_data, is_training, reserve_space_allocator, workspace_allocator,
output_profile_result);
-#else
- return false;
-#endif // CUDNN_VERSION
}
bool CudnnSupport::DoRnnBackward(
@@ -2043,7 +1951,6 @@ bool CudnnSupport::DoRnnBackward(
DeviceMemory<uint8>* reserve_space_data,
ScratchAllocator* workspace_allocator,
dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
const CudnnRnnDescriptor& cudnn_rnn_desc =
static_cast<const CudnnRnnDescriptor&>(rnn_desc);
const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -2067,9 +1974,6 @@ bool CudnnSupport::DoRnnBackward(
output_c_backprop_data, input_backprop_data, input_h_backprop_data,
input_c_backprop_data, params_backprop_data, reserve_space_data,
workspace_allocator, output_profile_result);
-#else
- return false;
-#endif // CUDNN_VERSION
}
bool CudnnSupport::DoRnnBackward(
@@ -2096,7 +2000,6 @@ bool CudnnSupport::DoRnnBackward(
DeviceMemory<uint8>* reserve_space_data,
ScratchAllocator* workspace_allocator,
dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
const CudnnRnnDescriptor& cudnn_rnn_desc =
static_cast<const CudnnRnnDescriptor&>(rnn_desc);
const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -2120,9 +2023,6 @@ bool CudnnSupport::DoRnnBackward(
output_c_backprop_data, input_backprop_data, input_h_backprop_data,
input_c_backprop_data, params_backprop_data, reserve_space_data,
workspace_allocator, output_profile_result);
-#else
- return false;
-#endif // CUDNN_VERSION
}
bool CudnnSupport::DoRnnBackward(
@@ -2150,7 +2050,6 @@ bool CudnnSupport::DoRnnBackward(
DeviceMemory<uint8>* reserve_space_data,
ScratchAllocator* workspace_allocator,
dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
const CudnnRnnDescriptor& cudnn_rnn_desc =
static_cast<const CudnnRnnDescriptor&>(rnn_desc);
const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -2174,9 +2073,6 @@ bool CudnnSupport::DoRnnBackward(
output_c_backprop_data, input_backprop_data, input_h_backprop_data,
input_c_backprop_data, params_backprop_data, reserve_space_data,
workspace_allocator, output_profile_result);
-#else
- return false;
-#endif // CUDNN_VERSION
}
namespace {
@@ -2311,16 +2207,12 @@ class CudnnEnvVar {
};
// A helper struct to decide whether to enable the FFT_TILING algorithms for
-// forward convolution. Before cudnn v5.1 it works fine but since cudnn v5.1
-// it is turned off due to memory corruption caused by some shapes with this
-// algorithm.
-// Before NVIDIA fixes the memory corruption bug, users can explicitly
-// enable the algorithm through an env-var "TF_ENABLE_FFT_TILING_FORWARD=1".
+// forward convolution. It is disabled for cuDNN < 7 due to memory corruption
+// caused by some shapes with this algorithm. Users can explicitly enable the
+// algorithm through an env-var "TF_ENABLE_FFT_TILING_FORWARD=1".
struct FftTilingForward {
static constexpr const char* kName = "TF_ENABLE_FFT_TILING_FORWARD";
- // TODO(yangzihao): turn the default to True when the memory corruption bug
- // is fixed.
- static constexpr bool kDefaultFlag = CUDNN_VERSION < 5100;
+ static constexpr bool kDefaultFlag = CUDNN_VERSION >= 7000;
};
// A helper struct to decide whether to enable the WINOGRAD_NONFUSED algorithms.
@@ -2329,10 +2221,9 @@ struct FftTilingForward {
// https://github.com/tensorflow/tensorflow/pull/4901
struct WinogradNonfused {
static constexpr const char* kName = "TF_ENABLE_WINOGRAD_NONFUSED";
- // NVIDIA has fixed winograd nonfused bug for cudnn v>=7.
- // For cudnn v>=5.1, we have a workaround and for any lower version, we
- // disable it by default.
- static constexpr bool kDefaultFlag = CUDNN_VERSION >= 5100;
+ // NVIDIA has fixed winograd nonfused bug for cudnn v>=7. For older versions,
+ // we have a workaround.
+ static constexpr bool kDefaultFlag = true;
};
// A helper struct to decide whether to use FP32 as the internal compute type
@@ -2621,11 +2512,6 @@ bool CudnnSupport::DoFusedConvolveImpl(
DeviceMemory<Type>* output_data, ScratchAllocator* scratch_allocator,
const dnn::AlgorithmConfig& algorithm_config,
dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION < 6000
- LOG(ERROR) << "cudnnConvolutionBiasActivationForward() is only "
- "supported for cuDNN version >= 6";
- return false;
-#else
ScopedTensorDescriptor conv_input_nd(
conv_input_descriptor, static_cast<cudnnDataType_t>(cudnn_data_type));
ScopedTensorDescriptor output_nd(
@@ -2732,32 +2618,27 @@ bool CudnnSupport::DoFusedConvolveImpl(
}
return true;
-#endif // CUDNN_VERSION < 6000
}
bool CudnnSupport::GetConvolveAlgorithms(
bool with_winograd_nonfused, int cc_major, int cc_minor,
std::vector<dnn::AlgorithmDesc>* out_algorithms) {
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
- // clang-format off
+ // clang-format off
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
CUDNN_CONVOLUTION_FWD_ALGO_FFT,
-#if CUDNN_VERSION >= 5000
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
-#endif
- // clang-format on
+ // clang-format on
};
if (CudnnEnvVar<FftTilingForward>::IsEnabled()) {
algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING);
}
-#if CUDNN_VERSION >= 5100
if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED);
}
-#endif
out_algorithms->clear();
for (auto i : algo_types) {
@@ -2772,13 +2653,11 @@ bool CudnnSupport::GetConvolveAlgorithms(
bool CudnnSupport::GetRnnAlgorithms(
std::vector<dnn::AlgorithmDesc>* out_algorithms) {
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
- // clang-format off
-#if CUDNN_VERSION >= 6000
+ // clang-format off
CUDNN_RNN_ALGO_STANDARD,
CUDNN_RNN_ALGO_PERSIST_STATIC,
CUDNN_RNN_ALGO_PERSIST_DYNAMIC,
-#endif
- // clang-format on
+ // clang-format on
};
out_algorithms->clear();
@@ -2797,21 +2676,17 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
bool with_winograd_nonfused, int cc_major, int cc_minor,
std::vector<dnn::AlgorithmDesc>* out_algorithms) {
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
- // clang-format off
+ // clang-format off
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING,
-#if CUDNN_VERSION >= 5000
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD,
-#endif
- // clang-format on
+ // clang-format on
};
-#if CUDNN_VERSION >= 5100
if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED);
}
-#endif
out_algorithms->clear();
for (auto i : algo_types) {
@@ -2834,13 +2709,15 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
// Based on cudnn.h, the following is not implemented.
// CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
+
+ // Produces incorrect results for some shapes. Disabled for now, see
+ // NVIDIA bug 2072856. TODO(csigg): Only disable for subset of shapes.
+ // CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING,
// clang-format on
};
-#if CUDNN_VERSION >= 5100
if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED);
}
-#endif
out_algorithms->clear();
for (auto i : algo_types) {
@@ -2939,17 +2816,8 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
scale.opaque(), offset.opaque(), 1.0, batch_mean_opaque,
batch_var_opaque, epsilon, saved_mean->opaque(),
saved_inv_var->opaque());
-#if CUDNN_VERSION < 5000
- CHECK(inv_var_to_var);
- inv_var_to_var();
-#endif
} else {
-#if CUDNN_VERSION < 5000
- CHECK(var_to_inv_var);
- const void* maybe_inv_var = var_to_inv_var().opaque();
-#else
const void* maybe_inv_var = estimated_variance.opaque();
-#endif
status = cudnnBatchNormalizationForwardInference(
cudnn.handle(), mode, &one, &zero, x_descriptor.handle(), x.opaque(),
x_descriptor.handle(), y->opaque(), scale_offset_descriptor.handle(),
@@ -3159,11 +3027,6 @@ bool CudnnSupport::DoFusedConvolve(
DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
const dnn::AlgorithmConfig& algorithm_config,
dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION < 6000
- LOG(WARNING) << "cudnnConvolutionBiasActivationForward() is only "
- "supported for cuDNN version >= 6";
- return false;
-#else
int cc_major, cc_minor;
stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
&cc_minor);
@@ -3179,7 +3042,6 @@ bool CudnnSupport::DoFusedConvolve(
side_input_scale, bias_descriptor, biases, activation_mode,
output_descriptor, output_data, scratch_allocator, algorithm_config,
output_profile_result);
-#endif
}
namespace {
@@ -3428,13 +3290,8 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
timer->Start(AsCUDAStream(stream));
}
-#if CUDNN_VERSION >= 5000
auto status =
cudnnConvolutionBackwardData(cudnn.handle(),
-#else
- auto status =
- cudnnConvolutionBackwardData_v3(cudnn.handle(),
-#endif
/*alpha=*/alpha,
/*wDesc=*/filter.handle(),
/*w=*/filter_data.opaque(),
@@ -3697,13 +3554,8 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
timer->Start(AsCUDAStream(stream));
}
-#if CUDNN_VERSION >= 5000
auto status = cudnnConvolutionBackwardFilter(
cudnn.handle(),
-#else
- auto status = cudnnConvolutionBackwardFilter_v3(
- cudnn.handle(),
-#endif
/*alpha=*/alpha,
/*srcDesc=*/input_nd.handle(),
/*srcData=*/input_data.opaque(),
@@ -4016,11 +3868,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
auto cudnn = cudnn_->GetHandle(parent_, stream);
-#if CUDNN_VERSION >= 5000
auto status = cudnnAddTensor(
-#else
- auto status = cudnnAddTensor_v3(
-#endif
cudnn.handle(), &alpha, bias_descriptor.handle(), biases.opaque(), &beta,
input_descriptor.handle(), output_data->opaque());
@@ -4038,37 +3886,8 @@ bool CudnnSupport::DoActivate(Stream* stream,
const DeviceMemory<float>& input_data,
DeviceMemory<float>* output_data,
uint64 options) {
-#if CUDNN_VERSION >= 5000
ScopedActivationDescriptor activation_desc(
activation_mode, CUDNN_PROPAGATE_NAN, dimensions.value_max());
-#else
- cudnnActivationMode_t mode;
- switch (activation_mode) {
- case dnn::ActivationMode::kRelu6:
- // TODO(leary) should probably do a post-pass to clip at 6?
- LOG(WARNING) << "user requested Relu6, but providing Relu instead";
- mode = CUDNN_ACTIVATION_RELU;
- break;
- case dnn::ActivationMode::kReluX:
- // TODO(broune) should probably do a post-pass to clip at X?
- LOG(WARNING) << "user requested ReluX, but providing Relu instead";
- mode = CUDNN_ACTIVATION_RELU;
- break;
- case dnn::ActivationMode::kRelu:
- mode = CUDNN_ACTIVATION_RELU;
- break;
- case dnn::ActivationMode::kSigmoid:
- mode = CUDNN_ACTIVATION_SIGMOID;
- break;
- case dnn::ActivationMode::kTanh:
- mode = CUDNN_ACTIVATION_TANH;
- break;
- default:
- LOG(ERROR) << "unrecognized activation mode: "
- << static_cast<int>(activation_mode);
- return false;
- }
-#endif
ScopedTensorDescriptor input_nd(dimensions, CUDNN_DATA_FLOAT);
// Alpha is the input scaling factor.
@@ -4077,15 +3896,9 @@ bool CudnnSupport::DoActivate(Stream* stream,
float beta = 0.0;
auto cudnn = cudnn_->GetHandle(parent_, stream);
- auto status =
- cudnnActivationForward(cudnn.handle(),
-#if CUDNN_VERSION >= 5000
- activation_desc.handle(),
-#else
- mode,
-#endif
- &alpha, input_nd.handle(), input_data.opaque(),
- &beta, input_nd.handle(), output_data->opaque());
+ auto status = cudnnActivationForward(
+ cudnn.handle(), activation_desc.handle(), &alpha, input_nd.handle(),
+ input_data.opaque(), &beta, input_nd.handle(), output_data->opaque());
if (status != CUDNN_STATUS_SUCCESS) {
LOG(ERROR) << "stream " << stream
<< " could not enqueue activation: " << ToString(status);