Diffstat (limited to 'tensorflow/stream_executor/cuda/cuda_dnn.cc')
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_dnn.cc  1074
1 file changed, 1074 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
new file mode 100644
index 0000000000..6e4403512b
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -0,0 +1,1074 @@
#include "tensorflow/stream_executor/cuda/cuda_dnn.h"

#include <dlfcn.h>
#include <functional>

#include "tensorflow/stream_executor/dnn.h"
#include "tensorflow/stream_executor/dso_loader.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/strcat.h"
#include "tensorflow/stream_executor/lib/threadpool.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/cuda/cuda_platform.h"
#include "third_party/gpus/cuda/include/cudnn.h"

namespace {

// Narrows a WideT value to a NarrowT value, and checks that the value is
// unchanged by the conversion.
template <typename WideT, typename NarrowT>
NarrowT CheckedNarrowing(const WideT& wide) {
  NarrowT narrow = wide;
  CHECK_EQ(narrow, wide)
      << "checked narrowing failed; values not equal post-conversion";
  return narrow;
}

}  // namespace
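// Usage sketch (illustrative, not load-bearing): CheckedNarrowing round-trips
// values that fit in the narrow type and CHECK-fails on lossy conversions:
//
//   int ok = CheckedNarrowing<int64, int>(int64{42});  // ok == 42
//   CheckedNarrowing<int64, int>(int64{1} << 40);      // CHECK-fails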
namespace perftools {
namespace gputools {

using dnn::BatchDescriptor;
using dnn::FilterDescriptor;
using dnn::ConvolutionDescriptor;
using dnn::PoolingDescriptor;

namespace cuda {

PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin);

extern CUstream AsCUDAStreamValue(Stream* stream);

string ToString(cudnnStatus_t status) {
  switch (status) {
    case CUDNN_STATUS_SUCCESS:
      return "CUDNN_STATUS_SUCCESS";
    case CUDNN_STATUS_NOT_INITIALIZED:
      return "CUDNN_STATUS_NOT_INITIALIZED";
    case CUDNN_STATUS_ALLOC_FAILED:
      return "CUDNN_STATUS_ALLOC_FAILED";
    case CUDNN_STATUS_BAD_PARAM:
      return "CUDNN_STATUS_BAD_PARAM";
    case CUDNN_STATUS_INTERNAL_ERROR:
      return "CUDNN_STATUS_INTERNAL_ERROR";
    case CUDNN_STATUS_INVALID_VALUE:
      return "CUDNN_STATUS_INVALID_VALUE";
    case CUDNN_STATUS_ARCH_MISMATCH:
      return "CUDNN_STATUS_ARCH_MISMATCH";
    case CUDNN_STATUS_MAPPING_ERROR:
      return "CUDNN_STATUS_MAPPING_ERROR";
    case CUDNN_STATUS_EXECUTION_FAILED:
      return "CUDNN_STATUS_EXECUTION_FAILED";
    case CUDNN_STATUS_NOT_SUPPORTED:
      return "CUDNN_STATUS_NOT_SUPPORTED";
    case CUDNN_STATUS_LICENSE_ERROR:
      return "CUDNN_STATUS_LICENSE_ERROR";
    default:
      return port::StrCat("<unknown cudnn status: ", static_cast<int>(status),
                          ">");
  }
}

namespace dynload {

static port::ThreadPool* InitCudnnThreadpool() {
  port::ThreadPool* cudnn_threadpool_;
  port::ThreadOptions options;
  // TBD(keveman): Conservatively setting the stack size and guard size to
  // 2MB, until we can get some guarantees from NVIDIA on the minimum stack
  // space they will work with.
  options.stack_size = 2 * 1024 * 1024;
  options.guard_size = 2 * 1024 * 1024;
  cudnn_threadpool_ = new port::ThreadPool(port::Env::Default(), options,
                                           "cudnn_threadpool", 1);
  CHECK(cudnn_threadpool_);
  return cudnn_threadpool_;
}

static mutex cudnn_threadpool_mu(LINKER_INITIALIZED);
static port::ThreadPool* GetCudaThreadpool() {
  mutex_lock lock(cudnn_threadpool_mu);
  static port::ThreadPool* cudnn_threadpool = InitCudnnThreadpool();
  return cudnn_threadpool;
}

#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name)                               \
  struct DynLoadShim__##__name {                                           \
    static const char* kName;                                              \
    typedef std::add_pointer<decltype(::__name)>::type FuncPointerT;       \
    static void* GetDsoHandle() {                                          \
      static auto result = internal::CachedDsoLoader::GetCudnnDsoHandle(); \
      return result.ValueOrDie();                                          \
    }                                                                       \
    static FuncPointerT DynLoad() {                                        \
      static void* f = dlsym(GetDsoHandle(), kName);                       \
      if (f == nullptr) {                                                  \
        LOG(FATAL) << "could not find " << kName                           \
                   << " in cudnn DSO; dlerror: " << dlerror();             \
      }                                                                     \
      return reinterpret_cast<FuncPointerT>(f);                            \
    }                                                                       \
    template <typename... Args>                                            \
    void CallWrapper(CUDAExecutor* parent, port::Notification* n,          \
                     cudnnStatus_t* retval, const Args&... args) {         \
      cuda::ScopedActivateExecutorContext sac{parent};                     \
      *retval = DynLoad()(args...);                                        \
      n->Notify();                                                         \
    }                                                                       \
    template <typename... Args>                                            \
    cudnnStatus_t operator()(CUDAExecutor* parent, Args... args) {         \
      port::Notification n;                                                \
      cudnnStatus_t retval;                                                \
      auto call_func_closure =                                             \
          std::bind(&DynLoadShim__##__name::CallWrapper<Args...>, this,    \
                    parent, &n, &retval, args...);                         \
      GetCudaThreadpool()->Schedule(call_func_closure);                    \
      n.WaitForNotification();                                             \
      return retval;                                                       \
    }                                                                       \
  } __name;                                                                \
  const char* DynLoadShim__##__name::kName = #__name;

// clang-format off
#define CUDNN_DNN_ROUTINE_EACH(__macro)              \
  __macro(cudnnSetTensor4dDescriptor)                \
  __macro(cudnnGetConvolutionNdForwardOutputDim)     \
  __macro(cudnnGetConvolutionForwardAlgorithm)       \
  __macro(cudnnCreateTensorDescriptor)               \
  __macro(cudnnDestroyTensorDescriptor)              \
  __macro(cudnnCreateFilterDescriptor)               \
  __macro(cudnnSetFilter4dDescriptor)                \
  __macro(cudnnSetPooling2dDescriptor)               \
  __macro(cudnnDestroyFilterDescriptor)              \
  __macro(cudnnCreateConvolutionDescriptor)          \
  __macro(cudnnCreatePoolingDescriptor)              \
  __macro(cudnnAddTensor)                            \
  __macro(cudnnDestroyPoolingDescriptor)
// clang-format on

CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
#undef CUDNN_DNN_ROUTINE_EACH

// clang-format off
#define CUDNN_DNN_ROUTINE_EACH(__macro)              \
  __macro(cudnnSetConvolution2dDescriptor)           \
  __macro(cudnnDestroyConvolutionDescriptor)         \
  __macro(cudnnCreate)                               \
  __macro(cudnnDestroy)                              \
  __macro(cudnnSetStream)                            \
  __macro(cudnnActivationForward)                    \
  __macro(cudnnConvolutionForward)                   \
  __macro(cudnnConvolutionBackwardData)              \
  __macro(cudnnConvolutionBackwardFilter)            \
  __macro(cudnnGetConvolutionForwardWorkspaceSize)   \
  __macro(cudnnTransformTensor)                      \
  __macro(cudnnPoolingForward)                       \
  __macro(cudnnPoolingBackward)
// clang-format on

CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
#undef CUDNN_DNN_ROUTINE_EACH

}  // namespace dynload
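// Note (illustrative): for a symbol such as cudnnCreate, the wrapper macro
// above expands to a functor dynload::cudnnCreate whose operator() dlsym()s
// the real ::cudnnCreate on first use, schedules the call onto the dedicated
// cudnn thread pool, and blocks on a Notification until it returns, e.g.
//
//   cudnnStatus_t status = dynload::cudnnCreate(parent, &handle);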
namespace {

cudnnHandle_t ToHandle(void* opaque_handle) {
  return static_cast<cudnnHandle_t>(opaque_handle);
}

}  // namespace

CudnnSupport::CudnnSupport(CUDAExecutor* parent)
    : parent_(parent), dnn_handle_(nullptr) {}

CudnnSupport::~CudnnSupport() {
  auto status = dynload::cudnnDestroy(parent_, ToHandle(dnn_handle_));
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "could not destroy cudnn handle: " << ToString(status);
  }
}

port::Status CudnnSupport::Init() {
  auto status = dynload::cudnnCreate(
      parent_, reinterpret_cast<cudnnHandle_t*>(&dnn_handle_));
  if (status == CUDNN_STATUS_SUCCESS) {
    return port::Status::OK();
  }

  LOG(ERROR) << "could not create cudnn handle: " << ToString(status);
  if (status == CUDNN_STATUS_NOT_INITIALIZED) {
    // This is the error code that the driver returns when we're running with
    // an insufficient CUDA driver -- cudnn requires 6.5+ compatibility, which
    // starts with the 340.XX driver series.
    auto result = cuda::Diagnostician::FindKernelDriverVersion();
    if (!result.ok()) {
      LOG(ERROR) << "error retrieving driver version: "
                 << DriverVersionStatusToString(result);
    } else {
      const auto& version = result.ValueOrDie();
      LOG(INFO) << "running driver version: "
                << DriverVersionToString(version);
      if (std::get<0>(version) < 340) {
        LOG(ERROR)
            << "cudnn library is only supported on 340.XX+ driver versions";
      }
    }
  }
  return port::Status{port::error::INTERNAL,
                      port::StrCat("cudnn library could not create a handle: ",
                                   ToString(status))};
}

// Turns a BatchDescriptor structure into a cudnn tensor handle within a scope.
class ScopedTensorDescriptor {
 public:
  ScopedTensorDescriptor(CUDAExecutor* parent,
                         const BatchDescriptor& batch_descriptor,
                         cudnnDataType_t elem_type)
      : parent_(parent), handle_(nullptr) {
    cudnnStatus_t status =
        dynload::cudnnCreateTensorDescriptor(parent_, &handle_);
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(FATAL) << "could not create cudnn tensor descriptor: "
                 << ToString(status);
    }

    cudnnTensorFormat_t format;
    switch (batch_descriptor.layout()) {
      case dnn::DataLayout::kBatchYXDepth:
        format = CUDNN_TENSOR_NHWC;
        break;
      case dnn::DataLayout::kBatchDepthYX:
        format = CUDNN_TENSOR_NCHW;
        break;
      default:
        LOG(FATAL) << "Unsupported tensor format "
                   << DataLayoutString(batch_descriptor.layout());
        break;
    }

    status = dynload::cudnnSetTensor4dDescriptor(
        parent_, handle_, format, elem_type,
        CheckedNarrowing<int64, int>(batch_descriptor.count()),
        CheckedNarrowing<int64, int>(batch_descriptor.feature_map_count()),
        CheckedNarrowing<int64, int>(batch_descriptor.height()),
        CheckedNarrowing<int64, int>(batch_descriptor.width()));
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(FATAL) << "could not set cudnn tensor descriptor: "
                 << ToString(status);
    }
  }

  ~ScopedTensorDescriptor() {
    cudnnStatus_t status =
        dynload::cudnnDestroyTensorDescriptor(parent_, handle_);
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(ERROR) << "could not destroy cudnn tensor descriptor: "
                 << ToString(status);
    }
  }

  cudnnTensorDescriptor_t handle() const { return handle_; }

 private:
  CUDAExecutor* parent_;            // Parent executor. Not owned.
  cudnnTensorDescriptor_t handle_;  // Owned.

  SE_DISALLOW_COPY_AND_ASSIGN(ScopedTensorDescriptor);
};
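// Usage sketch (illustrative): the Scoped* descriptor wrappers here and below
// are plain RAII -- create on entry, destroy on scope exit -- and the
// convolution entry points further down use them exactly this way:
//
//   ScopedTensorDescriptor input_4d{parent_, batch_descriptor,
//                                   CUDNN_DATA_FLOAT};
//   // ... pass input_4d.handle() to cudnn calls; cleanup is automatic ...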
// Turns a FilterDescriptor structure into a cudnn filter handle within a
// scope.
class ScopedFilterDescriptor {
 public:
  ScopedFilterDescriptor(CUDAExecutor* parent,
                         const FilterDescriptor& filter_descriptor,
                         cudnnDataType_t elem_type)
      : parent_(parent), handle_(nullptr) {
    cudnnStatus_t status =
        dynload::cudnnCreateFilterDescriptor(parent_, &handle_);
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(FATAL) << "could not create cudnn filter descriptor: "
                 << ToString(status);
    }

    // TODO(b/23032134): Even if the filter layout is not supported,
    // cudnnSetFilter4dDescriptor will return CUDNN_STATUS_SUCCESS because it
    // does not take layout as an input. Maybe force cuDNN by giving wrong
    // inputs intentionally?
    switch (filter_descriptor.layout()) {
      case dnn::FilterLayout::kOutputInputYX:
        break;
      default:
        LOG(FATAL) << "Unsupported filter format "
                   << FilterLayoutString(filter_descriptor.layout());
        break;
    }

    status = dynload::cudnnSetFilter4dDescriptor(
        parent_, handle_, elem_type,
        CheckedNarrowing<int64, int>(
            filter_descriptor.output_feature_map_count()),
        CheckedNarrowing<int64, int>(
            filter_descriptor.input_feature_map_count()),
        CheckedNarrowing<int64, int>(filter_descriptor.input_filter_height()),
        CheckedNarrowing<int64, int>(filter_descriptor.input_filter_width()));
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(FATAL) << "could not set cudnn filter descriptor: "
                 << ToString(status);
    }
  }

  ~ScopedFilterDescriptor() {
    cudnnStatus_t status =
        dynload::cudnnDestroyFilterDescriptor(parent_, handle_);
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(ERROR) << "could not destroy cudnn filter descriptor: "
                 << ToString(status);
    }
  }

  cudnnFilterDescriptor_t handle() const { return handle_; }

 private:
  // Parent executor object. Not owned.
  CUDAExecutor* parent_;

  // cudnn filter descriptor this object creates. Owned.
  cudnnFilterDescriptor_t handle_;

  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFilterDescriptor);
};
// Turns a ConvolutionDescriptor structure into a cudnn convolution handle
// within a scope.
class ScopedConvolutionDescriptor {
 public:
  ScopedConvolutionDescriptor(
      CUDAExecutor* parent,
      const ConvolutionDescriptor& convolution_descriptor)
      : parent_(parent), handle_(nullptr) {
    cudnnStatus_t status =
        dynload::cudnnCreateConvolutionDescriptor(parent_, &handle_);
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(FATAL) << "could not create cudnn convolution descriptor: "
                 << ToString(status);
    }

    status = dynload::cudnnSetConvolution2dDescriptor(
        parent_, handle_,
        CheckedNarrowing<int64, int>(
            convolution_descriptor.zero_padding_height()),
        CheckedNarrowing<int64, int>(
            convolution_descriptor.zero_padding_width()),
        CheckedNarrowing<int64, int>(
            convolution_descriptor.vertical_filter_stride()),
        CheckedNarrowing<int64, int>(
            convolution_descriptor.horizontal_filter_stride()),
        // TODO(leary) not sure what the following two params do.
        1 /* = upscale_input_x */, 1 /* = upscale_input_y */,
        // NOTE(keveman): cuDNN supports convolution and cross correlation.
        // However, almost all the use cases do cross correlation, so just
        // hard coding it here.
        CUDNN_CROSS_CORRELATION);
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(FATAL) << "could not set cudnn convolution descriptor: "
                 << ToString(status);
    }
  }

  ~ScopedConvolutionDescriptor() {
    cudnnStatus_t status =
        dynload::cudnnDestroyConvolutionDescriptor(parent_, handle_);
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(ERROR) << "could not destroy cudnn convolution descriptor: "
                 << ToString(status);
    }
  }

  cudnnConvolutionDescriptor_t handle() const { return handle_; }

 private:
  CUDAExecutor* parent_;                 // Parent executor. Not owned.
  cudnnConvolutionDescriptor_t handle_;  // Owned.

  SE_DISALLOW_COPY_AND_ASSIGN(ScopedConvolutionDescriptor);
};

// Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle
// within a scope.
class ScopedPoolingDescriptor {
 public:
  ScopedPoolingDescriptor(CUDAExecutor* parent,
                          const PoolingDescriptor& pooling_descriptor)
      : parent_(parent), handle_(nullptr) {
    cudnnStatus_t status =
        dynload::cudnnCreatePoolingDescriptor(parent_, &handle_);
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(FATAL) << "could not create cudnn pooling descriptor: "
                 << ToString(status);
    }
    status = dynload::cudnnSetPooling2dDescriptor(
        parent_, handle_,
        (pooling_descriptor.mode() == dnn::PoolingMode::kMaximum
             ? CUDNN_POOLING_MAX
             : CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING),
        CheckedNarrowing<int64, int>(pooling_descriptor.window_height()),
        CheckedNarrowing<int64, int>(pooling_descriptor.window_width()),
        CheckedNarrowing<int64, int>(pooling_descriptor.vertical_padding()),
        CheckedNarrowing<int64, int>(pooling_descriptor.horizontal_padding()),
        CheckedNarrowing<int64, int>(pooling_descriptor.vertical_stride()),
        CheckedNarrowing<int64, int>(pooling_descriptor.horizontal_stride()));
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(FATAL) << "could not set cudnn pooling descriptor: "
                 << ToString(status);
    }
  }
  ~ScopedPoolingDescriptor() {
    cudnnStatus_t status =
        dynload::cudnnDestroyPoolingDescriptor(parent_, handle_);
    if (status != CUDNN_STATUS_SUCCESS) {
      LOG(ERROR) << "could not destroy cudnn pooling descriptor: "
                 << ToString(status);
    }
  }

  cudnnPoolingDescriptor_t handle() const { return handle_; }

 private:
  CUDAExecutor* parent_;             // Parent executor. Not owned.
  cudnnPoolingDescriptor_t handle_;  // Owned.

  SE_DISALLOW_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
};
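// Note (illustrative): the cudnn compute routines invoked below all follow
// the usual cudnn blending convention
//
//   dest = alpha * Op(src) + beta * dest
//
// so the alpha = 1, beta = 0 pairs in the entry points mean "overwrite the
// output with the raw result"; DoBiasAdd further down instead uses beta = 1
// to accumulate the bias into the output.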
bool CudnnSupport::DoConvolve(
    Stream* stream, const BatchDescriptor& batch_descriptor,
    const DeviceMemory<float>& input_data,
    const FilterDescriptor& filter_descriptor,
    const DeviceMemory<float>& filter_data,
    const ConvolutionDescriptor& convolution_descriptor,
    const BatchDescriptor& output_descriptor,
    DeviceMemory<float>* output_data) {
  ScopedTensorDescriptor input_4d{parent_, batch_descriptor, CUDNN_DATA_FLOAT};
  ScopedTensorDescriptor output_4d{parent_, output_descriptor,
                                   CUDNN_DATA_FLOAT};
  ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT};
  ScopedConvolutionDescriptor conv{parent_, convolution_descriptor};

  mutex_lock lock{dnn_handle_mutex_};
  auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
                                        AsCUDAStreamValue(stream));
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(FATAL) << "failed to set stream for cudnn handle: "
               << ToString(status);
  }
  // Alpha is the scaling factor for input.
  float alpha = 1.0;
  // Beta is the scaling factor for output.
  float beta = 0.0;

  // The NO_WORKSPACE versions are possibly slower for certain shapes, but
  // not so for the shapes currently used by Brain. Also, it seems prudent to
  // keep cuMemAlloc off the critical path.
  cudnnConvolutionFwdAlgo_t algo;
  status = dynload::cudnnGetConvolutionForwardAlgorithm(
      parent_, ToHandle(dnn_handle_), input_4d.handle(), filter.handle(),
      conv.handle(), output_4d.handle(), CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0,
      &algo);

  CHECK_EQ(status, CUDNN_STATUS_SUCCESS)
      << "Unable to find a suitable algorithm for doing forward convolution";

  status = dynload::cudnnConvolutionForward(
      parent_, ToHandle(dnn_handle_), &alpha, input_4d.handle(),
      input_data.opaque(), filter.handle(), filter_data.opaque(),
      conv.handle(), algo, nullptr /* workspace ptr */, 0 /* workspace size */,
      &beta, output_4d.handle(), output_data->opaque());

  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(FATAL) << "failed to enqueue convolution on stream: "
               << ToString(status);
    return false;
  }

  return true;
}

bool CudnnSupport::DoConvolve(
    Stream* stream, const BatchDescriptor& batch_descriptor,
    const DeviceMemory<double>& input_data,
    const FilterDescriptor& filter_descriptor,
    const DeviceMemory<double>& filter_data,
    const ConvolutionDescriptor& convolution_descriptor,
    const BatchDescriptor& output_descriptor,
    DeviceMemory<double>* output_data) {
  LOG(ERROR) << "double-based DNN not yet implemented";
  return false;
}

DeviceMemory<float> CudnnSupport::MaybeTransformLayout(
    Stream* stream, BatchDescriptor* output_descriptor,
    DeviceMemory<float> backward_output_data,
    std::unique_ptr<TemporaryDeviceMemory<float>>* transform_scratch) {
  if (output_descriptor->layout() == dnn::DataLayout::kBatchDepthYX) {
    return backward_output_data;
  }
  CHECK(output_descriptor->layout() == dnn::DataLayout::kBatchYXDepth);
  *transform_scratch =
      stream
          ->AllocateTemporaryArray<float>(backward_output_data.ElementCount())
          .ConsumeValueOrDie();
  BatchDescriptor transformed_output_descriptor;
  transformed_output_descriptor.CloneFrom(*output_descriptor);
  transformed_output_descriptor.set_layout(dnn::DataLayout::kBatchDepthYX);
  ScopedTensorDescriptor orig_out_back_4d{parent_, *output_descriptor,
                                          CUDNN_DATA_FLOAT};
  ScopedTensorDescriptor transformed_out_back_4d{
      parent_, transformed_output_descriptor, CUDNN_DATA_FLOAT};

  float alpha = 1.0f;
  float beta = 0.0f;
  auto status = dynload::cudnnTransformTensor(
      parent_, ToHandle(dnn_handle_), &alpha, orig_out_back_4d.handle(),
      backward_output_data.opaque(), &beta, transformed_out_back_4d.handle(),
      (*transform_scratch)->mutable_device_memory()->opaque());

  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(FATAL) << "Failed to transform the data layout.";
  }
  output_descriptor->set_layout(dnn::DataLayout::kBatchDepthYX);
  return (*transform_scratch)->device_memory();
}
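// Note (illustrative): MaybeTransformLayout only rewrites kBatchYXDepth
// (NHWC) gradients into a scratch kBatchDepthYX (NCHW) buffer; a gradient
// that is already NCHW passes through untouched. The two tensor descriptors
// above describe the same logical shape and differ only in layout, which is
// what makes cudnnTransformTensor perform the permuting copy.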
bool CudnnSupport::DoConvolveBackwardData(
    Stream* stream, const FilterDescriptor& filter_descriptor,
    const DeviceMemory<float>& filter_data,
    const BatchDescriptor& output_descriptor_in,
    DeviceMemory<float> backward_output_data,
    const ConvolutionDescriptor& convolution_descriptor,
    const BatchDescriptor& input_descriptor,
    DeviceMemory<float>* backward_input_data) {
  mutex_lock lock{dnn_handle_mutex_};
  auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
                                        AsCUDAStreamValue(stream));
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(FATAL) << "failed to set stream for cudnn handle: "
               << ToString(status);
  }

  // Alpha is the scaling factor for input.
  float alpha = 1.0;
  // Beta is the scaling factor for output.
  float beta = 0.0;

  // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass.
  BatchDescriptor output_descriptor;
  output_descriptor.CloneFrom(output_descriptor_in);
  std::unique_ptr<TemporaryDeviceMemory<float>> transform_scratch;
  backward_output_data = MaybeTransformLayout(
      stream, &output_descriptor, backward_output_data, &transform_scratch);

  ScopedTensorDescriptor out_back_4d{parent_, output_descriptor,
                                     CUDNN_DATA_FLOAT};
  ScopedTensorDescriptor in_back_4d{parent_, input_descriptor,
                                    CUDNN_DATA_FLOAT};
  ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT};
  ScopedConvolutionDescriptor conv{parent_, convolution_descriptor};

  status = dynload::cudnnConvolutionBackwardData(
      parent_, ToHandle(dnn_handle_), &alpha, filter.handle(),
      filter_data.opaque(), out_back_4d.handle(),
      backward_output_data.opaque(), conv.handle(), &beta,
      in_back_4d.handle(), backward_input_data->opaque());
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(FATAL) << "failed to enqueue convolution on stream: "
               << ToString(status);
    return false;
  }
  return true;
}
bool CudnnSupport::DoConvolveBackwardFilter(
    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
    const DeviceMemory<float>& input_data,
    const dnn::BatchDescriptor& output_descriptor_in,
    DeviceMemory<float> backward_output_data,
    const dnn::ConvolutionDescriptor& convolution_descriptor,
    const dnn::FilterDescriptor& filter_descriptor,
    DeviceMemory<float>* backward_filter_data) {
  mutex_lock lock{dnn_handle_mutex_};
  auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
                                        AsCUDAStreamValue(stream));
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(FATAL) << "failed to set stream for cudnn handle: "
               << ToString(status);
  }

  // Alpha is the scaling factor for input.
  float alpha = 1.0;
  // Beta is the scaling factor for output.
  float beta = 0.0;

  // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass.
  BatchDescriptor output_descriptor;
  output_descriptor.CloneFrom(output_descriptor_in);
  std::unique_ptr<TemporaryDeviceMemory<float>> transform_scratch;
  backward_output_data = MaybeTransformLayout(
      stream, &output_descriptor, backward_output_data, &transform_scratch);

  ScopedTensorDescriptor out_back_4d{parent_, output_descriptor,
                                     CUDNN_DATA_FLOAT};
  ScopedTensorDescriptor input_4d{parent_, input_descriptor, CUDNN_DATA_FLOAT};
  ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT};
  ScopedConvolutionDescriptor conv{parent_, convolution_descriptor};

  status = dynload::cudnnConvolutionBackwardFilter(
      parent_, ToHandle(dnn_handle_), &alpha, input_4d.handle(),
      input_data.opaque(), out_back_4d.handle(),
      backward_output_data.opaque(), conv.handle(), &beta, filter.handle(),
      backward_filter_data->opaque());
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(FATAL) << "failed to enqueue convolution on stream: "
               << ToString(status);
    return false;
  }
  return true;
}

bool CudnnSupport::DoMatMul(Stream* stream,
                            const DeviceMemory<float>& input_data,
                            const DeviceMemory<float>& weights,
                            const dnn::BatchDescriptor& input_dimensions,
                            const dnn::BatchDescriptor& output_dimensions,
                            DeviceMemory<float>* output_data) {
  if (input_dimensions.count() != output_dimensions.count()) {
    LOG(ERROR) << "MatMul input and output dimensions are not compatible.";
    return false;
  }

  // We do not permute the input or output; instead we just reinterpret the
  // layout. We are working with row-major matrices and the rows of the input
  // and output correspond to batch, so batch has to be outermost in both the
  // input and output.
  //
  // By adding transposes to the BLAS gemm call we could perhaps make the
  // kYXDepthBatch layout work as well, but there has been no need for that
  // so far.
  if (input_dimensions.layout() != dnn::DataLayout::kBatchYXDepth &&
      input_dimensions.layout() != dnn::DataLayout::kBatchDepthYX) {
    LOG(ERROR) << "Unsupported MatMul input layout.";
    return false;
  }
  if (output_dimensions.layout() != dnn::DataLayout::kBatchYXDepth &&
      output_dimensions.layout() != dnn::DataLayout::kBatchDepthYX) {
    LOG(ERROR) << "Unsupported MatMul output layout.";
    return false;
  }

  if (output_dimensions.width() == 1 && output_dimensions.height() == 1) {
    // This is a fast path that also supports the kBatchYXDepth layout.

    // The matrices here are in row-major format while BLAS expects
    // column-major, i.e. our matrices are transposed as far as BLAS
    // is concerned. So we need to compute output^T =
    // input^T*weights^T. There is no parameter for transposing the
    // output in BLAS gemm, but instead we can transpose both sides of
    // the equality to see that this is equivalent to
    // output=weights*input. So we only need to swap the order of
    // weights and input in the matrix product to correct for the
    // row-major versus column-major difference.
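    // Worked through explicitly (illustrative): a row-major buffer is exactly
    // its transpose when read column-major, so the buffers BLAS sees are
    // input^T, weights^T and output^T. Since
    //
    //   output = input * weights  <=>  output^T = weights^T * input^T,
    //
    // a column-major gemm over (weights_buffer, input_buffer) writes
    // output^T, i.e. precisely the row-major output buffer we want.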
    const float alpha = 1.0f;  // Take the matrix product without scaling it.
    const float beta = 0.0f;   // Ignore the original values in output_data.
    const int64 m = output_dimensions.NodesAcrossFeatureMaps();
    const int64 n = input_dimensions.count();
    const int64 k = input_dimensions.NodesAcrossFeatureMaps();
    stream->ThenBlasGemm(blas::Transpose::kNoTranspose,
                         blas::Transpose::kNoTranspose, m, n, k, alpha,
                         weights, m, input_data, k, beta, output_data, m);
  } else {
    // This is a slower and more complex path that supports output
    // width() * height() > 1, though it only supports the
    // kBatchYXDepth layout. It does support kBatchDepthYX if output
    // feature_map_count() == 1, as then there is no difference
    // between the two layouts.
    //
    // The operation here is the same as above, except that we have to
    // do the matrix multiplication for each (y,x) output coordinate
    // separately. We then interpret weights as containing K = width()
    // * height() different matrices, which we all multiply onto the
    // matrix from input_data, yielding K matrix products. We then
    // combine these together into one matrix by concatenating all the
    // first rows of these matrices, then all the second rows and so
    // on. We can do this with a batched matrix multiplication, where
    // the result is written to a different submatrix of the output
    // for each matrix multiplication.
    //
    // The reason that we only support the kBatchYXDepth output layout
    // is that we have to do something in the depth for each (y,x)
    // coordinate. The kBatchYXDepth layout has the depth information
    // for each point (y,x) in contiguous memory while the
    // kBatchDepthYX layout does not.
    //
    // TODO(broune): Consider a special case for when output depth ==
    // 1, as then possibly this could all be done as one matrix
    // multiplication instead of a batched one, which should be
    // faster. Another possibility would be to add a weights layout
    // parameter and then support kBatchDepthYX for a different
    // weights layout.
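    // In concrete terms (illustrative): with m = feature_map_count() output
    // maps, k = NodesAcrossFeatureMaps() input nodes and K = width() *
    // height() coordinates, matrix i reads its m x k weights at element
    // offset i * k * m and writes its result at element offset i * m with
    // leading dimension K * m -- exactly the contiguous depth block for
    // coordinate i of each sample in the kBatchYXDepth output.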
    if (output_dimensions.layout() != dnn::DataLayout::kBatchYXDepth &&
        !(output_dimensions.layout() == dnn::DataLayout::kBatchDepthYX &&
          output_dimensions.feature_map_count() == 1)) {
      LOG(ERROR) << "Unsupported MatMul output layout.";
      return false;
    }

    const float alpha = 1.0f;  // Take the matrix product without scaling it.
    const float beta = 0.0f;   // Ignore the original values in output_data.
    const uint64 m = output_dimensions.feature_map_count();
    const uint64 n = input_dimensions.count();
    const uint64 k = input_dimensions.NodesAcrossFeatureMaps();
    const int lda = m;
    const int ldb = k;
    const int ldc = output_dimensions.NodesAcrossFeatureMaps();
    const int batch_count = output_dimensions.NodesPerFeatureMap();

    std::vector<DeviceMemory<float>> a(batch_count);
    std::vector<DeviceMemory<float>> b(batch_count);
    std::vector<DeviceMemory<float>> c(batch_count);
    for (int i = 0; i < batch_count; ++i) {
      const int weights_offset =
          i * input_dimensions.NodesAcrossFeatureMaps() *
          output_dimensions.feature_map_count();
      a[i] = DeviceMemory<float>::MakeFromByteSize(
          const_cast<float*>(
              reinterpret_cast<const float*>(weights.opaque())) +
              weights_offset,
          weights.ElementCount() - weights_offset);

      b[i] = input_data;

      const int output_offset = i * output_dimensions.feature_map_count();
      c[i] = DeviceMemory<float>::MakeFromByteSize(
          const_cast<float*>(
              reinterpret_cast<const float*>(output_data->opaque())) +
              output_offset,
          output_data->ElementCount() - output_offset);
    }
    const auto toPtrs = [](std::vector<DeviceMemory<float>>& v) {
      std::vector<DeviceMemory<float>*> ptrs;
      for (auto& mem : v) {
        ptrs.push_back(&mem);
      }
      return ptrs;
    };

    stream->ThenBlasGemmBatched(blas::Transpose::kNoTranspose,
                                blas::Transpose::kNoTranspose, m, n, k, alpha,
                                toPtrs(a), lda, toPtrs(b), ldb, beta,
                                toPtrs(c), ldc, batch_count);
  }

  return stream->ok();
}
bool CudnnSupport::DoBiasAdd(Stream* stream,
                             const DeviceMemory<float>& input_data,
                             const DeviceMemory<float>& biases,
                             const dnn::BatchDescriptor& dimensions,
                             DeviceMemory<float>* output_data) {
  ScopedTensorDescriptor input_descriptor{parent_, dimensions,
                                          CUDNN_DATA_FLOAT};

  BatchDescriptor bias_dimensions;
  bias_dimensions.set_count(1)
      .set_feature_map_count(dimensions.feature_map_count())
      .set_height(1)
      .set_width(1)
      .set_layout(dnn::DataLayout::kBatchYXDepth);
  ScopedTensorDescriptor bias_descriptor{parent_, bias_dimensions,
                                         CUDNN_DATA_FLOAT};

  // cudnnAddTensor is in-place, so we need to copy input_data to
  // output_data before doing the addition, unless the input and
  // output are at the same address.
  if (input_data.opaque() != output_data->opaque()) {
    stream->ThenMemcpy(output_data, input_data,
                       dimensions.ElementCount() * sizeof(float));
    if (!stream->ok()) {
      LOG(ERROR)
          << "stream " << stream
          << " could not enqueue a tensor copy as part of bias addition.";
      return false;
    }
  }

  mutex_lock lock{dnn_handle_mutex_};

  const float alpha = 1.0f;
  const float beta = 1.0f;
  auto status = dynload::cudnnAddTensor(
      parent_, ToHandle(dnn_handle_), CUDNN_ADD_SAME_C, &alpha,
      bias_descriptor.handle(), biases.opaque(), &beta,
      input_descriptor.handle(), output_data->opaque());

  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "stream " << stream << " could not enqueue bias addition.";
    return false;
  }

  return true;
}

bool CudnnSupport::DoActivate(Stream* stream,
                              dnn::ActivationMode activation_mode,
                              const dnn::BatchDescriptor& dimensions,
                              const DeviceMemory<float>& input_data,
                              DeviceMemory<float>* output_data) {
  mutex_lock lock{dnn_handle_mutex_};
  auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
                                        AsCUDAStreamValue(stream));
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to set stream for cudnn handle: "
               << ToString(status);
    return false;
  }
  cudnnActivationMode_t mode;
  switch (activation_mode) {
    case dnn::ActivationMode::kRelu6:
      // TODO(leary) should probably do a post-pass to clip at 6?
      LOG(WARNING) << "user requested Relu6, but providing Relu instead";
      mode = CUDNN_ACTIVATION_RELU;
      break;
    case dnn::ActivationMode::kReluX:
      // TODO(broune) should probably do a post-pass to clip at X?
      LOG(WARNING) << "user requested ReluX, but providing Relu instead";
      mode = CUDNN_ACTIVATION_RELU;
      break;
    case dnn::ActivationMode::kRelu:
      mode = CUDNN_ACTIVATION_RELU;
      break;
    case dnn::ActivationMode::kSigmoid:
      mode = CUDNN_ACTIVATION_SIGMOID;
      break;
    case dnn::ActivationMode::kTanh:
      mode = CUDNN_ACTIVATION_TANH;
      break;
    default:
      LOG(ERROR) << "unrecognized activation mode: "
                 << static_cast<int>(activation_mode);
      return false;
  }

  ScopedTensorDescriptor input_4d{parent_, dimensions, CUDNN_DATA_FLOAT};
  // Alpha is the input scaling factor.
  float alpha = 1.0;
  // Beta is the output scaling factor.
  float beta = 0.0;
  status = dynload::cudnnActivationForward(
      parent_, ToHandle(dnn_handle_), mode, &alpha, input_4d.handle(),
      input_data.opaque(), &beta, input_4d.handle(), output_data->opaque());
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "stream " << stream
               << " could not enqueue activation: " << ToString(status);
    return false;
  }

  return true;
}
bool CudnnSupport::DoPoolForward(
    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
    const dnn::BatchDescriptor& input_dimensions,
    const DeviceMemory<float>& input_data,
    const dnn::BatchDescriptor& output_dimensions,
    DeviceMemory<float>* output_data) {
  mutex_lock lock{dnn_handle_mutex_};
  auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
                                        AsCUDAStreamValue(stream));
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to set stream for cudnn handle: "
               << ToString(status);
    return false;
  }

  // Alpha is the scaling factor for input.
  float alpha = 1.0;
  // Beta is the scaling factor for output.
  float beta = 0.0;

  ScopedTensorDescriptor src_desc{parent_, input_dimensions,
                                  CUDNN_DATA_FLOAT};
  ScopedTensorDescriptor dest_desc{parent_, output_dimensions,
                                   CUDNN_DATA_FLOAT};
  ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
  status = dynload::cudnnPoolingForward(
      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
      src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
      output_data->opaque());
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to enqueue forward pooling on stream: "
               << ToString(status);
    return false;
  }
  return true;
}

bool CudnnSupport::DoPoolBackward(
    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
    const dnn::BatchDescriptor& input_dimensions,
    const DeviceMemory<float>& input_data,
    const dnn::BatchDescriptor& output_dimensions,
    const DeviceMemory<float>& output_data,
    const DeviceMemory<float>& input_diff_data,
    DeviceMemory<float>* output_diff_data) {
  mutex_lock lock{dnn_handle_mutex_};
  auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
                                        AsCUDAStreamValue(stream));
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to set stream for cudnn handle: "
               << ToString(status);
    return false;
  }

  // Alpha is the scaling factor for input.
  float alpha = 1.0;
  // Beta is the scaling factor for output.
  float beta = 0.0;

  ScopedTensorDescriptor src_desc{parent_, input_dimensions,
                                  CUDNN_DATA_FLOAT};
  ScopedTensorDescriptor dest_desc{parent_, output_dimensions,
                                   CUDNN_DATA_FLOAT};
  ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
  status = dynload::cudnnPoolingBackward(
      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
      dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
      input_diff_data.opaque(), src_desc.handle(), input_data.opaque(),
      &beta, src_desc.handle(), output_diff_data->opaque());
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to enqueue backward pooling on stream: "
               << ToString(status);
    return false;
  }
  return true;
}

bool CudnnSupport::DoNormalize(
    Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
    const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
  LOG(FATAL) << "not yet implemented";  // TODO(leary)
}

bool CudnnSupport::DoDepthConcatenate(
    Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
    port::ArraySlice<const DeviceMemory<float>*> input_data,
    DeviceMemory<float>* output_data) {
  LOG(FATAL) << "not yet implemented";  // TODO(leary)
}

bool CudnnSupport::DoElementwiseOperate(
    Stream* stream, dnn::ElementwiseOperation operation,
    port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
    port::ArraySlice<const DeviceMemory<float>*> input_data,
    const dnn::BatchDescriptor& output_dimensions,
    DeviceMemory<float>* output_data) {
  LOG(FATAL) << "not yet implemented";  // TODO(leary)
}
bool CudnnSupport::DoMemcpyD2HQuantized(
    Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
    port::MutableArraySlice<uint8> host_dst) {
  LOG(ERROR) << "quantized memcpy not supported by cuDNN";
  return false;
}

bool CudnnSupport::DoMemcpyD2HQuantized(
    Stream* stream, const DeviceMemory<float>& device_unquantized_src,
    port::MutableArraySlice<uint16> host_dst) {
  LOG(ERROR) << "quantized memcpy not supported by cuDNN";
  return false;
}

bool CudnnSupport::DoMemcpyD2HQuantized(
    Stream* stream, const DeviceMemory<float>& device_unquantized_src,
    port::MutableArraySlice<int32> host_dst) {
  LOG(ERROR) << "quantized memcpy not supported by cuDNN";
  return false;
}

bool CudnnSupport::DoMemcpyH2DQuantized(
    Stream* stream, port::ArraySlice<uint8> host_src,
    DeviceMemory<float>* gpu_unquantized_dst) {
  LOG(ERROR) << "quantized memcpy not supported by cuDNN";
  return false;
}

bool CudnnSupport::DeriveOutputBatchDescriptor(
    const BatchDescriptor& batch_descriptor,
    const FilterDescriptor& filter_descriptor,
    const dnn::ConvolutionDescriptor& convolution_descriptor,
    dnn::BatchDescriptor* output_batch_descriptor) {
  ScopedTensorDescriptor input_4d{parent_, batch_descriptor, CUDNN_DATA_FLOAT};
  ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT};
  ScopedConvolutionDescriptor conv{parent_, convolution_descriptor};

  int dims[4];
  auto status = dynload::cudnnGetConvolutionNdForwardOutputDim(
      parent_, conv.handle(), input_4d.handle(), filter.handle(), 4, dims);
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "could not get output tensor for convolution: "
               << ToString(status);
    return false;
  }

  output_batch_descriptor->set_count(dims[0])
      .set_feature_map_count(dims[1])
      .set_height(dims[2])
      .set_width(dims[3])
      .set_layout(batch_descriptor.layout());
  return true;
}

}  // namespace cuda

namespace gpu = ::perftools::gputools;

void initialize_cudnn() {
  gpu::port::Status status =
      gpu::PluginRegistry::Instance()
          ->RegisterFactory<gpu::PluginRegistry::DnnFactory>(
              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuDnnPlugin, "cuDNN",
              [](gpu::internal::StreamExecutorInterface* parent)
                  -> gpu::dnn::DnnSupport* {
                gpu::cuda::CUDAExecutor* cuda_executor =
                    dynamic_cast<gpu::cuda::CUDAExecutor*>(parent);
                if (cuda_executor == nullptr) {
                  LOG(ERROR)
                      << "Attempting to initialize an instance of the cuDNN "
                      << "support library with a non-CUDA StreamExecutor";
                  return nullptr;
                }

                gpu::cuda::CudnnSupport* dnn =
                    new gpu::cuda::CudnnSupport(cuda_executor);
                if (!dnn->Init().ok()) {
                  // Note: Init() will log a more specific error.
                  delete dnn;
                  return nullptr;
                }
                return dnn;
              });

  if (!status.ok()) {
    LOG(ERROR) << "Unable to register cuDNN factory: "
               << status.error_message();
  }

  // Prime the cuDNN DSO. The loader will log more information.
  auto statusor = gpu::internal::CachedDsoLoader::GetCudnnDsoHandle();
  if (!statusor.ok()) {
    LOG(INFO) << "Unable to load cuDNN DSO.";
  }

  gpu::PluginRegistry::Instance()->SetDefaultFactory(
      gpu::cuda::kCudaPlatformId, gpu::PluginKind::kDnn,
      gpu::cuda::kCuDnnPlugin);
}

}  // namespace gputools
}  // namespace perftools

REGISTER_MODULE_INITIALIZER(register_cudnn,
                            { perftools::gputools::initialize_cudnn(); });