Diffstat (limited to 'tensorflow/stream_executor/cuda')
27 files changed, 9108 insertions, 0 deletions
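Note: nearly everything added below follows one pattern. Each cuBLAS entry point is resolved lazily out of the cuBLAS DSO with dlsym (see the PERFTOOLS_GPUTOOLS_CUBLAS_WRAP shim near the top of cuda_blas.cc), and every wrapped call activates the owning executor's CUDA context before forwarding its arguments. A minimal, self-contained sketch of that lazy-resolution idea follows; the names here (LoadSymbolOnce, the libcublas.so path in the usage comment) are hypothetical and none of the StreamExecutor plumbing is shown.

  #include <dlfcn.h>
  #include <cstdio>

  // Resolve `symbol_name` from `dso_name` once and cache the result; the real
  // shim caches the DSO handle via CachedDsoLoader and CHECK-fails on error.
  template <typename FuncT>
  FuncT LoadSymbolOnce(const char *dso_name, const char *symbol_name) {
    static void *dso = dlopen(dso_name, RTLD_LAZY | RTLD_LOCAL);  // cached once
    static void *sym = dso != nullptr ? dlsym(dso, symbol_name) : nullptr;
    if (sym == nullptr) {
      std::fprintf(stderr, "could not resolve %s: %s\n", symbol_name, dlerror());
    }
    return reinterpret_cast<FuncT>(sym);
  }

  // Hypothetical usage:
  //   auto create = LoadSymbolOnce<cublasStatus_t (*)(cublasHandle_t *)>(
  //       "libcublas.so", "cublasCreate_v2");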
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.cc b/tensorflow/stream_executor/cuda/cuda_activation.cc
new file mode 100644
index 0000000000..32d2c0d424
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_activation.cc
@@ -0,0 +1,30 @@
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+CUcontext ExtractCudaContext(CUDAExecutor *cuda_exec);
+CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec);
+
+ScopedActivateExecutorContext::ScopedActivateExecutorContext(
+    CUDAExecutor *cuda_exec, MultiOpActivation moa)
+    : cuda_exec_(cuda_exec),
+      driver_scoped_activate_context_(
+          new ScopedActivateContext{ExtractCudaContext(cuda_exec), moa}) {}
+
+ScopedActivateExecutorContext::ScopedActivateExecutorContext(
+    StreamExecutor *stream_exec, MultiOpActivation moa)
+    : ScopedActivateExecutorContext(ExtractCudaExecutor(stream_exec), moa) {}
+
+ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
+  delete static_cast<ScopedActivateContext *>(driver_scoped_activate_context_);
+}
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
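The header that follows (cuda_activation.h) declares the RAII guard defined above: constructing a ScopedActivateExecutorContext makes the executor's CUDA context current, and its destructor tears the scoped activation back down. The cuBLAS wrappers later in this diff build one around every forwarded call. A hedged usage sketch, with a hypothetical call site that is not part of this commit:

  #include "tensorflow/stream_executor/cuda/cuda_activation.h"

  // Hypothetical call site: make stream_exec's CUDA context current for the
  // duration of this scope. The StreamExecutor* overload fails fatally if the
  // executor is not CUDA-backed (per the header comment below).
  void DoDriverWork(perftools::gputools::StreamExecutor *stream_exec) {
    perftools::gputools::cuda::ScopedActivateExecutorContext activation{
        stream_exec};
    // ... CUDA driver or cuBLAS work that needs the context to be current ...
  }  // the scoped activation is undone when `activation` is destroyed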
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.h b/tensorflow/stream_executor/cuda/cuda_activation.h
new file mode 100644
index 0000000000..4181d13d0a
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_activation.h
@@ -0,0 +1,53 @@
+// This file contains APIs that assume a StreamExecutor is backed by CUDA.
+// It reaches into the CUDA implementation to activate an underlying CUDA
+// context.
+//
+// Having this file separate from cuda_gpu_executor.h means that dependent
+// code does not also have to depend on cuda.h.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
+
+#include "tensorflow/stream_executor/cuda/multi_op_activation.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+
+class StreamExecutor;
+
+namespace cuda {
+
+class CUDAExecutor;
+class ScopedActivateContext;
+
+// Activates a CUDA context within an enclosing scope.
+class ScopedActivateExecutorContext {
+ public:
+  // Form that takes a CUDA executor implementation.
+  explicit ScopedActivateExecutorContext(
+      CUDAExecutor* cuda_exec, MultiOpActivation moa = MultiOpActivation::kNo);
+
+  // Form that takes a pImpl executor and extracts a CUDA implementation --
+  // fatal failure if it is not CUDA inside.
+  explicit ScopedActivateExecutorContext(
+      StreamExecutor* stream_exec,
+      MultiOpActivation moa = MultiOpActivation::kNo);
+
+  ~ScopedActivateExecutorContext();
+
+ private:
+  // The CUDA executor implementation whose context is activated.
+  CUDAExecutor* cuda_exec_;
+
+  // The cuda.h-using datatype that we wrap.
+  ScopedActivateContext* driver_scoped_activate_context_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
new file mode 100644
index 0000000000..ef1036bca3
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -0,0 +1,2184 @@
+#include "tensorflow/stream_executor/cuda/cuda_blas.h"
+
+#include <dlfcn.h>
+
+#include <complex>
+
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
+#include "tensorflow/stream_executor/cuda/cuda_platform.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/status_macros.h"
+#include "tensorflow/stream_executor/lib/strcat.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "third_party/gpus/cuda/include/cublas_v2.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
+
+namespace dynload {
+
+#define PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name) \
+  struct DynLoadShim__##__name { \
+    static const char *kName; \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \
+    static void *GetDsoHandle() { \
+      static auto status = internal::CachedDsoLoader::GetCublasDsoHandle(); \
+      return status.ValueOrDie(); \
+    } \
+    static FuncPointerT DynLoad() { \
+      static void *f = dlsym(GetDsoHandle(), kName); \
+      CHECK(f != nullptr) << "could not find " << kName \
+                          << " in cuBLAS DSO; dlerror: " << dlerror(); \
+      return reinterpret_cast<FuncPointerT>(f); \
+    } \
+    template <typename... Args> \
+    cublasStatus_t operator()(CUDAExecutor * parent, Args...
args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +#define PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(__name) \ + PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name) + +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSnrm2) \ + __macro(cublasDnrm2) \ + __macro(cublasScnrm2) \ + __macro(cublasDznrm2) \ + __macro(cublasSdot) \ + __macro(cublasDdot) \ + __macro(cublasCdotu) \ + __macro(cublasCdotc) \ + __macro(cublasZdotu) \ + __macro(cublasZdotc) \ + __macro(cublasSscal) \ + __macro(cublasDscal) \ + __macro(cublasCscal) \ + __macro(cublasCsscal) \ + __macro(cublasZscal) \ + __macro(cublasZdscal) \ + __macro(cublasSaxpy) \ + __macro(cublasDaxpy) \ + __macro(cublasCaxpy) \ + __macro(cublasZaxpy) \ + __macro(cublasScopy) \ + __macro(cublasDcopy) \ + __macro(cublasCcopy) \ + __macro(cublasZcopy) \ + __macro(cublasSswap) \ + __macro(cublasDswap) \ + __macro(cublasCswap) \ + __macro(cublasZswap) \ + __macro(cublasIsamax) \ + __macro(cublasIdamax) \ + __macro(cublasIcamax) \ + __macro(cublasIzamax) \ + __macro(cublasIsamin) \ + __macro(cublasIdamin) \ + __macro(cublasIcamin) \ + __macro(cublasIzamin) \ + __macro(cublasSasum) \ + __macro(cublasDasum) \ + __macro(cublasScasum) \ + __macro(cublasDzasum) \ + __macro(cublasSrot) \ + __macro(cublasDrot) \ + __macro(cublasCrot) \ + __macro(cublasCsrot) \ + __macro(cublasZrot) \ + __macro(cublasZdrot) \ + __macro(cublasSrotg) \ + __macro(cublasDrotg) \ + __macro(cublasCrotg) \ + __macro(cublasZrotg) \ + __macro(cublasSrotm) \ + __macro(cublasDrotm) \ + __macro(cublasSrotmg) \ + __macro(cublasDrotmg) \ + __macro(cublasSgemv) \ + __macro(cublasDgemv) \ + __macro(cublasCgemv) \ + __macro(cublasZgemv) \ + __macro(cublasSgbmv) \ + __macro(cublasDgbmv) \ + __macro(cublasCgbmv) \ + __macro(cublasZgbmv) \ + __macro(cublasStrmv) \ + __macro(cublasDtrmv) \ + __macro(cublasCtrmv) \ + __macro(cublasZtrmv) \ + __macro(cublasStbmv) \ + __macro(cublasDtbmv) \ + __macro(cublasCtbmv) \ + __macro(cublasZtbmv) \ + __macro(cublasStpmv) \ + __macro(cublasDtpmv) \ + __macro(cublasCtpmv) \ + __macro(cublasZtpmv) \ + __macro(cublasStrsv) \ + __macro(cublasDtrsv) \ + __macro(cublasCtrsv) \ + __macro(cublasZtrsv) \ + __macro(cublasStpsv) \ + __macro(cublasDtpsv) \ + __macro(cublasCtpsv) \ + __macro(cublasZtpsv) \ + __macro(cublasStbsv) \ + __macro(cublasDtbsv) \ + __macro(cublasCtbsv) \ + __macro(cublasZtbsv) \ + __macro(cublasSsymv) \ + __macro(cublasDsymv) \ + __macro(cublasCsymv) \ + __macro(cublasZsymv) \ + __macro(cublasChemv) \ + __macro(cublasZhemv) \ + __macro(cublasSsbmv) \ + __macro(cublasDsbmv) \ + __macro(cublasChbmv) \ + __macro(cublasZhbmv) \ + __macro(cublasSspmv) \ + __macro(cublasDspmv) \ + __macro(cublasChpmv) \ + __macro(cublasZhpmv) \ + __macro(cublasSger) \ + __macro(cublasDger) \ + __macro(cublasCgeru) \ + __macro(cublasCgerc) \ + __macro(cublasZgeru) \ + __macro(cublasZgerc) \ + __macro(cublasSsyr) \ + __macro(cublasDsyr) \ + __macro(cublasCsyr) \ + __macro(cublasZsyr) \ + __macro(cublasCher) \ + __macro(cublasZher) \ + __macro(cublasSspr) \ + __macro(cublasDspr) \ + __macro(cublasChpr) \ + __macro(cublasZhpr) \ + __macro(cublasSsyr2) \ + __macro(cublasDsyr2) \ + __macro(cublasCsyr2) \ + __macro(cublasZsyr2) \ + __macro(cublasCher2) \ + __macro(cublasZher2) \ + __macro(cublasSspr2) \ + __macro(cublasDspr2) \ + __macro(cublasChpr2) \ + __macro(cublasZhpr2) \ + __macro(cublasSgemm) \ + __macro(cublasDgemm) \ + __macro(cublasCgemm) \ + 
__macro(cublasZgemm) \ + __macro(cublasSsyrk) \ + __macro(cublasDsyrk) \ + __macro(cublasCsyrk) \ + __macro(cublasZsyrk) \ + __macro(cublasCherk) \ + __macro(cublasZherk) \ + __macro(cublasSsyr2k) \ + __macro(cublasDsyr2k) \ + __macro(cublasCsyr2k) \ + __macro(cublasZsyr2k) \ + __macro(cublasCher2k) \ + __macro(cublasZher2k) \ + __macro(cublasSsyrkx) \ + __macro(cublasDsyrkx) \ + __macro(cublasCsyrkx) \ + __macro(cublasZsyrkx) \ + __macro(cublasCherkx) \ + __macro(cublasZherkx) \ + __macro(cublasSsymm) \ + __macro(cublasDsymm) \ + __macro(cublasCsymm) \ + __macro(cublasZsymm) \ + __macro(cublasChemm) \ + __macro(cublasZhemm) \ + __macro(cublasStrsm) \ + __macro(cublasDtrsm) \ + __macro(cublasCtrsm) \ + __macro(cublasZtrsm) \ + __macro(cublasStrmm) \ + __macro(cublasDtrmm) \ + __macro(cublasCtrmm) \ + __macro(cublasZtrmm) \ + __macro(cublasSgeam) \ + __macro(cublasDgeam) \ + __macro(cublasCgeam) \ + __macro(cublasZgeam) \ + __macro(cublasSdgmm) \ + __macro(cublasDdgmm) \ + __macro(cublasCdgmm) \ + __macro(cublasZdgmm) + +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasCreate) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasDestroy) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetStream) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetPointerMode) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasGetPointerMode) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasDgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasCgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasZgemmBatched) +CUBLAS_BLAS_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP) + +} // namespace dynload + +static string ToString(cublasStatus_t status) { + switch (status) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + default: + return port::StrCat("<invalid cublas status: ", status, ">"); + } +} + +// cuBLAS has interfaces that permit pointers to be passed from either the host +// memory space or the device memory space; however, you must instruct it as to +// which address space those pointers are in with cublasSetPointerMode. +// +// This helper sets the cuBLAS pointer mode to a desired value for a cuBLAS call +// you are about to perform in a given scope. +// +// The prior cuBLAS pointer mode is retained and restored when this object goes +// out of scope. +class ScopedCublasPointerMode { + public: + // Note that, because the setting of the cublas pointer mode is fallible, + // construction of this scoped datatype must be paired with a call to + // Init(). + // + // Parameters: + // handle: The cublas library handle to act upon in setting the pointer mode. + explicit ScopedCublasPointerMode(CUDAExecutor *parent, cublasHandle_t handle) + : parent_(parent), handle_(handle), ok_(false) {} + + // Attempts the switch to the requested scoped pointer mode, new_mode. + // + // Note that when false is returned, an appropriate error has already been + // logged. 
+ bool Init(cublasPointerMode_t new_mode) { + cublasStatus_t ret = + dynload::cublasGetPointerMode_v2(parent_, handle_, &old_mode_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to get old cublas pointer mode: " << ToString(ret); + return ok_ = false; + } + + ret = dynload::cublasSetPointerMode_v2(parent_, handle_, new_mode); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set new cublas pointer mode: " << ToString(ret); + return ok_ = false; + } + + return ok_ = true; + } + + // Switches back to the prior pointer mode, if the switch operation was + // successful in the first place. + ~ScopedCublasPointerMode() { + if (ok_) { + cublasStatus_t ret = + dynload::cublasSetPointerMode_v2(parent_, handle_, old_mode_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set former cublas pointer mode: " + << ToString(ret); + } + } + } + + private: + CUDAExecutor *parent_; // Executor establishing this pointer mode for. + cublasHandle_t handle_; // Handle to the cuBLAS instance of interest. + cublasPointerMode_t old_mode_; // Prior cuBLAS pointer mode, to be restored. + bool ok_; // Whether the change was successful. +}; + +bool CUDABlas::Init() { + cublasStatus_t ret = dynload::cublasCreate_v2(parent_, &blas_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to create cublas handle: " << ToString(ret); + return false; + } + + return true; +} + +CUDABlas::CUDABlas(cuda::CUDAExecutor *parent) + : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {} + +CUDABlas::~CUDABlas() { + if (blas_ != nullptr) { + dynload::cublasDestroy_v2(parent_, blas_); + } +} + +bool CUDABlas::SetStream(Stream *stream) { + CHECK(stream != nullptr); + CHECK(AsCUDAStreamValue(stream) != nullptr); + CHECK(blas_ != nullptr); + cublasStatus_t ret = + dynload::cublasSetStream_v2(parent_, blas_, AsCUDAStreamValue(stream)); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cuBLAS calls: " << ToString(ret); + return false; + } + + return true; +} + +namespace { + +// Helper functions transforming blas arguments into cuBLAS arguments. + +cublasOperation_t CUDABlasTranspose(blas::Transpose trans) { + switch (trans) { + case blas::Transpose::kNoTranspose: + return CUBLAS_OP_N; + case blas::Transpose::kTranspose: + return CUBLAS_OP_T; + case blas::Transpose::kConjugateTranspose: + return CUBLAS_OP_C; + default: + LOG(FATAL) << "Invalid value of blas::Transpose."; + } +} + +cublasFillMode_t CUDABlasUpperLower(blas::UpperLower uplo) { + switch (uplo) { + case blas::UpperLower::kUpper: + return CUBLAS_FILL_MODE_UPPER; + case blas::UpperLower::kLower: + return CUBLAS_FILL_MODE_LOWER; + default: + LOG(FATAL) << "Invalid value of blas::UpperLower."; + } +} + +cublasDiagType_t CUDABlasDiagonal(blas::Diagonal diag) { + switch (diag) { + case blas::Diagonal::kUnit: + return CUBLAS_DIAG_UNIT; + case blas::Diagonal::kNonUnit: + return CUBLAS_DIAG_NON_UNIT; + default: + LOG(FATAL) << "Invalid value of blas::Diagonal."; + } +} + +cublasSideMode_t CUDABlasSide(blas::Side side) { + switch (side) { + case blas::Side::kLeft: + return CUBLAS_SIDE_LEFT; + case blas::Side::kRight: + return CUBLAS_SIDE_RIGHT; + default: + LOG(FATAL) << "Invalid value of blas::Side."; + } +} + +} // namespace + +template <typename FuncT, typename... Args> +bool CUDABlas::DoBlasInternal(FuncT cublas_func, Stream *stream, + bool pointer_mode_host, Args... 
args) { + mutex_lock lock{mu_}; + + CHECK(blas_ != nullptr); + if (!SetStream(stream)) { + return false; + } + + ScopedCublasPointerMode pointer_mode{parent_, blas_}; + if (!pointer_mode.Init(pointer_mode_host ? CUBLAS_POINTER_MODE_HOST + : CUBLAS_POINTER_MODE_DEVICE)) { + return false; + } + + cublasStatus_t ret = cublas_func(parent_, blas_, args...); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to run cuBLAS routine " << cublas_func.kName << ": " + << ToString(ret); + return false; + } + + return true; +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal(dynload::cublasSasum, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal(dynload::cublasDasum, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal( + dynload::cublasScasum, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal( + dynload::cublasDzasum, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSaxpy, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDaxpy, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal(dynload::cublasCaxpy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal(dynload::cublasZaxpy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasScopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool 
CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal(dynload::cublasCcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal(dynload::cublasZcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *result) { + return DoBlasInternal( + dynload::cublasSdot, stream, false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *result) { + return DoBlasInternal( + dynload::cublasDdot, stream, false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *result) { + return DoBlasInternal( + dynload::cublasCdotc, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *result) { + return DoBlasInternal( + dynload::cublasZdotc, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *result) { + return DoBlasInternal( + dynload::cublasCdotu, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *result) { + return DoBlasInternal( + dynload::cublasZdotu, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, 
+ const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal(dynload::cublasSnrm2, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal(dynload::cublasDnrm2, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal( + dynload::cublasScnrm2, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal( + dynload::cublasDznrm2, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy, float c, float s) { + return DoBlasInternal( + dynload::cublasSrot, stream, true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy, double c, + double s) { + return DoBlasInternal( + dynload::cublasDrot, stream, true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<float>> *x, int incx, + DeviceMemory<std::complex<float>> *y, int incy, + float c, float s) { + return DoBlasInternal(dynload::cublasCsrot, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<double>> *x, int incx, + DeviceMemory<std::complex<double>> *y, int incy, + double c, double s) { + return DoBlasInternal(dynload::cublasZdrot, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a, + DeviceMemory<float> *b, DeviceMemory<float> *c, + DeviceMemory<float> *s) { + return DoBlasInternal(dynload::cublasSrotg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(a), + CUDAMemoryMutable(b), CUDAMemoryMutable(c), + CUDAMemoryMutable(s)); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a, + DeviceMemory<double> *b, DeviceMemory<double> *c, + DeviceMemory<double> *s) { + return DoBlasInternal(dynload::cublasDrotg, stream, + false /* = pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAMemoryMutable(b), + CUDAMemoryMutable(c), CUDAMemoryMutable(s)); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a, + DeviceMemory<std::complex<float>> *b, + DeviceMemory<float> *c, + DeviceMemory<std::complex<float>> *s) { + return DoBlasInternal( + dynload::cublasCrotg, stream, false /* = 
pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)), + CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s))); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a, + DeviceMemory<std::complex<double>> *b, + DeviceMemory<double> *c, + DeviceMemory<std::complex<double>> *s) { + return DoBlasInternal( + dynload::cublasZrotg, stream, false /* = pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)), + CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s))); +} + +bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy, + const DeviceMemory<float> &param) { + return DoBlasInternal(dynload::cublasSrotm, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, + CUDAMemory(param)); +} + +bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy, + const DeviceMemory<double> &param) { + return DoBlasInternal(dynload::cublasDrotm, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, + CUDAMemory(param)); +} + +bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1, + DeviceMemory<float> *d2, DeviceMemory<float> *x1, + const DeviceMemory<float> &y1, + DeviceMemory<float> *param) { + return DoBlasInternal(dynload::cublasSrotmg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(d1), + CUDAMemoryMutable(d2), CUDAMemoryMutable(x1), + CUDAMemory(y1), CUDAMemoryMutable(param)); +} + +bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1, + DeviceMemory<double> *d2, DeviceMemory<double> *x1, + const DeviceMemory<double> &y1, + DeviceMemory<double> *param) { + return DoBlasInternal(dynload::cublasDrotmg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(d1), + CUDAMemoryMutable(d2), CUDAMemoryMutable(x1), + CUDAMemory(y1), CUDAMemoryMutable(param)); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasSscal, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDscal, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal( + dynload::cublasCsscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal( + dynload::cublasZdscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, + std::complex<float> alpha, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal( + dynload::cublasCscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool
CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, + std::complex<double> alpha, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal( + dynload::cublasZscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<float>> *x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal(dynload::cublasCswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<double>> *x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal(dynload::cublasZswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal(dynload::cublasIsamax, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal(dynload::cublasIdamax, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIcamax, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIzamax, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIsamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIdamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const 
DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIcamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIzamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal( + dynload::cublasSgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal( + dynload::cublasDgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasCgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, float alpha, const DeviceMemory<float> &a, + int lda, const DeviceMemory<float> &x, int incx, + float beta, DeviceMemory<float> *y, int incy) { + return DoBlasInternal( + dynload::cublasSgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) { + return DoBlasInternal( + dynload::cublasDgemv, stream, true /* = pointer_mode_host */, 
+ CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasCgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda) { + return DoBlasInternal( + dynload::cublasSger, stream, true /* = pointer_mode_host */, m, n, &alpha, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda) { + return DoBlasInternal( + dynload::cublasDger, stream, true /* = pointer_mode_host */, m, n, &alpha, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCgerc, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZgerc, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCgeru, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool 
CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZgeru, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasChbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasChemv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhemv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCher, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZher, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, 
&alpha, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCher2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZher2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &ap, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasChpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &ap, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *ap) { + return DoBlasInternal( + dynload::cublasChpr, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *ap) { + return DoBlasInternal( + dynload::cublasZhpr, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *ap) { + return DoBlasInternal( + dynload::cublasChpr2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, 
CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *ap) { + return DoBlasInternal( + dynload::cublasZhpr2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, float alpha, const DeviceMemory<float> &a, + int lda, const DeviceMemory<float> &x, int incx, + float beta, DeviceMemory<float> *y, int incy) { + return DoBlasInternal( + dynload::cublasSsbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) { + return DoBlasInternal( + dynload::cublasDsbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &ap, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSspmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap), + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &ap, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDspmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap), + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *ap) { + return DoBlasInternal(dynload::cublasSspr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *ap) { + return DoBlasInternal(dynload::cublasDspr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *ap) { + return DoBlasInternal(dynload::cublasSspr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, 
+ const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *ap) { + return DoBlasInternal(dynload::cublasDspr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSsymv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDsymv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *a, int lda) { + return DoBlasInternal(dynload::cublasSsyr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *a, int lda) { + return DoBlasInternal(dynload::cublasDsyr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda) { + return DoBlasInternal(dynload::cublasSsyr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda) { + return DoBlasInternal(dynload::cublasDsyr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStbmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDtbmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), 
incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasCtbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasZtbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStbsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDtbsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasCtbsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasZtbsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, DeviceMemory<float> *x, + int incx) { + return DoBlasInternal( + dynload::cublasStpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal( + dynload::cublasDtpmv, 
stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtpmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtpmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, DeviceMemory<float> *x, + int incx) { + return DoBlasInternal( + dynload::cublasStpsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal( + dynload::cublasDtpsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtpsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtpsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return 
DoBlasInternal(dynload::cublasDtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + VLOG(1) << port::Printf( + "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu " + "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f " + "c=%p ldc=%d", + static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha, + a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc); 
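+  // cuBLAS assumes column-major storage, so the leading dimension of A must be
+  // at least m when A is not transposed and at least k when it is (similarly,
+  // ldb must be at least k or at least n, respectively). The checks below only
+  // log a warning when these preconditions are violated; they do not fail the
+  // call.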
+ if (transa == blas::Transpose::kNoTranspose) { + if (lda < static_cast<int64>(m)) { + LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); " + "precondition violation"; + } + } else { + if (lda < static_cast<int64>(k)) { + LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k + << ") (transpose case); precondition violation"; + } + } + if (transb == blas::Transpose::kNoTranspose) { + if (ldb < static_cast<int64>(k)) { + LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k + << ") (no transpose case); precondition violation"; + } + } else { + if (ldb < static_cast<int64>(n)) { + LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); " + "precondition violation"; + } + } + return DoBlasInternal( + dynload::cublasSgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +template <typename T, typename FuncT> +port::Status CUDABlas::DoBlasGemmBatchedInternal( + FuncT cublas_func, Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha, + const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda, + const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta, + const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc, + int batch_count) { + std::vector<T *> a_ptr_vec, b_ptr_vec, c_ptr_vec; + for (int i = 0; i < batch_count; ++i) { + a_ptr_vec.push_back(static_cast<T *>(a_array[i]->opaque())); + b_ptr_vec.push_back(static_cast<T *>(b_array[i]->opaque())); + c_ptr_vec.push_back(static_cast<T *>(c_array[i]->opaque())); + } + + typedef typename 
CUDAComplexT<T>::type CUDA_T; + SE_ASSIGN_OR_RETURN( + std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_ptr_array, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + SE_ASSIGN_OR_RETURN( + std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_ptr_array, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + SE_ASSIGN_OR_RETURN( + std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_ptr_array, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + + if (!stream->ThenMemcpy(a_ptr_array->mutable_device_memory(), + a_ptr_vec.data(), batch_count * sizeof(T *)) + .ok() || + !stream->ThenMemcpy(b_ptr_array->mutable_device_memory(), + b_ptr_vec.data(), batch_count * sizeof(T *)) + .ok() || + !stream->ThenMemcpy(c_ptr_array->mutable_device_memory(), + c_ptr_vec.data(), batch_count * sizeof(T *)) + .ok()) { + return port::Status(port::error::INTERNAL, + "failed to copy memory from host to device in " + "CUDABlas::DoBlasGemmBatched"); + } + + bool ok = DoBlasInternal( + cublas_func, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), + const_cast<const CUDA_T **>(CUDAMemory(a_ptr_array->device_memory())), + lda, + const_cast<const CUDA_T **>(CUDAMemory(b_ptr_array->device_memory())), + ldb, CUDAComplex(&beta), + const_cast<CUDA_T **>(CUDAMemory(c_ptr_array->device_memory())), ldc, + batch_count); + + if (ok) { + return port::Status::OK(); + } + return port::Status(port::error::INTERNAL, + "failed BLAS call, see log for details"); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, float alpha, + const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda, + const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta, + const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc, + int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, double alpha, + const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda, + const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb, + double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex<float> alpha, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a_array, + int lda, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array, + int ldb, std::complex<float> beta, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex<double> alpha, + const 
port::ArraySlice<DeviceMemory<std::complex<double>> *> &a_array, + int lda, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array, + int ldb, std::complex<double> beta, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasZgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasChemm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZhemm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + float beta, DeviceMemory<std::complex<float>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasCherk, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + double beta, DeviceMemory<std::complex<double>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasZherk, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + float beta, DeviceMemory<std::complex<float>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasCher2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, &beta, + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int 
lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + double beta, DeviceMemory<std::complex<double>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasZher2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, &beta, + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + float beta, DeviceMemory<float> *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + double beta, DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream 
*stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsyr2k, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsyr2k, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal(dynload::cublasCsyr2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal(dynload::cublasZsyr2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + 
DeviceMemory<float> *b, int ldb) { + return DoBlasInternal( + dynload::cublasStrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda, + CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *b, int ldb) { + return DoBlasInternal( + dynload::cublasDtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda, + CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasCtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb, + CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasZtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb, + CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *b, int ldb) { + return DoBlasInternal(dynload::cublasStrsm, stream, + true /* = pointer_mode_host */, CUDABlasSide(side), + CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *b, int ldb) { + return DoBlasInternal(dynload::cublasDtrsm, stream, + true /* = pointer_mode_host */, CUDABlasSide(side), + CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasCtrsm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), 
CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasZtrsm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +} // namespace cuda + +namespace gpu = ::perftools::gputools; + +void initialize_cublas() { + gpu::port::Status status = + gpu::PluginRegistry::Instance() + ->RegisterFactory<gpu::PluginRegistry::BlasFactory>( + gpu::cuda::kCudaPlatformId, gpu::cuda::kCuBlasPlugin, "cuBLAS", + [](gpu::internal::StreamExecutorInterface + *parent) -> gpu::blas::BlasSupport * { + gpu::cuda::CUDAExecutor *cuda_executor = + dynamic_cast<gpu::cuda::CUDAExecutor *>(parent); + if (cuda_executor == nullptr) { + LOG(ERROR) + << "Attempting to initialize an instance of the cuBLAS " + << "support library with a non-CUDA StreamExecutor"; + return nullptr; + } + + gpu::cuda::CUDABlas *blas = + new gpu::cuda::CUDABlas(cuda_executor); + if (!blas->Init()) { + // Note: Init() will log a more specific error. + delete blas; + return nullptr; + } + return blas; + }); + + if (!status.ok()) { + LOG(ERROR) << "Unable to register cuBLAS factory: " + << status.error_message(); + } + + // Prime the cuBLAS DSO. The loader will log more information. + auto statusor = gpu::internal::CachedDsoLoader::GetCublasDsoHandle(); + if (!statusor.ok()) { + LOG(INFO) << "Unable to load cuBLAS DSO."; + } + + gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId, + gpu::PluginKind::kBlas, + gpu::cuda::kCuBlasPlugin); +} + +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER(register_cublas, + { perftools::gputools::initialize_cublas(); }); diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h new file mode 100644 index 0000000000..1dfec2ebc5 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -0,0 +1,100 @@ +// CUDA-specific support for BLAS functionality -- this wraps the cuBLAS library +// capabilities, and is only included into CUDA implementation code -- it will +// not introduce cuda headers into other code. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_ + +#include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/lib/stringpiece.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/plugin_registry.h" + +typedef struct cublasContext *cublasHandle_t; + +namespace perftools { +namespace gputools { + +class Stream; + +namespace cuda { + +// Opaque and unique identifier for the cuBLAS plugin. +extern const PluginId kCuBlasPlugin; + +class CUDAExecutor; + +// BLAS plugin for CUDA platform via cuBLAS library. +// +// This satisfies the platform-agnostic BlasSupport interface. 
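+//
+// Callers do not normally instantiate this class directly: the plugin registry
+// constructs one per CUDA StreamExecutor (see initialize_cublas() in
+// cuda_blas.cc), and BLAS work typically reaches it through a Stream, e.g.
+// (illustrative only):
+//
+//   stream.ThenBlasGemm(blas::Transpose::kNoTranspose,
+//                       blas::Transpose::kNoTranspose, m, n, k, alpha, a, lda,
+//                       b, ldb, beta, &c, ldc);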
+// +// Note that the cuBLAS handle that this encapsulates is implicitly tied to the +// context (and, as a result, the device) that the parent CUDAExecutor is tied +// to. This simply happens as an artifact of creating the cuBLAS handle when a +// CUDA context is active. +// +// Thread-safe post-initialization. +class CUDABlas : public blas::BlasSupport { + public: + explicit CUDABlas(CUDAExecutor *parent); + + // Allocates a cuBLAS handle. + bool Init(); + + // Releases the cuBLAS handle, if present. + ~CUDABlas() override; + + TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES + + private: + // Tells cuBLAS to enqueue the BLAS operation onto a particular Stream. + // + // cuBLAS is stateful, and can only be associated with one stream (in order to + // enqueue dispatch) at a given time. As a result, this generally must be + // invoked before calling into cuBLAS. + bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // A helper function that calls the real cuBLAS function together with error + // handling. + // + // cublas_func: cuBLAS function pointer. + // cublas_name: cuBLAS function name. + // stream: Stream to enqueue the BLAS operation onto. + // pointer_mode_host: Indicates whether the pointer to a scalar value is from + // host (true) or device (false). + // args: Arguments of cuBLAS function. + template <typename FuncT, typename... Args> + bool DoBlasInternal(FuncT cublas_func, Stream *stream, bool pointer_mode_host, + Args... args); + + // A helper function to implement DoBlasGemmBatched interfaces for generic + // types. + template <typename T, typename FuncT> + port::Status DoBlasGemmBatchedInternal( + FuncT cublas_func, Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha, + const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda, + const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta, + const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc, + int batch_count); + + // Mutex that guards the cuBLAS handle for this device. + mutex mu_; + + // CUDAExecutor which instantiated this CUDABlas. + // Immutable post-initialization. + CUDAExecutor *parent_; + + // cuBLAS library handle on the device. 
+ cublasHandle_t blas_ GUARDED_BY(mu_); + + SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc new file mode 100644 index 0000000000..c01c9978a1 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc @@ -0,0 +1,260 @@ +#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" + +#include <dirent.h> +#include <limits.h> +#include <link.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/sysmacros.h> +#include <unistd.h> +#include <algorithm> +#include <memory> +#include <vector> + +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/inlined_vector.h" +#include "tensorflow/stream_executor/lib/numbers.h" +#include "tensorflow/stream_executor/lib/process_state.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/str_util.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/stringpiece.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/platform/logging.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +static const char *kDriverVersionPath = "/proc/driver/nvidia/version"; + +string DriverVersionToString(DriverVersion version) { + return port::Printf("%d.%d", std::get<0>(version), std::get<1>(version)); +} + +string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) { + if (!version.ok()) { + return version.status().ToString(); + } + + return DriverVersionToString(version.ValueOrDie()); +} + +port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) { + std::vector<string> pieces = port::Split(value, '.'); + if (pieces.size() != 2) { + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("expected %%d.%%d form for driver version; got \"%s\"", + value.c_str())}; + } + + int major; + int minor; + if (!port::safe_strto32(pieces[0], &major)) { + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("could not parse major version number \"%s\" as an " + "integer from string \"%s\"", + pieces[0].c_str(), value.c_str())}; + } + if (!port::safe_strto32(pieces[1], &minor)) { + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("could not parse minor version number \"%s\" as an " + "integer from string \"%s\"", + pieces[1].c_str(), value.c_str())}; + } + + DriverVersion result{major, minor}; + VLOG(2) << "version string \"" << value << "\" made value " + << DriverVersionToString(result); + return result; +} + +// -- class Diagnostician + +string Diagnostician::GetDevNodePath(int dev_node_ordinal) { + return port::StrCat("/dev/nvidia", dev_node_ordinal); +} + +void Diagnostician::LogDiagnosticInformation() { + if (access(kDriverVersionPath, F_OK) != 0) { + LOG(INFO) << "kernel driver does not appear to be running on this host " + << "(" << port::Hostname() << "): " + << "/proc/driver/nvidia/version does not exist"; + return; + } + auto dev0_path = GetDevNodePath(0); + if (access(dev0_path.c_str(), F_OK) != 0) { + LOG(INFO) << "no NVIDIA GPU device is present: " << dev0_path + << " does not exist"; + return; + } + + LOG(INFO) << "retrieving CUDA diagnostic information for host: " + << port::Hostname(); + + 
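+  // Both the kernel driver and a device node are present, so log the driver
+  // and library versions to help diagnose libcuda/kernel module mismatches.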
+ LogDriverVersionInformation(); +} + +/* static */ void Diagnostician::LogDriverVersionInformation() { + LOG(INFO) << "hostname: " << port::Hostname(); + + if (VLOG_IS_ON(1)) { + const char *value = getenv("LD_LIBRARY_PATH"); + string library_path = value == nullptr ? "" : value; + VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\""; + + std::vector<string> pieces = port::Split(library_path, ':'); + for (auto piece : pieces) { + if (piece.empty()) { + continue; + } + DIR *dir = opendir(piece.c_str()); + if (dir == nullptr) { + VLOG(1) << "could not open \"" << piece << "\""; + continue; + } + while (dirent *entity = readdir(dir)) { + VLOG(1) << piece << " :: " << entity->d_name; + } + closedir(dir); + } + } + + port::StatusOr<DriverVersion> dso_version = FindDsoVersion(); + LOG(INFO) << "libcuda reported version is: " + << DriverVersionStatusToString(dso_version); + + port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion(); + LOG(INFO) << "kernel reported version is: " + << DriverVersionStatusToString(kernel_version); + if (kernel_version.ok() && dso_version.ok()) { + WarnOnDsoKernelMismatch(dso_version, kernel_version); + } +} + +// Iterates through loaded DSOs with DlIteratePhdrCallback to find the +// driver-interfacing DSO version number. Returns it as a string. +port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() { + port::StatusOr<DriverVersion> result{port::Status{ + port::error::NOT_FOUND, + "was unable to find libcuda.so DSO loaded into this program"}}; + + // Callback used when iterating through DSOs. Looks for the driver-interfacing + // DSO and yields its version number into the callback data, when found. + auto iterate_phdr = + [](struct dl_phdr_info *info, size_t size, void *data) -> int { + if (strstr(info->dlpi_name, "libcuda.so")) { + VLOG(1) << "found DLL info with name: " << info->dlpi_name; + char resolved_path[PATH_MAX] = {0}; + if (realpath(info->dlpi_name, resolved_path) == nullptr) { + return 0; + } + VLOG(1) << "found DLL info with resolved path: " << resolved_path; + const char *slash = rindex(resolved_path, '/'); + if (slash == nullptr) { + return 0; + } + const char *so_suffix = ".so."; + const char *dot = strstr(slash, so_suffix); + if (dot == nullptr) { + return 0; + } + string dso_version = dot + strlen(so_suffix); + // TODO(b/22689637): Eliminate the explicit namespace if possible. + auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64"); + auto result = static_cast<port::StatusOr<DriverVersion> *>(data); + *result = StringToDriverVersion(stripped_dso_version); + return 1; + } + return 0; + }; + + dl_iterate_phdr(iterate_phdr, &result); + + return result; +} + +port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion( + const string &driver_version_file_contents) { + static const char *kDriverFilePrelude = "Kernel Module "; + size_t offset = driver_version_file_contents.find(kDriverFilePrelude); + if (offset == string::npos) { + return port::Status{ + port::error::NOT_FOUND, + port::StrCat("could not find kernel module information in " + "driver version file contents: \"", + driver_version_file_contents, "\"")}; + } + + string version_and_rest = driver_version_file_contents.substr( + offset + strlen(kDriverFilePrelude), string::npos); + size_t space_index = version_and_rest.find(" "); + auto kernel_version = version_and_rest.substr(0, space_index); + // TODO(b/22689637): Eliminate the explicit namespace if possible. 
+ auto stripped_kernel_version = + port::StripSuffixString(kernel_version, ".ld64"); + return StringToDriverVersion(stripped_kernel_version); +} + +void Diagnostician::WarnOnDsoKernelMismatch( + port::StatusOr<DriverVersion> dso_version, + port::StatusOr<DriverVersion> kernel_version) { + if (kernel_version.ok() && dso_version.ok() && + dso_version.ValueOrDie() == kernel_version.ValueOrDie()) { + LOG(INFO) << "kernel version seems to match DSO: " + << DriverVersionToString(kernel_version.ValueOrDie()); + } else { + LOG(ERROR) << "kernel version " + << DriverVersionStatusToString(kernel_version) + << " does not match DSO version " + << DriverVersionStatusToString(dso_version) + << " -- cannot find working devices in this configuration"; + } +} + + +port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() { + FILE *driver_version_file = fopen(kDriverVersionPath, "r"); + if (driver_version_file == nullptr) { + return port::Status{ + port::error::PERMISSION_DENIED, + port::StrCat("could not open driver version path for reading: ", + kDriverVersionPath)}; + } + + static const int kContentsSize = 1024; + port::InlinedVector<char, 4> contents(kContentsSize); + size_t retcode = + fread(contents.begin(), 1, kContentsSize - 2, driver_version_file); + if (retcode < kContentsSize - 1) { + contents[retcode] = '\0'; + } + contents[kContentsSize - 1] = '\0'; + + if (retcode != 0) { + LOG(INFO) << "driver version file contents: \"\"\"" << contents.begin() + << "\"\"\""; + fclose(driver_version_file); + return FindKernelModuleVersion(string{contents.begin()}); + } + + auto status = + port::Status{port::error::INTERNAL, + port::StrCat("failed to read driver version file contents: ", + kDriverVersionPath, "; ferror: ", + ferror(driver_version_file))}; + fclose(driver_version_file); + return status; +} + + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.h b/tensorflow/stream_executor/cuda/cuda_diagnostics.h new file mode 100644 index 0000000000..005b3dc310 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.h @@ -0,0 +1,85 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_ + +#include <tuple> + +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// e.g. DriverVersion{331, 79} +using DriverVersion = std::tuple<int, int>; + +// Converts a parsed driver version to string form. +string DriverVersionToString(DriverVersion version); + +// Converts a parsed driver version or status value to natural string form. +string DriverVersionStatusToString(port::StatusOr<DriverVersion> version); + +// Converts a string of a form like "331.79" to a DriverVersion{331, 79}. +port::StatusOr<DriverVersion> StringToDriverVersion(const string &value); + +class Diagnostician { + public: + // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is + // not initializing). + // + // Note: if we're running on a machine that has no GPUs, we don't want to + // produce very much log spew beyond saying, "looks like there's no CUDA + // kernel + // module running". + // + // Note: we use non-Google-File:: API here because we may be called before + // InitGoogle has completed. 
+ static void LogDiagnosticInformation(); + + // Given the driver version file contents, finds the kernel module version and + // returns it as a DriverVersion. + // + // This is solely used for more informative log messages when the user is + // running on a machine that happens to have a libcuda/kernel driver mismatch. + static port::StatusOr<DriverVersion> FindKernelModuleVersion( + const string &driver_version_file_contents); + + // Extracts the kernel driver version from the current host. + static port::StatusOr<DriverVersion> FindKernelDriverVersion(); + + // Iterates through loaded DSOs with DlIteratePhdrCallback to find the + // driver-interfacing DSO version number. Returns it as a DriverVersion. + static port::StatusOr<DriverVersion> FindDsoVersion(); + + // Logs information about the kernel driver version and userspace driver + // library version. + static void LogDriverVersionInformation(); + + private: + // Logs information about the loaded nvidia-related kernel modules. + static void LogKernelModuleInformation(); + + // Given the DSO version number and the driver version file contents, extracts + // the driver version and compares, warning the user in the case of + // incompatibility. + // + // This is solely used for more informative log messages when the user is + // running on a machine that happens to have a libcuda/kernel driver mismatch. + static void WarnOnDsoKernelMismatch( + port::StatusOr<DriverVersion> dso_version, + port::StatusOr<DriverVersion> kernel_version); + + // Logs information about the dev nodes present on this machine: their + // existence, permissions, accessibility from this uid/gid. + static void LogDevNodeDiagnosticInformation(); + + static string GetDevNodePath(int dev_node_ordinal); + + SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc new file mode 100644 index 0000000000..6e4403512b --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -0,0 +1,1074 @@ +#include "tensorflow/stream_executor/cuda/cuda_dnn.h" + +#include <dlfcn.h> +#include <functional> + +#include "tensorflow/stream_executor/dnn.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/lib/env.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/threadpool.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/cuda/cuda_activation.h" +#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/cuda/cuda_platform.h" +#include "third_party/gpus/cuda/include/cudnn.h" + +namespace { + +// Converts (via narrowing) a WideT value to a NarrowT value, and checks that +// the value is unchanged by the conversion. 
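+// For example, CheckedNarrowing<int64, int>(batch_descriptor.count())
+// CHECK-fails if the batch count does not fit in an int.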
+template <typename WideT, typename NarrowT> +NarrowT CheckedNarrowing(const WideT& wide) { + NarrowT narrow = wide; + CHECK_EQ(narrow, wide) + << "checked narrowing failed; values not equal post-conversion"; + return narrow; +} + +} // namespace + +namespace perftools { +namespace gputools { + +using dnn::BatchDescriptor; +using dnn::FilterDescriptor; +using dnn::ConvolutionDescriptor; +using dnn::PoolingDescriptor; + +namespace cuda { + +PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin); + +extern CUstream AsCUDAStreamValue(Stream* stream); + +string ToString(cudnnStatus_t status) { + switch (status) { + case CUDNN_STATUS_SUCCESS: + return "CUDNN_STATUS_SUCCESS"; + case CUDNN_STATUS_NOT_INITIALIZED: + return "CUDNN_STATUS_NOT_INITIALIZED"; + case CUDNN_STATUS_ALLOC_FAILED: + return "CUDNN_STATUS_ALLOC_FAILED"; + case CUDNN_STATUS_BAD_PARAM: + return "CUDNN_STATUS_BAD_PARAM"; + case CUDNN_STATUS_INTERNAL_ERROR: + return "CUDNN_STATUS_INTERNAL_ERROR"; + case CUDNN_STATUS_INVALID_VALUE: + return "CUDNN_STATUS_INVALID_VALUE"; + case CUDNN_STATUS_ARCH_MISMATCH: + return "CUDNN_STATUS_ARCH_MISMATCH"; + case CUDNN_STATUS_MAPPING_ERROR: + return "CUDNN_STATUS_MAPPING_ERROR"; + case CUDNN_STATUS_EXECUTION_FAILED: + return "CUDNN_STATUS_EXECUTION_FAILED"; + case CUDNN_STATUS_NOT_SUPPORTED: + return "CUDNN_STATUS_NOT_SUPPORTED"; + case CUDNN_STATUS_LICENSE_ERROR: + return "CUDNN_STATUS_LICENSE_ERROR"; + default: + return port::StrCat("<unknown cudnn status: ", static_cast<int>(status), + ">"); + } +} + +namespace dynload { + +static port::ThreadPool* InitCudnnThreadpool() { + port::ThreadPool* cudnn_threadpool_; + port::ThreadOptions options; + // TBD(keveman): Conservatively setting the stack size and guard size to 2MB, + // until we can get some guarantees from NVIDIA on the minimum stack space + // they will work with. + options.stack_size = 2 * 1024 * 1024; + options.guard_size = 2 * 1024 * 1024; + cudnn_threadpool_ = new port::ThreadPool(port::Env::Default(), options, + "cudnn_threadpool", 1); + CHECK(cudnn_threadpool_); + return cudnn_threadpool_; +} + +static mutex cudnn_threadpool_mu(LINKER_INITIALIZED); +static port::ThreadPool* GetCudaThreadpool() { + mutex_lock lock(cudnn_threadpool_mu); + static port::ThreadPool* cudnn_threadpool = InitCudnnThreadpool(); + return cudnn_threadpool; +} + +#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name) \ + struct DynLoadShim__##__name { \ + static const char* kName; \ + typedef std::add_pointer<decltype(::__name)>::type FuncPointerT; \ + static void* GetDsoHandle() { \ + static auto result = internal::CachedDsoLoader::GetCudnnDsoHandle(); \ + return result.ValueOrDie(); \ + } \ + static FuncPointerT DynLoad() { \ + static void* f = dlsym(GetDsoHandle(), kName); \ + if (f == nullptr) { \ + LOG(FATAL) << "could not find " << kName \ + << " in cudnn DSO; dlerror: " << dlerror(); \ + } \ + return reinterpret_cast<FuncPointerT>(f); \ + } \ + template <typename... Args> \ + void CallWrapper(CUDAExecutor* parent, port::Notification* n, \ + cudnnStatus_t* retval, const Args&... args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + *retval = DynLoad()(args...); \ + n->Notify(); \ + } \ + template <typename... Args> \ + cudnnStatus_t operator()(CUDAExecutor* parent, Args... 
args) { \ + port::Notification n; \ + cudnnStatus_t retval; \ + auto call_func_closure = \ + std::bind(&DynLoadShim__##__name::CallWrapper<Args...>, this, \ + parent, &n, &retval, args...); \ + GetCudaThreadpool()->Schedule(call_func_closure); \ + n.WaitForNotification(); \ + return retval; \ + } \ + } __name; \ + const char* DynLoadShim__##__name::kName = #__name; + +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor) __macro( \ + cudnnGetConvolutionNdForwardOutputDim) \ + __macro(cudnnGetConvolutionForwardAlgorithm) __macro( \ + cudnnCreateTensorDescriptor) __macro(cudnnDestroyTensorDescriptor) \ + __macro(cudnnCreateFilterDescriptor) \ + __macro(cudnnSetFilter4dDescriptor) \ + __macro(cudnnSetPooling2dDescriptor) \ + __macro(cudnnDestroyFilterDescriptor) \ + __macro(cudnnCreateConvolutionDescriptor) \ + __macro(cudnnCreatePoolingDescriptor) \ + __macro(cudnnAddTensor) \ + __macro(cudnnDestroyPoolingDescriptor) + +CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH + +// clang-format off +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetConvolution2dDescriptor) \ + __macro(cudnnDestroyConvolutionDescriptor) \ + __macro(cudnnCreate) \ + __macro(cudnnDestroy) \ + __macro(cudnnSetStream) \ + __macro(cudnnActivationForward) \ + __macro(cudnnConvolutionForward) \ + __macro(cudnnConvolutionBackwardData) \ + __macro(cudnnConvolutionBackwardFilter) \ + __macro(cudnnGetConvolutionForwardWorkspaceSize) \ + __macro(cudnnTransformTensor) \ + __macro(cudnnPoolingForward) \ + __macro(cudnnPoolingBackward) +// clang-format on + +CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH + +} // namespace dynload + +namespace { + +cudnnHandle_t ToHandle(void* opaque_handle) { + return static_cast<cudnnHandle_t>(opaque_handle); +} + +} // namespace + +CudnnSupport::CudnnSupport(CUDAExecutor* parent) + : parent_(parent), dnn_handle_(nullptr) {} + +CudnnSupport::~CudnnSupport() { + auto status = dynload::cudnnDestroy(parent_, ToHandle(dnn_handle_)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn handle: " << ToString(status); + } +} + +port::Status CudnnSupport::Init() { + auto status = dynload::cudnnCreate( + parent_, reinterpret_cast<cudnnHandle_t*>(&dnn_handle_)); + if (status == CUDNN_STATUS_SUCCESS) { + return port::Status::OK(); + } + + LOG(ERROR) << "could not create cudnn handle: " << ToString(status); + if (status == CUDNN_STATUS_NOT_INITIALIZED) { + // This is the error code that the driver returns when we're not running a + // sufficient CUDA driver -- cudnn requires 6.5+ compatibility, which + // starts with the 340.XX driver series. + auto result = cuda::Diagnostician::FindKernelDriverVersion(); + if (!result.ok()) { + LOG(ERROR) << "error retrieving driver version: " + << DriverVersionStatusToString(result); + } else { + const auto& version = result.ValueOrDie(); + LOG(INFO) << "running driver version: " << DriverVersionToString(version); + if (std::get<0>(version) < 340) { + LOG(ERROR) + << "cudnn library is only supported on 340.XX+ driver versions"; + } + } + } + return port::Status{port::error::INTERNAL, + port::StrCat("cudnn library could not create a handle: ", + ToString(status))}; +} + +// Turns a BatchDescriptor structure into a cudnn tensor handle within a scope. 
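+// Illustrative use (see DoConvolve below):
+//
+//   ScopedTensorDescriptor input_4d{parent_, batch_descriptor, CUDNN_DATA_FLOAT};
+//   // ... input_4d.handle() is then passed to the cudnn call ...
+//
+// The wrapped descriptor is destroyed when the scope ends.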
+class ScopedTensorDescriptor { + public: + ScopedTensorDescriptor(CUDAExecutor* parent, + const BatchDescriptor& batch_descriptor, + cudnnDataType_t elem_type) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreateTensorDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn tensor descriptor: " + << ToString(status); + } + + cudnnTensorFormat_t format; + switch (batch_descriptor.layout()) { + case dnn::DataLayout::kBatchYXDepth: + format = CUDNN_TENSOR_NHWC; + break; + case dnn::DataLayout::kBatchDepthYX: + format = CUDNN_TENSOR_NCHW; + break; + default: + LOG(FATAL) << "Unsupported tensor format " + << DataLayoutString(batch_descriptor.layout()); + break; + } + + status = dynload::cudnnSetTensor4dDescriptor( + parent_, handle_, format, elem_type, + CheckedNarrowing<int64, int>(batch_descriptor.count()), + CheckedNarrowing<int64, int>(batch_descriptor.feature_map_count()), + CheckedNarrowing<int64, int>(batch_descriptor.height()), + CheckedNarrowing<int64, int>(batch_descriptor.width())); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn tensor descriptor: " + << ToString(status); + } + } + + ~ScopedTensorDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyTensorDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn tensor descriptor: " + << ToString(status); + } + } + + cudnnTensorDescriptor_t handle() const { return handle_; } + + private: + CUDAExecutor* parent_; // Parent executor. Not owned. + cudnnTensorDescriptor_t handle_; // Owned. + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedTensorDescriptor); +}; + +// Turns a FilterDescriptor structure into a cudnn filter handle within a scope. +class ScopedFilterDescriptor { + public: + ScopedFilterDescriptor(CUDAExecutor* parent, + const FilterDescriptor& filter_descriptor, + cudnnDataType_t elem_type) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreateFilterDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn filter descriptor: " + << ToString(status); + } + + // TODO(b/23032134): Even if the filter layout is not supported, + // cudnnSetFilter4DDescriptor will return CUDNN_STATUS_SUCCESS because it + // does not take layout as an input. Maybe force cuDNN by giving wrong + // inputs intentionally? + switch (filter_descriptor.layout()) { + case dnn::FilterLayout::kOutputInputYX: + break; + default: + LOG(FATAL) << "Unsupported filter format " + << FilterLayoutString(filter_descriptor.layout()); + break; + } + + status = dynload::cudnnSetFilter4dDescriptor( + parent_, handle_, elem_type, + CheckedNarrowing<int64, int>( + filter_descriptor.output_feature_map_count()), + CheckedNarrowing<int64, int>( + filter_descriptor.input_feature_map_count()), + CheckedNarrowing<int64, int>(filter_descriptor.input_filter_height()), + CheckedNarrowing<int64, int>(filter_descriptor.input_filter_width())); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn filter descriptor: " + << ToString(status); + } + } + + ~ScopedFilterDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyFilterDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn filter descriptor: " + << ToString(status); + } + } + + cudnnFilterDescriptor_t handle() const { return handle_; } + + private: + // Parent executor object. Not owned. 
+ CUDAExecutor* parent_; + + // cudnn filter descriptor this object creates. Owned. + cudnnFilterDescriptor_t handle_; + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedFilterDescriptor); +}; + +// Turns a ConvolutionDescriptor structure into a cudnn convolution handle +// within a scope. +class ScopedConvolutionDescriptor { + public: + ScopedConvolutionDescriptor( + CUDAExecutor* parent, const ConvolutionDescriptor& convolution_descriptor) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreateConvolutionDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn convolution descriptor: " + << ToString(status); + } + + status = dynload::cudnnSetConvolution2dDescriptor( + parent_, handle_, CheckedNarrowing<int64, int>( + convolution_descriptor.zero_padding_height()), + CheckedNarrowing<int64, int>( + convolution_descriptor.zero_padding_width()), + CheckedNarrowing<int64, int>( + convolution_descriptor.vertical_filter_stride()), + CheckedNarrowing<int64, int>( + convolution_descriptor.horizontal_filter_stride()), + // TODO(leary) not sure what the following two params do. + 1 /* = upscale_input_x */, 1 /* = upscale_input_y */, + // NOTE(keveman): cuDNN supports convolution and cross correlation. + // However, almost all the use cases do cross correlation, so just hard + // coding it here. + CUDNN_CROSS_CORRELATION); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn convolution descriptor: " + << ToString(status); + } + } + + ~ScopedConvolutionDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyConvolutionDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn convolution descriptor: " + << ToString(status); + } + } + + cudnnConvolutionDescriptor_t handle() const { return handle_; } + + private: + CUDAExecutor* parent_; // Parent executor. Not owned. + cudnnConvolutionDescriptor_t handle_; // Owned. + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); +}; + +// Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle +// within a scope. +class ScopedPoolingDescriptor { + public: + ScopedPoolingDescriptor(CUDAExecutor* parent, + const PoolingDescriptor& pooling_descriptor) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreatePoolingDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn pooling descriptor: " + << ToString(status); + } + status = dynload::cudnnSetPooling2dDescriptor( + parent_, handle_, + (pooling_descriptor.mode() == dnn::PoolingMode::kMaximum + ? 
CUDNN_POOLING_MAX + : CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING), + CheckedNarrowing<int64, int>(pooling_descriptor.window_height()), + CheckedNarrowing<int64, int>(pooling_descriptor.window_width()), + CheckedNarrowing<int64, int>(pooling_descriptor.vertical_padding()), + CheckedNarrowing<int64, int>(pooling_descriptor.horizontal_padding()), + CheckedNarrowing<int64, int>(pooling_descriptor.vertical_stride()), + CheckedNarrowing<int64, int>(pooling_descriptor.horizontal_stride())); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn pooling descriptor: " + << ToString(status); + } + } + ~ScopedPoolingDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyPoolingDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn pooling descriptor: " + << ToString(status); + } + } + + cudnnPoolingDescriptor_t handle() const { return handle_; } + + private: + CUDAExecutor* parent_; // Parent executor. Not owned. + cudnnPoolingDescriptor_t handle_; // Owned. + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +bool CudnnSupport::DoConvolve( + Stream* stream, const BatchDescriptor& batch_descriptor, + const DeviceMemory<float>& input_data, + const FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& output_descriptor, + DeviceMemory<float>* output_data) { + ScopedTensorDescriptor input_4d{parent_, batch_descriptor, CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor output_4d{parent_, output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT}; + ScopedConvolutionDescriptor conv{parent_, convolution_descriptor}; + + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status); + } + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. + float beta = 0.0; + + // The NO_WORKSPACE versions are possibly slower for certain shapes, but + // not so for the shapes currently used by Brain. Also, it seems prudent to + // keep cuMemAlloc off the critical path. 
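+  // For reference, a workspace-using variant (sketch only; not what is done +  // here) would request CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, query the scratch +  // size via cudnnGetConvolutionForwardWorkspaceSize(handle, input_4d, filter, +  // conv, output_4d, algo, &workspace_bytes), allocate that many bytes on the +  // device, and pass the pointer/size to cudnnConvolutionForward instead of +  // nullptr/0.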
+ cudnnConvolutionFwdAlgo_t algo; + status = dynload::cudnnGetConvolutionForwardAlgorithm( + parent_, ToHandle(dnn_handle_), input_4d.handle(), filter.handle(), + conv.handle(), output_4d.handle(), CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0, + &algo); + + CHECK_EQ(status, CUDNN_STATUS_SUCCESS) + << "Unable to find a suitable algorithm for doing forward convolution"; + + status = dynload::cudnnConvolutionForward( + parent_, ToHandle(dnn_handle_), &alpha, input_4d.handle(), + input_data.opaque(), filter.handle(), filter_data.opaque(), conv.handle(), + algo, nullptr /* workspace ptr */, 0 /* workspace size */, &beta, + output_4d.handle(), output_data->opaque()); + + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to enqueue convolution on stream: " + << ToString(status); + return false; + } + + return true; +} + +bool CudnnSupport::DoConvolve( + Stream* stream, const BatchDescriptor& batch_descriptor, + const DeviceMemory<double>& input_data, + const FilterDescriptor& filter_descriptor, + const DeviceMemory<double>& filter_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& output_descriptor, + DeviceMemory<double>* output_data) { + LOG(ERROR) << "double-based DNN not yet implemented"; + return false; +} + +DeviceMemory<float> CudnnSupport::MaybeTransformLayout( + Stream* stream, BatchDescriptor* output_descriptor, + DeviceMemory<float> backward_output_data, + std::unique_ptr<TemporaryDeviceMemory<float>>* transform_scratch) { + if (output_descriptor->layout() == dnn::DataLayout::kBatchDepthYX) { + return backward_output_data; + } + CHECK(output_descriptor->layout() == dnn::DataLayout::kBatchYXDepth); + *transform_scratch = + stream->AllocateTemporaryArray<float>(backward_output_data.ElementCount()) + .ConsumeValueOrDie(); + BatchDescriptor transformed_output_descriptor; + transformed_output_descriptor.CloneFrom(*output_descriptor); + transformed_output_descriptor.set_layout(dnn::DataLayout::kBatchDepthYX); + ScopedTensorDescriptor orig_out_back_4d{parent_, *output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor transformed_out_back_4d{ + parent_, transformed_output_descriptor, CUDNN_DATA_FLOAT}; + + float alpha = 1.0f; + float beta = 0.0f; + auto status = dynload::cudnnTransformTensor( + parent_, ToHandle(dnn_handle_), &alpha, orig_out_back_4d.handle(), + backward_output_data.opaque(), &beta, transformed_out_back_4d.handle(), + (*transform_scratch)->mutable_device_memory()->opaque()); + + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "Failed to transform the data layout."; + } + output_descriptor->set_layout(dnn::DataLayout::kBatchDepthYX); + return (*transform_scratch)->device_memory(); +} + +bool CudnnSupport::DoConvolveBackwardData( + Stream* stream, const FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const BatchDescriptor& output_descriptor_in, + DeviceMemory<float> backward_output_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& input_descriptor, + DeviceMemory<float>* backward_input_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status); + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. 
+ float beta = 0.0; + + // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass. + BatchDescriptor output_descriptor; + output_descriptor.CloneFrom(output_descriptor_in); + std::unique_ptr<TemporaryDeviceMemory<float>> transform_scratch; + backward_output_data = MaybeTransformLayout( + stream, &output_descriptor, backward_output_data, &transform_scratch); + + ScopedTensorDescriptor out_back_4d{parent_, output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor in_back_4d{parent_, input_descriptor, + CUDNN_DATA_FLOAT}; + ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT}; + ScopedConvolutionDescriptor conv{parent_, convolution_descriptor}; + + status = dynload::cudnnConvolutionBackwardData( + parent_, ToHandle(dnn_handle_), &alpha, filter.handle(), + filter_data.opaque(), out_back_4d.handle(), backward_output_data.opaque(), + conv.handle(), &beta, in_back_4d.handle(), backward_input_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to enqueue convolution on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoConvolveBackwardFilter( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_descriptor_in, + DeviceMemory<float> backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory<float>* backward_filter_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status); + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. + float beta = 0.0; + + // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass. + BatchDescriptor output_descriptor; + output_descriptor.CloneFrom(output_descriptor_in); + std::unique_ptr<TemporaryDeviceMemory<float>> transform_scratch; + backward_output_data = MaybeTransformLayout( + stream, &output_descriptor, backward_output_data, &transform_scratch); + + ScopedTensorDescriptor out_back_4d{parent_, output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor input_4d{parent_, input_descriptor, CUDNN_DATA_FLOAT}; + ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT}; + ScopedConvolutionDescriptor conv{parent_, convolution_descriptor}; + + status = dynload::cudnnConvolutionBackwardFilter( + parent_, ToHandle(dnn_handle_), &alpha, input_4d.handle(), + input_data.opaque(), out_back_4d.handle(), backward_output_data.opaque(), + conv.handle(), &beta, filter.handle(), backward_filter_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to enqueue convolution on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoMatMul(Stream* stream, + const DeviceMemory<float>& input_data, + const DeviceMemory<float>& weights, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + if (input_dimensions.count() != output_dimensions.count()) { + LOG(ERROR) << "MatMul input and output dimensions are not compatible."; + return false; + } + + // We do not permute the input or output, instead we just + // reinterpret the layout. 
We are working with row-major matrices + // and the rows of the input and output correspond to batch, so + // batch has to be outermost in both the input and output. + // + // By adding transposes to the BLAS gemm call we could perhaps make + // the kYXDepthBatch layout work as well, but there has been no need + // for that so far. + if (input_dimensions.layout() != dnn::DataLayout::kBatchYXDepth && + input_dimensions.layout() != dnn::DataLayout::kBatchDepthYX) { + LOG(ERROR) << "Unsupported MatMul input layout."; + return false; + } + if (output_dimensions.layout() != dnn::DataLayout::kBatchYXDepth && + output_dimensions.layout() != dnn::DataLayout::kBatchDepthYX) { + LOG(ERROR) << "Unsupported MatMul output layout."; + return false; + } + + if (output_dimensions.width() == 1 && output_dimensions.height() == 1) { + // This is a fast path that also supports the kBatchYXDepth layout. + + // The matrices here are in row-major format while BLAS expects + // column-major, i.e. our matrices are transposed as far as BLAS + // is concerned. So we need to compute output^T = + // input^T*weights^T. There is no parameter for transposing the + // output in BLAS gemm, but instead we can transpose both sides of + // the equality to see that this is equivalent to + // output=weights*input. So we only need to swap the order of + // weights and input in the matrix product to correct for the + // row-major versus column-major difference. + const float alpha = 1.0f; // Take the matrix product without scaling it. + const float beta = 0.0f; // Ignore the original values in output_data. + const int64 m = output_dimensions.NodesAcrossFeatureMaps(); + const int64 n = input_dimensions.count(); + const int64 k = input_dimensions.NodesAcrossFeatureMaps(); + stream->ThenBlasGemm(blas::Transpose::kNoTranspose, + blas::Transpose::kNoTranspose, m, n, k, alpha, weights, + m, input_data, k, beta, output_data, m); + } else { + // This is a slower and more complex path that supports output + // width() * height() > 1, though it only supports the + // kBatchYXDepth layout. It does support kBatchDepthYX if output + // feature_map_count() == 1, as then there is no difference + // between the two layouts. + // + // The operation here is the same as above, except that we have to + // do the matrix multiplication for each (y,x) output coordinate + // separately. We then interpret weights as containing K = width() + // * height() different matrices, each of which we multiply onto the + // matrix from input_data, yielding K matrix products. We then + // combine these together into one matrix by concatenating all the + // first rows of these matrices, then all the second rows and so + // on. We can do this with a batched matrix multiplication, where + // the result is written to a different submatrix of the output + // for each matrix multiplication. + // + // The reason that we only support the kBatchYXDepth output layout + // is that we have to do something in the depth for each (y,x) + // coordinate. The kBatchYXDepth layout has the depth information + // for each point (y,x) in contiguous memory while the + // kBatchDepthYX layout does not. + // + // TODO(broune): Consider a special case for when output depth == + // 1, as then possibly this could all be done as one matrix + // multiplication instead of a batched one, which should be + // faster. Another possibility would be to add a weights layout + // parameter and then support kBatchDepthYX for a different + // weights layout.
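+    // Concretely (illustrative shapes): with input depth 8 (so k == 8), output +    // spatial size 2x2, and output feature_map_count() == 16, weights is viewed +    // as 4 matrices of size 16x8. Each of the 4 GEMMs multiplies one of them by +    // the same 8xN input (N = batch count) and writes its 16xN result at row +    // offset i*16 of the 64-row output matrix, which reproduces the +    // kBatchYXDepth ordering of the output.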
+ if (output_dimensions.layout() != dnn::DataLayout::kBatchYXDepth && + !(output_dimensions.layout() == dnn::DataLayout::kBatchDepthYX && + output_dimensions.feature_map_count() == 1)) { + LOG(ERROR) << "Unsupported MatMul output layout."; + return false; + } + + const float alpha = 1.0f; // Take the matrix product without scaling it. + const float beta = 0.0f; // Ignore the original values in output_data. + const uint64 m = output_dimensions.feature_map_count(); + const uint64 n = input_dimensions.count(); + const uint64 k = input_dimensions.NodesAcrossFeatureMaps(); + const int lda = m; + const int ldb = k; + const int ldc = output_dimensions.NodesAcrossFeatureMaps(); + const int batch_count = output_dimensions.NodesPerFeatureMap(); + + std::vector<DeviceMemory<float>> a(batch_count); + std::vector<DeviceMemory<float>> b(batch_count); + std::vector<DeviceMemory<float>> c(batch_count); + for (int i = 0; i < batch_count; ++i) { + const int weights_offset = i * input_dimensions.NodesAcrossFeatureMaps() * + output_dimensions.feature_map_count(); + a[i] = DeviceMemory<float>::MakeFromByteSize( + const_cast<float*>(reinterpret_cast<const float*>(weights.opaque())) + + weights_offset, + weights.ElementCount() - weights_offset); + + b[i] = input_data; + + const int output_offset = i * output_dimensions.feature_map_count(); + c[i] = DeviceMemory<float>::MakeFromByteSize( + const_cast<float*>( + reinterpret_cast<const float*>(output_data->opaque())) + + output_offset, + output_data->ElementCount() - output_offset); + } + const auto toPtrs = [](std::vector<DeviceMemory<float>>& v) { + std::vector<DeviceMemory<float>*> ptrs; + for (auto& mem : v) { + ptrs.push_back(&mem); + } + return ptrs; + }; + + stream->ThenBlasGemmBatched(blas::Transpose::kNoTranspose, + blas::Transpose::kNoTranspose, m, n, k, alpha, + toPtrs(a), lda, toPtrs(b), ldb, beta, toPtrs(c), + ldc, batch_count); + } + + return stream->ok(); +} + +bool CudnnSupport::DoBiasAdd(Stream* stream, + const DeviceMemory<float>& input_data, + const DeviceMemory<float>& biases, + const dnn::BatchDescriptor& dimensions, + DeviceMemory<float>* output_data) { + ScopedTensorDescriptor input_descriptor{parent_, dimensions, + CUDNN_DATA_FLOAT}; + + BatchDescriptor bias_dimensions; + bias_dimensions.set_count(1) + .set_feature_map_count(dimensions.feature_map_count()) + .set_height(1) + .set_width(1) + .set_layout(dnn::DataLayout::kBatchYXDepth); + ScopedTensorDescriptor bias_descriptor{parent_, bias_dimensions, + CUDNN_DATA_FLOAT}; + + // cudnnAddTensor is in-place, so we need to copy input_data to + // output_data before doing the addition, unless the input and + // output are at the same address. 
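+  // With the CUDNN_ADD_SAME_C mode used below, the bias tensor holds one value +  // per feature map (the 1x1xC descriptor above) and cudnnAddTensor broadcasts +  // it across every batch element and spatial position of the output; e.g. 64 +  // bias floats cover a 32x28x28x64 NHWC activation (illustrative sizes).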
+ if (input_data.opaque() != output_data->opaque()) { + stream->ThenMemcpy(output_data, input_data, + dimensions.ElementCount() * sizeof(float)); + if (!stream->ok()) { + LOG(ERROR) + << "stream " << stream + << " could not enqueue a tensor copy as part of bias addition."; + return false; + } + } + + mutex_lock lock{dnn_handle_mutex_}; + + const float alpha = 1.0f; + const float beta = 1.0f; + auto status = dynload::cudnnAddTensor( + parent_, ToHandle(dnn_handle_), CUDNN_ADD_SAME_C, &alpha, + bias_descriptor.handle(), biases.opaque(), &beta, + input_descriptor.handle(), output_data->opaque()); + + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "stream " << stream << " could not enqueue bias addition."; + return false; + } + + return true; +} + +bool CudnnSupport::DoActivate(Stream* stream, + dnn::ActivationMode activation_mode, + const dnn::BatchDescriptor& dimensions, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + cudnnActivationMode_t mode; + switch (activation_mode) { + case dnn::ActivationMode::kRelu6: + // TODO(leary) should probably do a post-pass to clip at 6? + LOG(WARNING) << "user requested Relu6, but providing Relu instead"; + mode = CUDNN_ACTIVATION_RELU; + break; + case dnn::ActivationMode::kReluX: + // TODO(broune) should probably do a post-pass to clip at X? + LOG(WARNING) << "user requested ReluX, but providing Relu instead"; + mode = CUDNN_ACTIVATION_RELU; + break; + case dnn::ActivationMode::kRelu: + mode = CUDNN_ACTIVATION_RELU; + break; + case dnn::ActivationMode::kSigmoid: + mode = CUDNN_ACTIVATION_SIGMOID; + break; + case dnn::ActivationMode::kTanh: + mode = CUDNN_ACTIVATION_TANH; + break; + default: + LOG(ERROR) << "unrecognized activation mode: " + << static_cast<int>(activation_mode); + return false; + } + + ScopedTensorDescriptor input_4d{parent_, dimensions, CUDNN_DATA_FLOAT}; + // Alpha is the input scaling factor. + float alpha = 1.0; + // Beta is the output scaling factor. + float beta = 0.0; + status = dynload::cudnnActivationForward( + parent_, ToHandle(dnn_handle_), mode, &alpha, input_4d.handle(), + input_data.opaque(), &beta, input_4d.handle(), output_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "stream " << stream + << " could not enqueue activation: " << ToString(status); + return false; + } + + return true; +} + +bool CudnnSupport::DoPoolForward( + Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. 
+ float beta = 0.0; + + ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor dest_desc{parent_, output_dimensions, + CUDNN_DATA_FLOAT}; + ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions}; + status = dynload::cudnnPoolingForward( + parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha, + src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(), + output_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to enqueue forward pooling on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoPoolBackward( + Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory<float>& output_data, + const DeviceMemory<float>& input_diff_data, + DeviceMemory<float>* output_diff_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. + float beta = 0.0; + + ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor dest_desc{parent_, output_dimensions, + CUDNN_DATA_FLOAT}; + ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions}; + status = dynload::cudnnPoolingBackward( + parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha, + dest_desc.handle(), output_data.opaque(), dest_desc.handle(), + input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta, + src_desc.handle(), output_diff_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to enqueue backward pooling on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoNormalize( + Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, + const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) { + LOG(FATAL) << "not yet implemented"; // TODO(leary) +} + +bool CudnnSupport::DoDepthConcatenate( + Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + DeviceMemory<float>* output_data) { + LOG(FATAL) << "not yet implemented"; // TODO(leary) +} + +bool CudnnSupport::DoElementwiseOperate( + Stream* stream, dnn::ElementwiseOperation operation, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + LOG(FATAL) << "not yet implemented"; // TODO(leary) +} + +bool CudnnSupport::DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& gpu_unquantized_src, + port::MutableArraySlice<uint8> host_dst) { + LOG(ERROR) << "quantized memcpy not supported by cuDNN"; + return false; +} + +bool CudnnSupport::DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<uint16> host_dst) { + LOG(ERROR) << "quantized memcpy not supported by cuDNN"; + return false; +} + +bool CudnnSupport::DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& 
device_unquantized_src, + port::MutableArraySlice<int32> host_dst) { + LOG(ERROR) << "quantized memcpy not supported by cuDNN"; + return false; +} + +bool CudnnSupport::DoMemcpyH2DQuantized( + Stream* stream, port::ArraySlice<uint8> host_src, + DeviceMemory<float>* gpu_unquantized_dst) { + LOG(ERROR) << "quantized memcpy not supported by cuDNN"; + return false; +} + +bool CudnnSupport::DeriveOutputBatchDescriptor( + const BatchDescriptor& batch_descriptor, + const FilterDescriptor& filter_descriptor, + const dnn::ConvolutionDescriptor& convolution_descriptor, + dnn::BatchDescriptor* output_batch_descriptor) { + ScopedTensorDescriptor input_4d{parent_, batch_descriptor, CUDNN_DATA_FLOAT}; + ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT}; + ScopedConvolutionDescriptor conv{parent_, convolution_descriptor}; + + int dims[4]; + auto status = dynload::cudnnGetConvolutionNdForwardOutputDim( + parent_, conv.handle(), input_4d.handle(), filter.handle(), 4, dims); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not get output tensor for convolution: " + << ToString(status); + return false; + } + + output_batch_descriptor->set_count(dims[0]) + .set_feature_map_count(dims[1]) + .set_height(dims[2]) + .set_width(dims[3]) + .set_layout(batch_descriptor.layout()); + return true; +} + +} // namespace cuda + +namespace gpu = ::perftools::gputools; + +void initialize_cudnn() { + gpu::port::Status status = + gpu::PluginRegistry::Instance() + ->RegisterFactory<gpu::PluginRegistry::DnnFactory>( + gpu::cuda::kCudaPlatformId, gpu::cuda::kCuDnnPlugin, "cuDNN", + [](gpu::internal::StreamExecutorInterface* + parent) -> gpu::dnn::DnnSupport* { + gpu::cuda::CUDAExecutor* cuda_executor = + dynamic_cast<gpu::cuda::CUDAExecutor*>(parent); + if (cuda_executor == nullptr) { + LOG(ERROR) + << "Attempting to initialize an instance of the cuDNN " + << "support library with a non-CUDA StreamExecutor"; + return nullptr; + } + + gpu::cuda::CudnnSupport* dnn = + new gpu::cuda::CudnnSupport(cuda_executor); + if (!dnn->Init().ok()) { + // Note: Init() will log a more specific error. + delete dnn; + return nullptr; + } + return dnn; + }); + + if (!status.ok()) { + LOG(ERROR) << "Unable to register cuDNN factory: " + << status.error_message(); + } + + // Prime the cuDNN DSO. The loader will log more information. + auto statusor = gpu::internal::CachedDsoLoader::GetCudnnDsoHandle(); + if (!statusor.ok()) { + LOG(INFO) << "Unable to load cuDNN DSO."; + } + + gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId, + gpu::PluginKind::kDnn, + gpu::cuda::kCuDnnPlugin); +} + +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER(register_cudnn, + { perftools::gputools::initialize_cudnn(); }); diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h new file mode 100644 index 0000000000..08e952cee0 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_dnn.h @@ -0,0 +1,206 @@ +// The CUDA-specific DNN library support, implementing the general DnnSupport +// interface.
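+// +// Typical call path (sketch, assuming the Stream::ThenConvolve entry point +// declared in stream.h): user code invokes +//   stream->ThenConvolve(input_desc, input, filter_desc, weights, conv_desc, +//                        output_desc, &output); +// which is routed to CudnnSupport::DoConvolve once the cuDNN plugin is the +// registered DNN implementation for the CUDA platform.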
+ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_ + +#include "tensorflow/stream_executor/dnn.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/temporary_device_memory.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +class CUDAExecutor; + +// Opaque and unique identifer for the cuDNN plugin. +extern const PluginId kCuDnnPlugin; + +// cudnn-library based DNN support. For details on overridden interface +// functions, see dnn.h. +class CudnnSupport : public dnn::DnnSupport { + public: + explicit CudnnSupport(CUDAExecutor* parent); + ~CudnnSupport() override; + + port::Status Init() override; + + bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory<float>& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<float>* output_data) override; + + bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor, + const DeviceMemory<double>& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory<double>& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<double>* output_data) override; + + bool DoSeparableConvolve( + Stream* stream, const dnn::BatchDescriptor& batch_descriptor, + const DeviceMemory<float>& input_data, + const dnn::FilterDescriptor& filter_descriptor, int depth_multiplier, + const DeviceMemory<float>& first_weights, + const DeviceMemory<float>& second_weights, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<float>* output_data) override { + LOG(ERROR) << "separable convolution not supported by CUDNN"; + return false; + } + + bool DoConvolveBackwardData( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<float> backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory<float>* backward_input_data) override; + + bool DoConvolveBackwardFilter( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<float> backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory<float>* backward_filter_data) override; + + bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<float>& weights, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override; + + bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<int8>& quantized_weights, + const DeviceMemory<float>& weight_scales, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override { + 
LOG(ERROR) << "DNN MatMulQuantized not supported by CUDNN"; + return false; + } + + bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<int16>& quantized_weights, + const DeviceMemory<float>& weight_scales, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override { + LOG(ERROR) << "DNN MatMulQuantized not supported by CUDNN"; + return false; + } + + bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<float>& biases, + const dnn::BatchDescriptor& dimensions, + DeviceMemory<float>* output_data) override; + + bool DoActivate(Stream* stream, dnn::ActivationMode activation_mode, + const dnn::BatchDescriptor& dimensions, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) override; + + bool DoPoolForward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override; + + bool DoPoolBackward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory<float>& output_data, + const DeviceMemory<float>& input_diff_data, + DeviceMemory<float>* output_diff_data) override; + + bool DoNormalize(Stream* stream, + const dnn::NormalizeDescriptor& normalize_descriptor, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) override; + + bool DoDepthConcatenate( + Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + DeviceMemory<float>* output_data) override; + + bool DoElementwiseOperate( + Stream* stream, dnn::ElementwiseOperation operation, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override; + + bool DoMemcpyD2HQuantized(Stream* stream, + const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<uint8> host_dst) override; + + bool DoMemcpyD2HQuantized(Stream* stream, + const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<uint16> host_dst) override; + + bool DoMemcpyD2HQuantized(Stream* stream, + const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<int32> host_dst) override; + + bool DoMemcpyH2DQuantized( + Stream* stream, port::ArraySlice<uint8> host_src, + DeviceMemory<float>* device_unquantized_dst) override; + + // Derives an output batch descriptor from an input batch and convolution + // descriptors. + bool DeriveOutputBatchDescriptor( + const dnn::BatchDescriptor& batch_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + const dnn::ConvolutionDescriptor& convolution_descriptor, + dnn::BatchDescriptor* output_batch_descriptor); + + private: + // Guards the enqueueing of DNN operations via the dnn_handle_ below. + mutex dnn_handle_mutex_; + + CUDAExecutor* parent_; // Parent executor object. Not owned. + + // cudnn library handle. cudnnHandle_t type is not present in this header to + // prevent third-party library header inclusions from leaking outside the + // single cuda_dnn translation unit. 
+ void* dnn_handle_ GUARDED_BY(dnn_handle_mutex_); + + // NOTE(keveman): Temporary data layout transformation until cuDNN supports + // kBatchYXDepth for backward pass. This function allocates temporary memory, + // lays out the source data into the temporary but in the kBatchDepthYX + // layout, and returns the temporary memory. The caller is responsible for + // deallocating the temporary. Since the allocation is done using Stream's + // AllocateTemporaryMemory, a later BlockHostUntilDone could be used for + // deallocation. + // + // transform_scratch is populated with a legitimate temporary allocation iff + // the original output data needs to be transformed. + DeviceMemory<float> MaybeTransformLayout( + Stream* stream, dnn::BatchDescriptor* output_descriptor, + DeviceMemory<float> backward_output_data, + std::unique_ptr<TemporaryDeviceMemory<float>>* transform_scratch) + EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_); + + SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc new file mode 100644 index 0000000000..8c4316b4c1 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -0,0 +1,1608 @@ +#include "tensorflow/stream_executor/cuda/cuda_driver.h" + +#include <dlfcn.h> +#include <stdint.h> +#include <stdlib.h> +#include <set> +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/lib/casts.h" +#include "tensorflow/stream_executor/lib/env.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/human_readable.h" +#include "tensorflow/stream_executor/lib/notification.h" +#include "tensorflow/stream_executor/lib/threadpool.h" +#include "tensorflow/stream_executor/lib/stacktrace.h" +#include "tensorflow/stream_executor/lib/static_threadlocal.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/lib/inlined_vector.h" + +bool FLAGS_gpuexec_cuda_driver_inject_init_error = false; +bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false; +bool FLAGS_gpuexec_cuda_device_0_only = false; + +namespace perftools { +namespace gputools { +namespace cuda { + +namespace dynload { + +#define PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(__name) \ + struct DynLoadShim__##__name { \ + static const char *kName; \ + using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \ + static void *GetDsoHandle() { \ + static auto status = internal::CachedDsoLoader::GetLibcudaDsoHandle(); \ + return status.ValueOrDie(); \ + } \ + static FuncPointerT DynLoad() { \ + static void *f = dlsym(GetDsoHandle(), kName); \ + CHECK(f != nullptr) << "could not find " << kName \ + << " in libcuda DSO; dlerror: " << dlerror(); \ + return reinterpret_cast<FuncPointerT>(f); \ + } \ + template <typename... Args> \ + CUresult operator()(Args...
args) { \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxCreate_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxDestroy); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxEnablePeerAccess); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetCurrent); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetDevice); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetSharedMemConfig); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxPopCurrent_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetCurrent); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetSharedMemConfig); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSynchronize); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceComputeCapability); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceCanAccessPeer); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGet); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetAttribute); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetCount); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetName); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetPCIBusId); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetProperties); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceTotalMem); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDriverGetVersion); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventCreate); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventDestroy_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventElapsedTime); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventQuery); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventRecord); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncGetAttribute); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncSetCacheConfig); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorName); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorString); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuInit); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuLaunchKernel); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemAlloc_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoD_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoH_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoD_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoDAsync_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoHAsync_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoDAsync_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetAddressRange_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFree_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFreeHost); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetInfo_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostAlloc); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostRegister_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostUnregister); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32Async); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetFunction); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetGlobal_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadDataEx); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadFatBinary); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleUnload); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuOccupancyMaxActiveBlocksPerMultiprocessor); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuPointerGetAttribute); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamAddCallback); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamCreate); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamDestroy_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamQuery); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamSynchronize); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamWaitEvent); + +} // namespace dynload + +namespace { + +// Manages the singleton set of contexts that we've created. 
This is used for +// checking that no CUDA-runtime-created contexts have been generated +// accidentally. CUDA-runtime-created contexts are avoided, if triple angle +// brace launches are required, by using the scoped activations in +// cuda_activation.h. +class CreatedContexts { + public: + // Returns whether context is a member of the live set. + static bool Has(CUcontext context) { + shared_lock lock{mu_}; + return Live()->find(context) != Live()->end(); + } + + // Adds context to the live set. + static void Add(CUcontext context) { + CHECK(context != nullptr); + mutex_lock lock{mu_}; + Live()->emplace(context); + } + + // Removes context from the live set. + static void Remove(CUcontext context) { + CHECK(context != nullptr); + mutex_lock lock{mu_}; + Live()->erase(context); + } + + private: + // Returns the live set singleton. + static std::set<CUcontext> *Live() { + static auto singleton = new std::set<CUcontext>; + return singleton; + } + + // Lock that guards access-to/mutation-of the live set. + static mutex mu_; +}; + +/* static */ mutex CreatedContexts::mu_{LINKER_INITIALIZED}; + +// Formats CUresult to output prettified values into a log stream. +// Error summaries taken from: +// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9 +// +// TODO(leary) switch to cuGetErrorName when updated cuda.h is available. +string ToString(CUresult result) { +#define OSTREAM_CUDA_ERROR(__name) \ + case CUDA_ERROR_##__name: \ + return "CUDA_ERROR_" #__name; + +/////////////// +// NOTE: here we specify return code values outside of the enum explicitly +// because our in-tree cuda.h is from the CUDA 5.5 SDK, but CUDA 6.0+ driver +// libraries are deployed in the fleet these error codes are backwards +// compatible, but if we see a "new" one, we want to be able to identify it in +// the logs. +// +// Once we get a cuda.h that has cuGetErrorName (TODO is above) we can +// eliminate this function and just rely on the driver to provide us these +// strings. +// +// NOTE: "Must reboot all context" below is shorthand for, "must +// destroy/recreate the offending context and any allocation which come from +// it if you are to continue using CUDA." +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + switch (result) { + OSTREAM_CUDA_ERROR(INVALID_VALUE) + OSTREAM_CUDA_ERROR(OUT_OF_MEMORY) + OSTREAM_CUDA_ERROR(NOT_INITIALIZED) + OSTREAM_CUDA_ERROR(DEINITIALIZED) + OSTREAM_CUDA_ERROR(NO_DEVICE) + OSTREAM_CUDA_ERROR(INVALID_DEVICE) + OSTREAM_CUDA_ERROR(INVALID_IMAGE) + OSTREAM_CUDA_ERROR(INVALID_CONTEXT) + OSTREAM_CUDA_ERROR(INVALID_HANDLE) + OSTREAM_CUDA_ERROR(NOT_FOUND) + OSTREAM_CUDA_ERROR(NOT_READY) + OSTREAM_CUDA_ERROR(NO_BINARY_FOR_GPU) + + // Encountered an uncorrectable ECC error during execution. + OSTREAM_CUDA_ERROR(ECC_UNCORRECTABLE) + + // Load/store on an invalid address. Must reboot all context. + case 700: + return "CUDA_ERROR_ILLEGAL_ADDRESS"; + // Passed too many / wrong arguments, too many threads for register count. + case 701: + return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; + // Kernel took too long to execute. + case 702: + return "CUDA_ERROR_LAUNCH_TIMEOUT"; + // Kernel launch uses an incompatible texturing mode. + case 703: + return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; + // Trying to re-enable peer access that already has it enabled. + case 704: + return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; + // Trying to disable peer access that has not yet been enabled. 
+ case 705: + return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; + // Primary context for the specified device has already been initialized. + case 708: + return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; + // Context current to calling thread has been destroyed or is a primary + // context that has not yet been initialized. + case 709: + return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; + // Device-side assert triggered during kernel execution. Must reboot all + // context. + case 710: + return "CUDA_ERROR_ASSERT"; + // Hardware resources to enable peer access have been exhausted. + case 711: + return "CUDA_ERROR_TOO_MANY_PEERS"; + // Memory range has already been registered. + case 712: + return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; + // Pointer does not correspond to any currently registered memory region. + case 713: + return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; + // Due to stack corruption or exceeding stack size limit. Must reboot all + // context. + case 714: + return "CUDA_ERROR_HARDWARE_STACK_ERROR"; + case 715: + return "CUDA_ERROR_ILLEGAL_INSTRUCTION"; + // Load/store on an unaligned memory address. Must reboot all context. + case 716: + return "CUDA_ERROR_MISALIGNED_ADDRESS"; + // Device instruction with specific address space given address not + // belonging to allowed address space. Must reboot all context. + case 717: + return "CUDA_ERROR_INVALID_ADDRESS_SPACE"; + // Device program counter wrapped its address space. Must reboot all + // context. + case 718: + return "CUDA_ERROR_INVALID_PC"; + // Exception on device while executing a kernel; e.g. deref invalid device + // pointer, accessing OOB shared memory. Must reboot all context. + case 719: + return "CUDA_ERROR_LAUNCH_FAILED"; + + OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE) + OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED) + OSTREAM_CUDA_ERROR(NOT_PERMITTED) + OSTREAM_CUDA_ERROR(NOT_SUPPORTED) + OSTREAM_CUDA_ERROR(UNKNOWN) // Unknown internal error to CUDA. + default: + return port::StrCat("CUresult(", static_cast<int>(result), ")"); + } +#pragma GCC diagnostic pop +} + +// Returns the current context and checks that it is in the set of CUDA contexts +// created by StreamExecutor (to ensure that the CUDA runtime didn't create a +// context behind our backs). +CUcontext CurrentContext() { + CUcontext current = nullptr; + CUresult result = dynload::cuCtxGetCurrent(¤t); + if (result != CUDA_SUCCESS) { + LOG(FATAL) << "failed to query current context: " << ToString(result); + } + if (current != nullptr && !CreatedContexts::Has(current)) { + LOG(FATAL) << "current context was not created by the StreamExecutor " + "cuda_driver API: " + << current + << "; a CUDA runtime call " + "was likely performed without using a StreamExecutor context"; + } + return current; +} + +// "Pops" the current context, checks that it matches expected, and checks the +// postcondition that the current context is nullptr. +// +// This is not done when we're nested within a MultiOpActivation, as we want to +// persist the active context until the MultiOpActivation is popped. 
+void PopContextAndCheckNowNull(CUcontext expected) { + CUcontext actual = CurrentContext(); + CHECK_EQ(expected, actual) << "would pop unexpected context"; + CUcontext popped; + CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxPopCurrent_v2(&popped)); + CHECK_EQ(expected, popped); + CHECK(nullptr == CurrentContext()); + VLOG(3) << "popped context " << expected + << " and current context is now null"; +} + +// CUDA driver routines may require a large amount of stack (particularly +// cuModuleLoadDataEx, in our experience). To avoid stack overflow when using +// stack-limited threads (such as those spawned by a default-argument +// thread::ThreadPool on some platforms), we run certain routines in this pool +// and wait for completion. +static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED); +static port::ThreadPool *InitializeDriverExecutor() { + return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(), + "cuda_driver", 1); +} + +port::ThreadPool *GetDriverExecutor() { + mutex_lock lock(driver_executor_threadpool_mu); + static port::ThreadPool *thread_pool = InitializeDriverExecutor(); + return thread_pool; +} + +} // namespace + + +// Thread-local storage that indicates whether a CUDA context activation is +// being nested within an outer, MultiOpActivation. In that case, we should not +// pop the context to nullptr when we are done with the current activation. +SE_STATIC_THREAD_LOCAL_POD(bool, tls_in_multi_op_activation); + +string MemorySpaceString(MemorySpace memory_space) { + switch (memory_space) { + case MemorySpace::kHost: + return "host"; + case MemorySpace::kDevice: + return "device"; + default: + LOG(FATAL) << "impossible memory space"; + } +} + +// Implementation note: the CUDA context is held, per-thread, in TLS. We avoid +// setting all the time because it's not clear what side effects might occur for +// a "set" operation, whereas a "get" operation we can reasonably assume is a +// TLS read. +// +// We cannot race here because CUcontext is associated with a particular thread +// and stored in TLS; and these interfaces should not be used from signal +// handlers. 
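+// +// Usage sketch (illustrative): driver work is bracketed as +//   { +//     ScopedActivateContext activation{context}; +//     // ... driver calls against `context` go here ... +//   } +// and the previous activation state is restored when the scope exits (see the +// destructor below for the MultiOpActivation case).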
+ScopedActivateContext::ScopedActivateContext(CUcontext context, + MultiOpActivation moa) + : context_(CHECK_NOTNULL(context)), + previously_in_multi_op_activation_(tls_in_multi_op_activation.get()) { + if (static_cast<bool>(moa)) { + tls_in_multi_op_activation.get() = true; + } + + CUcontext current = prior_context_ = CurrentContext(); + if (current != context) { + VLOG(3) << "ScopedActivateContext switching context from " << current + << " to " << context; + CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(context)); + if (FLAGS_gpuexec_cuda_sync_around_driver_calls) { + auto res = dynload::cuCtxSynchronize(); + if (res != CUDA_SUCCESS) { + LOG(FATAL) << "gpuexec_cuda_sync_around_driver_calls found " + << ToString(res) + << " immediately after establishing the device context " + << context << " :: " << port::CurrentStackTrace(); + } + } + } +} + +ScopedActivateContext::~ScopedActivateContext() { + if (tls_in_multi_op_activation.get()) { + CHECK_EQ(context_, CurrentContext()); + if (FLAGS_gpuexec_cuda_sync_around_driver_calls) { + auto res = dynload::cuCtxSynchronize(); + if (res != CUDA_SUCCESS) { + LOG(FATAL) << "gpuexec_cuda_sync_around_driver_calls found " + << ToString(res) + << " immediately after de-establishing the device context " + << context_ << " :: " << port::CurrentStackTrace(); + } + } + CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(prior_context_)); + } else { + PopContextAndCheckNowNull(context_); + } + tls_in_multi_op_activation.get() = previously_in_multi_op_activation_; +} + +namespace { + +// Returns a stringified device number associated with pointer, primarily for +// logging purposes. Returns "?" if the device could not be successfully +// queried. +string CUDAPointerToDeviceString(CUdeviceptr pointer) { + auto value = CUDADriver::GetPointerDevice(pointer); + if (value.ok()) { + return port::StrCat(value.ValueOrDie()); + } + LOG(ERROR) << "could not query device: " << value.status(); + return "?"; +} + +// Returns a stringified memory space associated with pointer, primarily for +// logging purposes. Returns "?" if the memory space could not be successfully +// queried. +string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) { + auto value = CUDADriver::GetPointerMemorySpace(pointer); + if (value.ok()) { + return MemorySpaceString(value.ValueOrDie()); + } + LOG(ERROR) << "could not query device: " << value.status(); + return "?"; +} + +// Returns a stringified representation of whether or not peer access is +// permitted between the "from" and "to" pointers' associated contexts, +// primarily for logging purposes. Returns "error" if an error is encountered +// in the process of querying. +string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) { + auto from_context = CUDADriver::GetPointerContext(from); + if (!from_context.ok()) { + LOG(ERROR) << "could not retrieve source pointer's context: " + << from_context.status(); + return "error"; + } + auto to_context = CUDADriver::GetPointerContext(to); + if (!to_context.ok()) { + LOG(ERROR) << "could not retrieve destination pointer's context: " + << to_context.status(); + return "error"; + } + return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(), + to_context.ValueOrDie()) + ? "true" + : "false"; +} + + +// Actually performs the work of CUDA initialization. Wrapped up in one-time +// execution guard. 
+static port::Status InternalInit() { + CUresult res = CUDA_ERROR_NO_DEVICE; + if (FLAGS_gpuexec_cuda_driver_inject_init_error) { + LOG(ERROR) << "injecting CUDA init error; initialization will fail"; + } else if (internal::CachedDsoLoader::GetLibcudaDsoHandle().ok()) { + // We only call cuInit if we can dynload libcuda. + + res = dynload::cuInit(0 /* = flags */); + } + + if (res == CUDA_SUCCESS) { + return port::Status::OK(); + } + + LOG(ERROR) << "failed call to cuInit: " << ToString(res); + Diagnostician::LogDiagnosticInformation(); + return port::Status{port::error::ABORTED, + port::StrCat("failed call to cuInit: ", ToString(res))}; +} + +} // namespace + +/* static */ port::Status CUDADriver::Init() { + // Cached return value from calling InternalInit(), as cuInit need only be + // called once, but CUDADriver::Init may be called many times. + static port::Status init_retval; + static bool set = false; + static mutex init_mu(LINKER_INITIALIZED); + + mutex_lock lock(init_mu); + if (!set) { + init_retval = InternalInit(); + set = true; + } + + return init_retval; +} + +/* static */ port::Status CUDADriver::GetDevice(int device_ordinal, + CUdevice *device) { + CUresult res = dynload::cuDeviceGet(device, device_ordinal); + if (res == CUDA_SUCCESS) { + return port::Status::OK(); + } + + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed call to cuDeviceGet: ", ToString(res))}; +} + +/* static */ bool CUDADriver::GetDeviceName(CUdevice device, + string *device_name) { + static const size_t kCharLimit = 64; + port::InlinedVector<char, 4> chars(kCharLimit); + CUresult res = + dynload::cuDeviceGetName(chars.begin(), kCharLimit - 1, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to get device name for " << device << ": " + << ToString(res); + return false; + } + chars[kCharLimit - 1] = '\0'; + *device_name = chars.begin(); + return true; +} + +bool DeviceOptionsToContextFlags(DeviceOptions device_options, int *flags) { + static_assert(DeviceOptions::kMask == 0xf, + "needs update for new device options"); + + if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) { + *flags |= CU_CTX_LMEM_RESIZE_TO_MAX; + } + + // If no flags are set the default is CU_CTX_SCHED_AUTO, which + // in Google environments is very likely to mean SPIN. 
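+  // For example, kScheduleBlockingSync below maps to CU_CTX_SCHED_BLOCKING_SYNC, +  // which (per the CUDA driver API documentation) lets a host thread sleep +  // inside cuCtxSynchronize()/cuStreamSynchronize() rather than spin-wait.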
+ if (device_options.flags() & DeviceOptions::kScheduleSpin) { + *flags |= CU_CTX_SCHED_SPIN; + } + if (device_options.flags() & DeviceOptions::kScheduleYield) { + *flags |= CU_CTX_SCHED_YIELD; + } + if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) { + *flags |= CU_CTX_SCHED_BLOCKING_SYNC; + } + + return true; +} + +/* static */ port::Status CUDADriver::CreateContext( + CUdevice device, DeviceOptions device_options, CUcontext *context) { + CUcontext former_context = CurrentContext(); + if (former_context != nullptr) { + LOG(WARNING) << "creating context when one is currently active; existing: " + << former_context; + } + + int flags = 0; + if (!DeviceOptionsToContextFlags(device_options, &flags)) { + LOG(WARNING) << "could not convert all device options into context flags"; + } + + CUresult res; + { + // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their + // context creation: see http://b/13248943 + + res = dynload::cuCtxCreate_v2(context, flags, device); + } + if (res == CUDA_SUCCESS) { + CreatedContexts::Add(*context); + PopContextAndCheckNowNull(*context); + CHECK(*context != nullptr) + << "success in this call must entail non-null result"; + VLOG(2) << "created context " << context << " for this thread"; + return port::Status::OK(); + } + + string message = "failed call to cuCtxCreate: " + ToString(res); + if (res == CUDA_ERROR_OUT_OF_MEMORY) { + uint64 total_memory; + if (GetDeviceTotalMemory(device, &total_memory)) { + port::StrAppend(&message, "; total memory reported: ", total_memory); + } else { + port::StrAppend(&message, "; could not query total memory"); + } + } + + return port::Status{port::error::INTERNAL, message}; +} + +/* static */ void CUDADriver::DestroyContext(CUcontext context) { + if (context == nullptr) { + return; + } + + CUresult res = dynload::cuCtxDestroy_v2(context); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to destroy CUDA context; leaking: " << ToString(res); + } + + CreatedContexts::Remove(context); +} + +/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute, + CUfunction func, + int *attribute_value) { + CUresult res = dynload::cuFuncGetAttribute(attribute_value, attribute, func); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query kernel attribute. kernel: " << func + << ", attribute: " << attribute; + return false; + } + return true; +} + +/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function, + CUfunc_cache cache_config) { + CUresult res = dynload::cuFuncSetCacheConfig(function, cache_config); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function + << ", config: " << cache_config << ", result: " << ToString(res); + return false; + } + + return true; +} + +/* static */ port::StatusOr<CUsharedconfig> +CUDADriver::ContextGetSharedMemConfig(CUcontext context) { + CUsharedconfig shared_mem_config; + ScopedActivateContext activation{context}; + CUresult result = dynload::cuCtxGetSharedMemConfig(&shared_mem_config); + if (result != CUDA_SUCCESS) { + CUdevice device; + dynload::cuCtxGetDevice(&device); + LOG(ERROR) << "failed to get CUDA device shared memory config. 
" + << "Context device ID: " << device + << ", result: " << ToString(result); + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to get shared memory config: ", ToString(result))}; + } + return shared_mem_config; +} + +/* static */ port::Status CUDADriver::ContextSetSharedMemConfig( + CUcontext context, CUsharedconfig shared_mem_config) { + ScopedActivateContext activation{context}; + CUresult result = dynload::cuCtxSetSharedMemConfig(shared_mem_config); + if (result != CUDA_SUCCESS) { + CUdevice device; + dynload::cuCtxGetDevice(&device); + LOG(ERROR) << "failed to set CUDA device shared memory config. " + << "Context device ID: " << device + << ", config: " << shared_mem_config + << ", result: " << ToString(result); + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to set shared memory config: ", ToString(result))}; + } + return port::Status::OK(); +} + +/* static */ bool CUDADriver::LaunchKernel( + CUcontext context, CUfunction function, unsigned int grid_dim_x, + unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x, + unsigned int block_dim_y, unsigned int block_dim_z, + unsigned int shared_mem_bytes, CUstream stream, void **kernel_params, + void **extra) { + ScopedActivateContext activation{context}; + VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x + << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z + << " bdx: " << block_dim_x << " bdy: " << block_dim_y + << " bdz: " << block_dim_z; + CUresult res = dynload::cuLaunchKernel( + function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y, + block_dim_z, shared_mem_bytes, stream, kernel_params, extra); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to launch CUDA kernel: " << function + << "; result: " << ToString(res); + return false; + } + VLOG(2) << "successfully launched kernel"; + return true; +} + +/* static */ port::Status CUDADriver::LoadCubin(CUcontext context, + const char *cubin_bytes, + CUmodule *module) { + ScopedActivateContext activation{context}; + CUresult result = dynload::cuModuleLoadFatBinary(module, cubin_bytes); + if (result != CUDA_SUCCESS) { + return port::Status{port::error::INTERNAL, + "failed to load in-memory CUBIN: " + ToString(result)}; + } + + return port::Status::OK(); +} + +/* static */ bool CUDADriver::LoadPtx(CUcontext context, + const char *ptx_contents, + CUmodule *module) { + port::Notification notification; + bool ret = true; + GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret, + ¬ification]() { + ScopedActivateContext activation{context}; + void *ptx_data = const_cast<char *>(ptx_contents); + static const unsigned int kLogBufferBytesLimit = 1024; + unsigned int error_log_buffer_bytes = kLogBufferBytesLimit; + unsigned int info_log_buffer_bytes = kLogBufferBytesLimit; + port::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes); + port::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes); + bool log_verbose = true; + CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE}; + // Note that the driver API wants the contents of this values to be stored + // in an array of void*s, so we coerce them accordingly. 
+    void *option_values[] = {
+        port::bit_cast<void *>(uintptr_t(error_log_buffer_bytes)),
+        port::bit_cast<void *>(error_log_buffer.data()),
+        port::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
+        port::bit_cast<void *>(info_log_buffer.data()),
+        port::bit_cast<void *>(uintptr_t(log_verbose))};
+    CHECK(ARRAYSIZE(options) == ARRAYSIZE(option_values));
+
+    CUresult res;
+    {
+      // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
+      // module loading: see http://b/13248943
+
+      res = dynload::cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options),
+                                        options, option_values);
+    }
+
+    // The PTX JIT mutates the values in the option values array to reflect the
+    // size of the logs it outputs; now that we've made the call, read the
+    // values back out.
+    error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
+    info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
+    CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
+    CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);
+
+    if (res != CUDA_SUCCESS) {
+      LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
+      // As a precaution for null termination of the API-provided value, ensure
+      // that at least the last byte is null.
+      error_log_buffer[error_log_buffer_bytes ?
+                       error_log_buffer_bytes - 1 : 0] = '\0';
+      LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
+                 << " bytes): " << error_log_buffer.data();
+      ret = false;
+      notification.Notify();
+      // Return early so the notification is not signaled a second time below.
+      return;
+    }
+
+    VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
+            << " bytes): " << info_log_buffer.data();
+    VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
+            << " bytes): " << error_log_buffer.data();
+    CHECK(module != nullptr);
+    notification.Notify();
+  });
+  notification.WaitForNotification();
+
+  return ret;
+}
+
+/* static */ bool CUDADriver::SynchronousMemsetUint8(CUcontext context,
+                                                     CUdeviceptr location,
+                                                     uint8 value, size_t size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemsetD8_v2(location, value, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemsetUint32(CUcontext context,
+                                                      CUdeviceptr location,
+                                                      uint32 value,
+                                                      size_t uint32_count) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemsetD32_v2(location, value, uint32_count);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool CUDADriver::AsynchronousMemsetUint32(CUcontext context,
+                                                       CUdeviceptr location,
+                                                       uint32 value,
+                                                       size_t uint32_count,
+                                                       CUstream stream) {
+  ScopedActivateContext activation{context};
+  CUresult res =
+      dynload::cuMemsetD32Async(location, value, uint32_count, stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memset operation";
+  return true;
+}
+
+/* static */ bool CUDADriver::AddStreamCallback(CUcontext context,
+                                                CUstream stream,
+                                                StreamCallback callback,
+                                                void *data) {
+  // Note: flags param is required to be zero according to CUDA 6.0.
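+  // Illustrative sketch (editorial addition, not part of the original
+  // change): the callback must match the StreamCallback signature and must
+  // not call back into the CUDA API, e.g.:
+  //
+  //   void HostCallback(CUstream stream, CUresult status, void *data) {
+  //     // Signal a hypothetical waiter; no CUDA calls are allowed in here.
+  //     static_cast<port::Notification *>(data)->Notify();
+  //   }
+  //
+  //   CUDADriver::AddStreamCallback(context, stream, HostCallback, &done);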
+ CUresult res = + dynload::cuStreamAddCallback(stream, callback, data, 0 /* = flags */); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "unable to add host callback: " << ToString(res); + return false; + } + return true; +} + +/* static */ bool CUDADriver::GetModuleFunction(CUcontext context, + CUmodule module, + const char *kernel_name, + CUfunction *function) { + ScopedActivateContext activated{context}; + CHECK(module != nullptr && kernel_name != nullptr); + CUresult res = dynload::cuModuleGetFunction(function, module, kernel_name); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name + << "\" from module: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::GetModuleSymbol(CUcontext context, + CUmodule module, + const char *symbol_name, + CUdeviceptr *dptr, + size_t *bytes) { + ScopedActivateContext activated{context}; + CHECK(module != nullptr && symbol_name != nullptr && + (dptr != nullptr || bytes != nullptr)); + CUresult res = + dynload::cuModuleGetGlobal_v2(dptr, bytes, module, symbol_name); + if (res != CUDA_SUCCESS) { + // symbol may not be found in the current module, but it may reside in + // another module. + VLOG(2) << "failed to get symbol \"" << symbol_name + << "\" from module: " << ToString(res); + return false; + } + + return true; +} + +/* static */ void CUDADriver::UnloadModule(CUcontext context, CUmodule module) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuModuleUnload(module); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to unload module " << module + << "; leaking: " << ToString(res); + } +} + +/* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext( + CUcontext context) { + ScopedActivateContext activated{context}; + CUdevice device = -1; + CUresult result = dynload::cuCtxGetDevice(&device); + if (result == CUDA_SUCCESS) { + return device; + } + + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to get device for context: ", ToString(result))}; +} + +/* static */ bool CUDADriver::CreateStream(CUcontext context, CUstream *out) { + // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess + // up synchronization with respect to memsets and any other things that have + // to occur on the default stream? 
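+  // Editorial aside (sketch only, not part of the original change): making
+  // the stream non-blocking would amount to passing CU_STREAM_NON_BLOCKING
+  // instead of 0 below, e.g.
+  //   dynload::cuStreamCreate(out, CU_STREAM_NON_BLOCKING);
+  // but see the TODO above about synchronization with the default stream.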
+ ScopedActivateContext activated{context}; + CUresult res = dynload::cuStreamCreate(out, 0); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not allocate CUDA stream for context " << context + << ": " << ToString(res); + return false; + } + + VLOG(2) << "successfully created stream " << *out << " for context " + << context << " on thread"; + return true; +} + +/* static */ void CUDADriver::DestroyStream(CUcontext context, + CUstream *stream) { + if (*stream == nullptr) { + return; + } + + ScopedActivateContext activated{context}; + CUresult res = dynload::cuStreamDestroy_v2(*stream); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to destroy CUDA stream for context " << context + << ": " << ToString(res); + } else { + VLOG(2) << "successfully destroyed stream " << *stream << " for context " + << context; + *stream = nullptr; + } +} + +/* static */ void *CUDADriver::DeviceAllocate(CUcontext context, uint64 bytes) { + ScopedActivateContext activated{context}; + CUdeviceptr result = 0; + CUresult res = dynload::cuMemAlloc_v2(&result, bytes); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to allocate " + << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes + << " bytes) from device: " << ToString(res); + return nullptr; + } + void *ptr = reinterpret_cast<void *>(result); + VLOG(2) << "allocated " << ptr << " for context " << context << " of " + << bytes << " bytes"; + return ptr; +} + +/* static */ void CUDADriver::DeviceDeallocate(CUcontext context, + void *location) { + ScopedActivateContext activation{context}; + CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location); + CUresult res = dynload::cuMemFree_v2(pointer); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to free device memory at " << location + << "; result: " << ToString(res); + } else { + VLOG(2) << "deallocated " << location << " for context " << context; + } +} + +/* static */ void *CUDADriver::HostAllocate(CUcontext context, uint64 bytes) { + ScopedActivateContext activation{context}; + void *host_mem = nullptr; + // "Portable" memory is visible to all CUDA contexts. Safe for our use model. + CUresult res = + dynload::cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to alloc " << bytes + << " bytes on host: " << ToString(res); + } + return host_mem; +} + +/* static */ void CUDADriver::HostDeallocate(CUcontext context, + void *location) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuMemFreeHost(location); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "error deallocating host memory at " << location << ": " + << ToString(res); + } +} + +/* static */ bool CUDADriver::HostRegister(CUcontext context, void *location, + uint64 bytes) { + ScopedActivateContext activation{context}; + // "Portable" memory is visible to all CUDA contexts. Safe for our use model. 
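+  // Illustrative pairing (editorial sketch, not part of the original change):
+  // a registration is normally matched with a later HostUnregister on the
+  // same pointer once any asynchronous copies using it have completed, e.g.
+  //
+  //   if (CUDADriver::HostRegister(context, buf, len)) {
+  //     // ... enqueue AsynchronousMemcpyH2D/D2H using buf ...
+  //     CUDADriver::HostUnregister(context, buf);
+  //   }
+  //
+  // (buf and len are hypothetical caller-owned values.)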
+ CUresult res = + dynload::cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "error registering host memory at " << location << ": " + << ToString(res); + return false; + } + return true; +} + +/* static */ bool CUDADriver::HostUnregister(CUcontext context, + void *location) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuMemHostUnregister(location); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "error unregistering host memory at " << location << ": " + << ToString(res); + return false; + } + return true; +} + +/* static */ port::Status CUDADriver::DestroyEvent(CUcontext context, + CUevent *event) { + if (*event == nullptr) { + return port::Status{port::error::INVALID_ARGUMENT, + "input event cannot be null"}; + } + + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventDestroy_v2(*event); + *event = nullptr; + + switch (res) { + case CUDA_SUCCESS: + return port::Status::OK(); + case CUDA_ERROR_DEINITIALIZED: + case CUDA_ERROR_NOT_INITIALIZED: + return port::Status{ + port::error::FAILED_PRECONDITION, + port::Printf("error destroying CUDA event in context %p: %s", context, + ToString(res).c_str())}; + default: + return port::Status{ + port::error::INTERNAL, + port::Printf("error destroying CUDA event in context %p: %s", context, + ToString(res).c_str())}; + } +} + +/* static */ port::Status CUDADriver::RecordEvent(CUcontext context, + CUevent event, + CUstream stream) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventRecord(event, stream); + switch (res) { + case CUDA_SUCCESS: + return port::Status::OK(); + case CUDA_ERROR_DEINITIALIZED: + case CUDA_ERROR_NOT_INITIALIZED: + return port::Status{ + port::error::FAILED_PRECONDITION, + port::Printf("error recording CUDA event on stream %p: %s", stream, + ToString(res).c_str())}; + default: + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("error recording CUDA event on stream %p: %s", stream, + ToString(res).c_str())}; + } +} + +/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(CUcontext context, + CUevent event) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventQuery(event); + if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) { + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to query event: %s", ToString(res).c_str())}; + } + + return res; +} + +/* static */ bool CUDADriver::GetEventElapsedTime(CUcontext context, + float *elapsed_milliseconds, + CUevent start, CUevent stop) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to get elapsed time between events: " + << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::WaitStreamOnEvent(CUcontext context, + CUstream stream, + CUevent event) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuStreamWaitEvent(stream, event, 0 /* = flags */); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not wait stream on event: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::SynchronizeContext(CUcontext context) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuCtxSynchronize(); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res) + << " :: " << port::CurrentStackTrace(); + 
return false;
+  }
+
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronizeStream(CUcontext context,
+                                                CUstream stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  CUresult res = dynload::cuStreamSynchronize(stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "could not synchronize on CUDA stream: " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
+    return false;
+  }
+  VLOG(2) << "successfully synchronized stream " << stream << " on context "
+          << context;
+  return true;
+}
+
+/* static */ bool CUDADriver::IsStreamIdle(CUcontext context, CUstream stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  CUresult res = dynload::cuStreamQuery(stream);
+  if (res == CUDA_SUCCESS) {
+    return true;
+  }
+
+  if (res != CUDA_ERROR_NOT_READY) {
+    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
+  }
+  return false;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyD2H(CUcontext context,
+                                                   void *host_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyDtoH_v2(host_dst, gpu_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to synchronously memcpy from device to host: %s; "
+        "host dst: %p; GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src), size, size);
+    return false;
+  }
+  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
+          << host_dst;
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyH2D(CUcontext context,
+                                                   CUdeviceptr gpu_dst,
+                                                   const void *host_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyHtoD_v2(gpu_dst, host_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to synchronously memcpy from host to device: %s; GPU dst: %p;"
+        " host src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size, size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyD2D(CUcontext context,
+                                                   CUdeviceptr gpu_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyDtoD_v2(gpu_dst, gpu_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to synchronously memcpy from device to device: %s; "
+        "GPU dst: %p; GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst),
+        port::bit_cast<void *>(gpu_src), size, size);
+    return false;
+  }
+  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ bool CUDADriver::AsynchronousMemcpyD2H(CUcontext context,
+                                                    void *host_dst,
+                                                    CUdeviceptr gpu_src,
+                                                    uint64 size,
+                                                    CUstream stream) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyDtoHAsync_v2(host_dst, gpu_src, size, stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
+        "GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src), size, size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
+          << " bytes from " << port::bit_cast<void *>(gpu_src) << " to " << host_dst
+          << " on stream " << stream;
+  return true;
+}
+
+/* static */ bool
CUDADriver::AsynchronousMemcpyH2D(CUcontext context, + CUdeviceptr gpu_dst, + const void *host_src, + uint64 size, + CUstream stream) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuMemcpyHtoDAsync_v2(gpu_dst, host_src, size, stream); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << port::Printf( + "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; " + "host src: %p; size: %llu=0x%llx", + ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size, size); + return false; + } + VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes" + << " on stream " << stream; + return true; +} + +/* static */ bool CUDADriver::AsynchronousMemcpyD2D(CUcontext context, + CUdeviceptr gpu_dst, + CUdeviceptr gpu_src, + uint64 size, + CUstream stream) { + ScopedActivateContext activation{context}; + CUresult result = + dynload::cuMemcpyDtoDAsync_v2(gpu_dst, gpu_src, size, stream); + if (result != CUDA_SUCCESS) { + LOG(ERROR) << port::Printf( + "failed to enqueue async memcpy from device to device: %s" + "; GPU dst: %p on %s %s" + "; GPU src: %p on %s %s" + "; can access? %s; size: %llu=0x%llx", + ToString(result).c_str(), port::bit_cast<void *>(gpu_dst), + CUDAPointerToMemorySpaceString(gpu_dst).c_str(), + CUDAPointerToDeviceString(gpu_dst).c_str(), port::bit_cast<void *>(gpu_src), + CUDAPointerToMemorySpaceString(gpu_src).c_str(), + CUDAPointerToDeviceString(gpu_src).c_str(), + CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size); + + return false; + } + VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes"; + return true; +} + +/* static */ port::Status CUDADriver::CreateEvent(CUcontext context, + CUevent *result, + EventFlags flags) { + int cuflags; + switch (flags) { + case EventFlags::kDefault: + cuflags = CU_EVENT_DEFAULT; + break; + case EventFlags::kDisableTiming: + cuflags = CU_EVENT_DISABLE_TIMING; + break; + default: + LOG(FATAL) << "impossible event flags: " << int(flags); + } + + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventCreate(result, cuflags); + + if (res == CUDA_SUCCESS) { + return port::Status::OK(); + } else if (res == CUDA_ERROR_OUT_OF_MEMORY) { + return port::Status{port::error::RESOURCE_EXHAUSTED, + "could not create CUDA event: out of device memory"}; + } else { + return port::Status{ + port::error::FAILED_PRECONDITION, + port::StrCat("could not create CUDA event: ", ToString(res))}; + } +} + +/* static */ int CUDADriver::GetDeviceCount() { + int device_count = 0; + CUresult res = dynload::cuDeviceGetCount(&device_count); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res); + return 0; + } + + if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) { + device_count = 1; + } + return device_count; +} + +/* static */ port::StatusOr<CUcontext> CUDADriver::GetPointerContext( + CUdeviceptr pointer) { + CUcontext context = nullptr; + CUresult result = dynload::cuPointerGetAttribute( + &context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer); + if (result == CUDA_SUCCESS) { + CHECK(context != nullptr) << "success should entail non-null context"; + return context; + } + + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to query device pointer for context: ", + ToString(result))}; +} + +/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace( + CUdeviceptr pointer) { + unsigned int value; + CUresult result = dynload::cuPointerGetAttribute( + &value, 
CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer); + if (result == CUDA_SUCCESS) { + switch (value) { + case CU_MEMORYTYPE_DEVICE: + return MemorySpace::kDevice; + case CU_MEMORYTYPE_HOST: + return MemorySpace::kHost; + default: + return port::Status{ + port::error::INTERNAL, + port::StrCat("unknown memory space provided by CUDA API: ", value)}; + } + } + + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to query device pointer for memory space: ", + ToString(result))}; +} + +/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr, + CUdeviceptr *base, + size_t *size) { + CUresult result = dynload::cuMemGetAddressRange(base, size, dptr); + if (result == CUDA_SUCCESS) { + return port::Status::OK(); + } else if (result == CUDA_ERROR_NOT_FOUND) { + // We differentiate between "this pointer is unknown" (return here) and + // "there was an internal error while performing this operation" (return + // below). + return port::Status{ + port::error::NOT_FOUND, + port::Printf("not a device pointer %p; %s", + reinterpret_cast<void *>(dptr), ToString(result).c_str())}; + } + + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to get pointer into for device pointer %p; %s", + reinterpret_cast<void *>(dptr), ToString(result).c_str())}; +} + +/* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice( + CUdeviceptr pointer) { + auto result = GetPointerContext(pointer); + if (!result.ok()) { + return result.status(); + } + + return DeviceFromContext(result.ValueOrDie()); +} + +/* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major, + int *cc_minor, + CUdevice device) { + *cc_major = 0; + *cc_minor = 0; + CUresult result = + dynload::cuDeviceComputeCapability(cc_major, cc_minor, device); + if (result == CUDA_SUCCESS) { + return port::Status::OK(); + } + + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to get compute capability for device: %s; %d", + ToString(result).c_str(), device)}; +} + +// Helper function that turns the integer output of cuDeviceGetAttribute to type +// T and wraps it in a StatusOr. 
+template <typename T> +static port::StatusOr<T> GetSimpleAttribute(CUdevice device, + CUdevice_attribute attribute) { + int value = -1; + CUresult result = dynload::cuDeviceGetAttribute(&value, attribute, device); + if (result != CUDA_SUCCESS) { + return port::Status{ + port::error::NOT_FOUND, + port::StrCat("could not retrieve CUDA device attribute (", attribute, + "): ", ToString(result))}; + } + T converted = value; + return converted; +} + +/* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount( + CUdevice device) { + return GetSimpleAttribute<int>(device, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore( + CUdevice device) { + return GetSimpleAttribute<int64>( + device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock( + CUdevice device) { + return GetSimpleAttribute<int64>( + device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor( + CUdevice device) { + return GetSimpleAttribute<int64>( + device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock( + CUdevice device) { + return GetSimpleAttribute<int64>(device, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock( + CUdevice device) { + return GetSimpleAttribute<int64>(device, + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp( + CUdevice device) { + return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE); +} + +/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z, + CUdevice device) { + int value; + CUresult res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query max grid dim x: " << ToString(res); + return false; + } + *x = value; + + res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query max grid dim y: " << ToString(res); + return false; + } + *y = value; + + res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query max grid dim z: " << ToString(res); + return false; + } + *z = value; + return true; +} + +/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) { + CUresult res = dynload::cuDriverGetVersion(driver_version); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query driver version: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::GetDeviceProperties(CUdevprop *device_properties, + int device_ordinal) { + CUresult res = + dynload::cuDeviceGetProperties(device_properties, device_ordinal); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query device properties: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) { + int value = -1; + CUresult res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query ECC status: " << ToString(res); + return false; + } + + *result = value; + return true; +} + 
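+// Illustrative call pattern for the StatusOr-returning attribute getters above
+// (editorial sketch, not part of the original change):
+//
+//   port::StatusOr<int> sm_count = CUDADriver::GetMultiprocessorCount(device);
+//   if (sm_count.ok()) {
+//     VLOG(1) << "multiprocessor count: " << sm_count.ValueOrDie();
+//   } else {
+//     LOG(WARNING) << sm_count.status();
+//   }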
+/* static */ bool CUDADriver::GetDeviceMemoryInfo(CUcontext context, + int64 *free_out, + int64 *total_out) { + ScopedActivateContext activation{context}; + size_t free = 0; + size_t total = 0; + CUresult res = dynload::cuMemGetInfo_v2(&free, &total); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query device memory info: " << ToString(res); + return false; + } + + *free_out = free; + *total_out = total; + return true; +} + +/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device, + uint64 *result) { + size_t value = -1; + CUresult res = dynload::cuDeviceTotalMem_v2(&value, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query total available memory: " << ToString(res); + return false; + } + + *result = value; + return true; +} + +/* static */ string CUDADriver::GetPCIBusID(CUdevice device) { + string pci_bus_id; + static const int kBufferSize = 64; + port::InlinedVector<char, 4> chars(kBufferSize); + chars[kBufferSize - 1] = '\0'; + CUresult res = + dynload::cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res); + return pci_bus_id; + } + pci_bus_id = chars.begin(); + return pci_bus_id; +} + +/* static */ bool CUDADriver::CanEnablePeerAccess(CUcontext from, + CUcontext to) { + if (from == to) { + return true; // A context can always access its own memory. + } + + int can_access_peer = -1; + auto from_device = DeviceFromContext(from); + if (!from_device.ok()) { + LOG(ERROR) << "failed to resolve 'from' peer access context to a device: " + << from_device.status(); + return false; + } + auto to_device = DeviceFromContext(to); + if (!to_device.ok()) { + LOG(ERROR) << "failed to resolve 'to' peer access context to a device: " + << to_device.status(); + return false; + } + CUresult res = dynload::cuDeviceCanAccessPeer( + &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie()); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to detect peer access capability: " << ToString(res); + return false; + } + + return can_access_peer; +} + +/* static */ port::Status CUDADriver::EnablePeerAccess(CUcontext from, + CUcontext to) { + if (from == to) { + return port::Status::OK(); // A context can always access its own memory. 
+ } + + ScopedActivateContext activated{from}; + CUresult result = dynload::cuCtxEnablePeerAccess(to, 0 /* = flags */); + if (result != CUDA_SUCCESS && + result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) { + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to enable peer access from %p to %p: %s", from, to, + ToString(result).c_str())}; + } + + return port::Status::OK(); +} + +/* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore( + CUcontext context, CUfunction kernel, int threads_per_block, + size_t dynamic_shared_memory_bytes) { + ScopedActivateContext activation{context}; + + int max_blocks; + CUresult result = dynload::cuOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes); + if (result != CUDA_SUCCESS) { + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to calculate occupancy of kernel %p: %s", kernel, + ToString(result).c_str())}; + } + + return max_blocks; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h new file mode 100644 index 0000000000..007db222d9 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_driver.h @@ -0,0 +1,460 @@ +// CUDA userspace driver library wrapper functionality. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_ + +#include <stddef.h> +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/cuda/multi_op_activation.h" +#include "tensorflow/stream_executor/device_options.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "third_party/gpus/cuda/include/cuda.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// Identifies the memory space where an allocation resides. See +// CUDADriver::GetPointerMemorySpace(). +enum class MemorySpace { kHost, kDevice }; + +// Returns a casual string, such as "host" for the provided memory space. +string MemorySpaceString(MemorySpace memory_space); + +// CUDADriver contains wrappers for calls to the userspace library driver. It's +// useful to isolate these calls and put basic wrappers around them to separate +// userspace library driver behaviors from the rest of the program. +// +// At the moment it's simply used as a namespace. +// +// The calls log any specific errors internally and return whether the operation +// was successful to the caller. +// +// The order of parameters is generally kept symmetric with the underlying CUDA +// driver API. +// +// Links on functions are to specific documentation under +// http://docs.nvidia.com/cuda/cuda-driver-api/ +// +// Thread safety: these functions should not be used from signal handlers. +class CUDADriver { + public: + // Wraps a call to cuInit with logging to help indicate what has gone wrong in + // the case of failure. Safe to call multiple times; will be fast on all calls + // after the first. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3 + static port::Status Init(); + + // Returns the device associated with the given context. + // device is an outparam owned by the caller, must not be null. 
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e + static port::StatusOr<CUdevice> DeviceFromContext(CUcontext context); + + // Creates a new CUDA stream associated with the given context via + // cuStreamCreate. + // stream is an outparam owned by the caller, must not be null. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4 + static bool CreateStream(CUcontext context, CUstream *stream); + + // Destroys a CUDA stream associated with the given context. + // stream is owned by the caller, must not be null, and *stream is set to null + // if the stream is successfuly destroyed. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758 + static void DestroyStream(CUcontext context, CUstream *stream); + + // CUDA events can explicitly disable event TSC retrieval for some presumed + // performance improvement if timing is unnecessary. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db + enum class EventFlags { kDefault, kDisableTiming }; + + // Creates a new event associated with the given context. + // result is an outparam owned by the caller and must not be null. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db + static port::Status CreateEvent(CUcontext context, CUevent *result, + EventFlags flags); + + // Destroys *event and turns it into a nullptr. event may not be null, but + // *event may be, via cuEventDestroy + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef + static port::Status DestroyEvent(CUcontext context, CUevent *event); + + // Allocates a GPU memory space of size bytes associated with the given + // context via cuMemAlloc. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467 + static void *DeviceAllocate(CUcontext context, uint64 bytes); + + // Deallocates a GPU memory space of size bytes associated with the given + // context via cuMemFree. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a + static void DeviceDeallocate(CUcontext context, void *location); + + // Allocates page-locked and CUDA-registered memory on the host via + // cuMemAllocHost. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0 + static void *HostAllocate(CUcontext context, uint64 bytes); + + // Deallocates a location created by HostAllocate, via cuMemFreeHost. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c + static void HostDeallocate(CUcontext context, void *location); + + // Registers a memory region at location of size bytes via cuMemHostRegister. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223 + static bool HostRegister(CUcontext context, void *location, uint64 bytes); + + // Unregisters a memory region that was previously registered at location via + // cuMemHostUnregister. 
+ // + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14 + // + // TODO(leary) verify an error will be returned if the location wasn't + // previously registered. + static bool HostUnregister(CUcontext context, void *location); + + // Given a device ordinal, returns a device handle into the device outparam, + // which must not be null. + // + // N.B. these device handles do not have a corresponding destroy function in + // the CUDA driver API. + static port::Status GetDevice(int device_ordinal, CUdevice *device); + + // Given a device handle, returns the name reported by the driver for the + // device. + static bool GetDeviceName(CUdevice device, string *name_out); + + // Given a device to create a context for, returns a context handle into the + // context outparam, which must not be null. + // + // N.B. CUDA contexts are weird. They are implicitly associated with the + // calling thread. Current documentation on contexts and their influence on + // userspace processes is given here: + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf + static port::Status CreateContext(CUdevice device, + DeviceOptions device_options, + CUcontext *context); + + // Destroys the provided context via cuCtxDestroy. + // Don't do this while clients could still be using the context, per the docs + // bad things will happen. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e + static void DestroyContext(CUcontext context); + + // Queries the runtime for the specified attribute of the specified function. + // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates + // in terms of integer-sized values, so there's no potential for overrun (as + // of CUDA 5.5). + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b + static bool FuncGetAttribute(CUfunction_attribute attribute, + CUfunction function, int *attribute_value); + + // Sets the preferred cache configuration for the specified function. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681 + static bool FuncSetCacheConfig(CUfunction function, + CUfunc_cache cache_config); + + // Gets the preferred shared memory bank configuration for the specified + // CONTEXT (not function!), either default or four- or eight-byte bank size. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74 + static port::StatusOr<CUsharedconfig> ContextGetSharedMemConfig( + CUcontext context); + + // Sets the preferred shared memory bank configuration for the specified + // CONTEXT (not function!), either default or four- or eight-byte bank size. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692 + static port::Status ContextSetSharedMemConfig( + CUcontext context, CUsharedconfig shared_mem_config); + + // Launches a CUDA kernel via cuLaunchKernel. + // TODO(leary) describe the structure of kernel_params and extra in a readable + // way. 
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15 + static bool LaunchKernel(CUcontext context, CUfunction function, + unsigned int grid_dim_x, unsigned int grid_dim_y, + unsigned int grid_dim_z, unsigned int block_dim_x, + unsigned int block_dim_y, unsigned int block_dim_z, + unsigned int shared_mem_bytes, CUstream stream, + void **kernel_params, void **extra); + + // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting + // handle in "module". Any error logs that are produced are logged internally. + static bool LoadPtx(CUcontext context, const char *ptx_contents, + CUmodule *module); + + // Loads cubin_bytes with the CUDA driver's blob loading interface and stores + // the resulting handle in "module". + static port::Status LoadCubin(CUcontext context, const char *cubin_bytes, + CUmodule *module); + + // Retrieves a named kernel from a loaded module, and places the resulting + // handle into function (outparam) on success. Neither kernel_name nor + // function may be null. No ownership is taken of kernel_name. + static bool GetModuleFunction(CUcontext context, CUmodule module, + const char *kernel_name, CUfunction *function); + + // Retrieves a named global/constant symbol from a loaded module, and returns + // a device pointer and size of the symbol on success. symbol_name may not be + // null. At least one of dptr or bytes should not be null. No ownership is + // taken of symbol_name. + static bool GetModuleSymbol(CUcontext context, CUmodule module, + const char *symbol_name, CUdeviceptr *dptr, + size_t *bytes); + + // Unloads module from the current context via cuModuleUnload. + // TODO(leary) the documentation doesn't say what kind of disasters happen + // if you try to unload a module while its CUfunctions are in use. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b + static void UnloadModule(CUcontext context, CUmodule module); + + // Performs a synchronous memset of the device memory segment via cuMemsetD8. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b + static bool SynchronousMemsetUint8(CUcontext context, CUdeviceptr location, + uint8 value, size_t size); + + // Performs a synchronous memset of the device memory segment via cuMemsetD32. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132 + static bool SynchronousMemsetUint32(CUcontext context, CUdeviceptr location, + uint32 value, size_t uint32_count); + + // Performs an asynchronous memset of the device memory segment via + // cuMemsetD32Async. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5 + static bool AsynchronousMemsetUint32(CUcontext context, CUdeviceptr location, + uint32 value, size_t uint32_count, + CUstream stream); + + // -- Synchronous memcopies. 
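+  // Illustrative round trip using the memcpy calls declared below together
+  // with DeviceAllocate/DeviceDeallocate above (editorial sketch, not part of
+  // the original header):
+  //
+  //   CUdeviceptr dst = reinterpret_cast<CUdeviceptr>(
+  //       CUDADriver::DeviceAllocate(context, size));
+  //   CUDADriver::SynchronousMemcpyH2D(context, dst, host_src, size);
+  //   CUDADriver::SynchronousMemcpyD2H(context, host_dst, dst, size);
+  //   CUDADriver::DeviceDeallocate(context, reinterpret_cast<void *>(dst));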
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169 + + static bool SynchronousMemcpyD2H(CUcontext context, void *host_dst, + CUdeviceptr gpu_src, uint64 size); + static bool SynchronousMemcpyH2D(CUcontext context, CUdeviceptr gpu_dst, + const void *host_src, uint64 size); + static bool SynchronousMemcpyD2D(CUcontext context, CUdeviceptr gpu_dst, + CUdeviceptr gpu_src, uint64 size); + + // -- Asynchronous memcopies. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362 + + static bool AsynchronousMemcpyD2H(CUcontext context, void *host_dst, + CUdeviceptr gpu_src, uint64 size, + CUstream stream); + static bool AsynchronousMemcpyH2D(CUcontext context, CUdeviceptr gpu_dst, + const void *host_src, uint64 size, + CUstream stream); + static bool AsynchronousMemcpyD2D(CUcontext context, CUdeviceptr gpu_dst, + CUdeviceptr gpu_src, uint64 size, + CUstream stream); + + // The CUDA stream callback type signature. + // The data passed to AddStreamCallback is subsequently passed to this + // callback when it fires. + // + // Some notable things: + // * Callbacks must not make any CUDA API calls. + // * Callbacks from independent streams execute in an undefined order and may + // be serialized. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483 + typedef void (*StreamCallback)(CUstream stream, CUresult status, void *data); + + // Enqueues a callback operation into stream. + // See StreamCallback above and the NVIDIA documentation for additional + // details. + static bool AddStreamCallback(CUcontext context, CUstream stream, + StreamCallback callback, void *data); + + // Causes stream to wait for event to trigger before proceeding via + // cuStreamWaitEvent. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM + static bool WaitStreamOnEvent(CUcontext context, CUstream stream, + CUevent event); + + // Blocks the calling thread until the operations enqueued onto stream have + // been completed, via cuStreamSynchronize. + // + // TODO(leary) if a pathological thread enqueues operations onto the stream + // while another thread blocks like this, can you wind up waiting an unbounded + // amount of time? + // + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad + static bool SynchronizeStream(CUcontext context, CUstream stream); + + // Blocks the calling thread until the operations associated with the context + // have been completed, via cuCtxSynchronize. + // + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616 + static bool SynchronizeContext(CUcontext context); + + // Returns true if all stream tasks have completed at time of the call. Note + // the potential for races around this call (if another thread adds work to + // the stream immediately after this returns). + static bool IsStreamIdle(CUcontext context, CUstream stream); + + // Returns whether code in the from context can access memory in the to + // context via cuDeviceCanAccessPeer. 
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e + static bool CanEnablePeerAccess(CUcontext from, CUcontext to); + + // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a + static port::Status EnablePeerAccess(CUcontext from, CUcontext to); + + // Returns the elapsed milliseconds between start and stop via + // cuEventElapsedTime. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97 + static bool GetEventElapsedTime(CUcontext context, + float *elapsed_milliseconds, CUevent start, + CUevent stop); + + // Records that an event occurred when execution reaches the current point in + // thestream via cuEventRecord. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1 + static port::Status RecordEvent(CUcontext context, CUevent event, + CUstream stream); + + // Polls (without blocking) to determine the status of an event - pending or + // complete (or an error status). + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef + static port::StatusOr<CUresult> QueryEvent(CUcontext context, CUevent event); + + // -- Pointer-specific calls. + + // Returns the context in which pointer was allocated or registered. + static port::StatusOr<CUcontext> GetPointerContext(CUdeviceptr pointer); + + // Returns the device associated with the context from GetPointerContext(). + static port::StatusOr<CUdevice> GetPointerDevice(CUdeviceptr pointer); + + // Returns the memory space addressed by pointer. + static port::StatusOr<MemorySpace> GetPointerMemorySpace(CUdeviceptr pointer); + + // Returns the base address and size of the device pointer dptr. + static port::Status GetPointerAddressRange(CUdeviceptr dptr, + CUdeviceptr *base, size_t *size); + + // -- Device-specific calls. + + // Returns the compute capability for the device; i.e (3, 5). + // This is currently done via the deprecated device API. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea + static port::Status GetComputeCapability(int *cc_major, int *cc_minor, + CUdevice device); + + // Returns the number of multiprocessors on the device (note that the device + // may be multi-GPU-per-board). + static port::StatusOr<int> GetMultiprocessorCount(CUdevice device); + + // Returns the limit on number of threads that can be resident in a single + // multiprocessor. + static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(CUdevice device); + + // Returns the limit on number of threads which may be resident for a single + // block (cooperative thread array). + static port::StatusOr<int64> GetMaxThreadsPerBlock(CUdevice device); + + // Returns the amount of shared memory available on a single GPU core (i.e. + // SM on NVIDIA devices). + static port::StatusOr<int64> GetMaxSharedMemoryPerCore(CUdevice device); + + // Returns the amount of shared memory available for a single block + // (cooperative thread array). + static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(CUdevice device); + + // Returns the maximum supported number of registers per block. 
+  static port::StatusOr<int64> GetMaxRegistersPerBlock(CUdevice device);
+
+  // Returns the number of threads per warp.
+  static port::StatusOr<int64> GetThreadsPerWarp(CUdevice device);
+
+  // Queries the grid limits for device with cuDeviceGetAttribute calls.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool GetGridLimits(int *x, int *y, int *z, CUdevice device);
+
+  // Returns a grab-bag of device properties in a caller-owned device_properties
+  // structure for device_ordinal via cuDeviceGetProperties.
+  // This call is deprecated in the NVIDIA driver API.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
+  static bool GetDeviceProperties(CUdevprop *device_properties,
+                                  int device_ordinal);
+
+  // Returns whether ECC is enabled for the given CUdevice via
+  // cuDeviceGetAttribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool IsEccEnabled(CUdevice device, bool *result);
+
+  // Returns the total amount of memory available for allocation by the CUDA
+  // context, in bytes, via cuDeviceTotalMem.
+  static bool GetDeviceTotalMemory(CUdevice device, uint64 *result);
+
+  // Returns the free amount of memory and total amount of memory, as reported
+  // by cuMemGetInfo.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
+  static bool GetDeviceMemoryInfo(CUcontext context, int64 *free, int64 *total);
+
+  // Returns a PCI bus id string for the device.
+  // [domain]:[bus]:[device].[function]
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
+  static string GetPCIBusID(CUdevice device);
+
+  // -- Context- and device-independent calls.
+
+  // Returns the number of visible CUDA devices via cuDeviceGetCount.
+  // This should correspond to the set of device ordinals available.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
+  static int GetDeviceCount();
+
+  // Returns the driver version number via cuDriverGetVersion.
+  // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
+  // instead, the CUDA toolkit release number that this driver is compatible
+  // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
+  // compatible driver).
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
+  static bool GetDriverVersion(int *driver_version);
+
+  // -- Other calls
+
+  // Returns the maximum number of blocks (per multiprocessor) occupied by the
+  // specified kernel/CUfunction when launched with the specified parameters.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
+  static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
+      CUcontext context, CUfunction kernel, int threads_per_block,
+      size_t dynamic_shared_memory_bytes);
+
+  // Seam for injecting an error at CUDA initialization time for testing
+  // purposes.
+  static bool driver_inject_init_error_;
+};
+
+// Ensures a context is activated within a scope.
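+// Illustrative usage (editorial sketch, not part of the original header):
+//
+//   void RunOnContext(CUcontext context) {
+//     ScopedActivateContext activated{context};
+//     // Driver calls made in this scope see `context` as the current
+//     // context; the prior context is restored (or checked) when
+//     // `activated` goes out of scope.
+//   }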
+class ScopedActivateContext { + public: + // Activates the context via cuCtxSetCurrent, if it is not the currently + // active context (a la cuCtxGetCurrent). Note the alternative push/pop + // mechanism is said by NVIDIA to be relatively slow and deprecated. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7 + explicit ScopedActivateContext( + CUcontext context, MultiOpActivation moa = MultiOpActivation::kNo); + + // Checks that the context has remained activated for the duration of the + // scope. + ~ScopedActivateContext(); + + private: + CUcontext context_; // context being activated. + + CUcontext prior_context_; // context that was active when we were activated. + + // Stores whether this was instantiated during a MultiOpActivation, in which + // case we will not pop the context when we're destroyed (we will leave it to + // the parent MultiOpActivation that we were nested within). + bool previously_in_multi_op_activation_; +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_event.cc b/tensorflow/stream_executor/cuda/cuda_event.cc new file mode 100644 index 0000000000..a87c868c6b --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_event.cc @@ -0,0 +1,56 @@ +#include "tensorflow/stream_executor/cuda/cuda_event.h" + +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +CUDAEvent::CUDAEvent(CUDAExecutor* parent) + : parent_(parent), cuda_event_(nullptr) {} + +CUDAEvent::~CUDAEvent() {} + +port::Status CUDAEvent::Init() { + return CUDADriver::CreateEvent(parent_->cuda_context(), &cuda_event_, + CUDADriver::EventFlags::kDisableTiming); +} + +port::Status CUDAEvent::Destroy() { + return CUDADriver::DestroyEvent(parent_->cuda_context(), &cuda_event_); +} + +port::Status CUDAEvent::Record(CUDAStream* stream) { + return CUDADriver::RecordEvent(parent_->cuda_context(), cuda_event_, + stream->cuda_stream()); +} + +Event::Status CUDAEvent::PollForStatus() { + port::StatusOr<CUresult> status = + CUDADriver::QueryEvent(parent_->cuda_context(), cuda_event_); + if (!status.ok()) { + LOG(ERROR) << "Error polling for event status: " + << status.status().error_message(); + return Event::Status::kError; + } + + switch (status.ValueOrDie()) { + case CUDA_SUCCESS: + return Event::Status::kComplete; + case CUDA_ERROR_NOT_READY: + return Event::Status::kPending; + default: + LOG(INFO) << "Error condition returned for event status: " + << status.ValueOrDie(); + return Event::Status::kError; + } +} + +const CUevent& CUDAEvent::cuda_event() { + return cuda_event_; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_event.h b/tensorflow/stream_executor/cuda/cuda_event.h new file mode 100644 index 0000000000..c5b65662db --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_event.h @@ -0,0 +1,49 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_ + +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/lib/status.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// CUDAEvent 
wraps a CUevent in the platform-independent EventInterface +// interface. +class CUDAEvent : public internal::EventInterface { + public: + explicit CUDAEvent(CUDAExecutor* parent); + + ~CUDAEvent() override; + + // Populates the CUDA-platform-specific elements of this object. + port::Status Init(); + + // Deallocates any platform-specific elements of this object. This is broken + // out (not part of the destructor) to allow for error reporting. + port::Status Destroy(); + + // Inserts the event at the current position into the specified stream. + port::Status Record(CUDAStream* stream); + + // Polls the CUDA platform for the event's current status. + Event::Status PollForStatus(); + + // The underyling CUDA event element. + const CUevent& cuda_event(); + + private: + // The Executor used to which this object and CUevent are bound. + CUDAExecutor* parent_; + + // The underlying CUDA event element. + CUevent cuda_event_; +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc new file mode 100644 index 0000000000..59c3159895 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_fft.cc @@ -0,0 +1,327 @@ +#include "tensorflow/stream_executor/cuda/cuda_fft.h" + +#include <dlfcn.h> + +#include <complex> + +#include "tensorflow/stream_executor/cuda/cuda_activation.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/cuda/cuda_helpers.h" +#include "tensorflow/stream_executor/cuda/cuda_platform.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin); + +namespace dynload { + +// This macro wraps a global identifier, given by __name, in a callable +// structure that loads the DLL symbol out of the DSO handle in a thread-safe +// manner on first use. This dynamic loading technique is used to avoid DSO +// dependencies on vendor libraries which may or may not be available in the +// deployed binary environment. +#define PERFTOOLS_GPUTOOLS_CUFFT_WRAP(__name) \ + struct DynLoadShim__##__name { \ + static const char *kName; \ + using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \ + static void *GetDsoHandle() { \ + static auto status = internal::CachedDsoLoader::GetCufftDsoHandle(); \ + return status.ValueOrDie(); \ + } \ + static FuncPointerT DynLoad() { \ + static void *f = dlsym(GetDsoHandle(), kName); \ + CHECK(f != nullptr) << "could not find " << kName \ + << " in cuFFT DSO; dlerror: " << dlerror(); \ + return reinterpret_cast<FuncPointerT>(f); \ + } \ + template <typename... Args> \ + cufftResult operator()(CUDAExecutor * parent, Args... 
args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +#define CUFFT_ROUTINE_EACH(__macro) \ + __macro(cufftDestroy) __macro(cufftSetStream) __macro(cufftPlan1d) \ + __macro(cufftPlan2d) __macro(cufftPlan3d) __macro(cufftPlanMany) \ + __macro(cufftExecD2Z) __macro(cufftExecZ2D) __macro(cufftExecC2C) \ + __macro(cufftExecC2R) __macro(cufftExecZ2Z) \ + __macro(cufftExecR2C) + +CUFFT_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUFFT_WRAP) + +} // namespace dynload + +namespace { + +// A helper function transforming gpu_fft arguments into cuFFT arguments. +cufftType CUDAFftType(fft::Type type) { + switch (type) { + case fft::Type::kC2CForward: + case fft::Type::kC2CInverse: + return CUFFT_C2C; + case fft::Type::kC2R: + return CUFFT_C2R; + case fft::Type::kR2C: + return CUFFT_R2C; + case fft::Type::kZ2ZForward: + case fft::Type::kZ2ZInverse: + return CUFFT_Z2Z; + case fft::Type::kZ2D: + return CUFFT_Z2D; + case fft::Type::kD2Z: + return CUFFT_D2Z; + default: + LOG(FATAL) << "Invalid value of fft::Type."; + } +} + +// Associates the given stream with the given cuFFT plan. +bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) { + auto ret = dynload::cufftSetStream(parent, plan, AsCUDAStreamValue(stream)); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to run cuFFT routine cufftSetStream: " << ret; + return false; + } + return true; +} + +} // namespace + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, fft::Type type) + : parent_(parent), fft_type_(type) { + auto ret = dynload::cufftPlan1d(parent, &plan_, num_x, CUDAFftType(type), + 1 /* = batch */); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT 1d plan:" << ret; + } +} + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, + fft::Type type) + : parent_(parent), fft_type_(type) { + auto ret = + dynload::cufftPlan2d(parent, &plan_, num_x, num_y, CUDAFftType(type)); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT 2d plan:" << ret; + } +} + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, + uint64 num_z, fft::Type type) + : parent_(parent), fft_type_(type) { + auto ret = dynload::cufftPlan3d(parent, &plan_, num_x, num_y, num_z, + CUDAFftType(type)); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT 3d plan:" << ret; + } +} + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, int rank, uint64 *elem_count, + uint64 *input_embed, uint64 input_stride, + uint64 input_distance, uint64 *output_embed, + uint64 output_stride, uint64 output_distance, + fft::Type type, int batch_count) + : parent_(parent), fft_type_(type) { + int elem_count_[3], input_embed_[3], output_embed_[3]; + for (int i = 0; i < rank; ++i) { + elem_count_[i] = elem_count[i]; + if (input_embed) { + input_embed_[i] = input_embed[i]; + } + if (output_embed) { + output_embed_[i] = output_embed[i]; + } + } + auto ret = dynload::cufftPlanMany( + parent, &plan_, rank, elem_count_, input_embed ? input_embed_ : nullptr, + input_stride, input_distance, output_embed ? 
output_embed_ : nullptr, + output_stride, output_distance, CUDAFftType(type), batch_count); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT batched plan:" << ret; + } +} + +CUDAFftPlan::~CUDAFftPlan() { dynload::cufftDestroy(parent_, plan_); } + +int CUDAFftPlan::GetFftDirection() const { + switch (fft_type_) { + case fft::Type::kC2CForward: + case fft::Type::kZ2ZForward: + case fft::Type::kR2C: + case fft::Type::kD2Z: + return CUFFT_FORWARD; + case fft::Type::kC2CInverse: + case fft::Type::kZ2ZInverse: + case fft::Type::kC2R: + case fft::Type::kZ2D: + return CUFFT_INVERSE; + default: + LOG(FATAL) << "Invalid value of fft::Type."; + } +} + +std::unique_ptr<fft::Plan> CUDAFft::Create1dPlan(Stream *stream, uint64 num_x, + fft::Type type, + bool in_place_fft) { + std::unique_ptr<fft::Plan> plan{new CUDAFftPlan(parent_, num_x, type)}; + return plan; +} + +std::unique_ptr<fft::Plan> CUDAFft::Create2dPlan(Stream *stream, uint64 num_x, + uint64 num_y, fft::Type type, + bool in_place_fft) { + std::unique_ptr<fft::Plan> plan{new CUDAFftPlan(parent_, num_x, num_y, type)}; + return plan; +} + +std::unique_ptr<fft::Plan> CUDAFft::Create3dPlan(Stream *stream, uint64 num_x, + uint64 num_y, uint64 num_z, + fft::Type type, + bool in_place_fft) { + std::unique_ptr<fft::Plan> plan{ + new CUDAFftPlan(parent_, num_x, num_y, num_z, type)}; + return plan; +} + +std::unique_ptr<fft::Plan> CUDAFft::CreateBatchedPlan( + Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed, + uint64 input_stride, uint64 input_distance, uint64 *output_embed, + uint64 output_stride, uint64 output_distance, fft::Type type, + bool in_place_fft, int batch_count) { + std::unique_ptr<fft::Plan> plan{new CUDAFftPlan( + parent_, rank, elem_count, input_embed, input_stride, input_distance, + output_embed, output_stride, output_distance, type, batch_count)}; + return plan; +} + +template <typename FuncT, typename InputT, typename OutputT> +bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output) { + CUDAFftPlan *cuda_fft_plan = dynamic_cast<CUDAFftPlan *>(plan); + if (cuda_fft_plan == nullptr) { + LOG(ERROR) << "the passed-in plan is not a CUDAFftPlan object."; + return false; + } + + if (!SetStream(parent_, cuda_fft_plan->GetPlan(), stream)) { + return false; + } + + auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(), + CUDAComplex(const_cast<InputT *>(CUDAMemory(input))), + CUDAComplex(CUDAMemoryMutable(output))); + + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to run cuFFT routine: " << ret; + return false; + } + + return true; +} + +template <typename FuncT, typename InputT, typename OutputT> +bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan, + FuncT cufftExec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output) { + CUDAFftPlan *cuda_fft_plan = dynamic_cast<CUDAFftPlan *>(plan); + if (cuda_fft_plan == nullptr) { + LOG(ERROR) << "the passed-in plan is not a CUDAFftPlan object."; + return false; + } + + if (!SetStream(parent_, cuda_fft_plan->GetPlan(), stream)) { + return false; + } + + auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(), + CUDAComplex(const_cast<InputT *>(CUDAMemory(input))), + CUDAComplex(CUDAMemoryMutable(output)), + cuda_fft_plan->GetFftDirection()); + + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to run cuFFT routine: " << ret; + return false; + } + + return true; +} + +#define PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(__type, 
__fft_type1, __fft_type2, \ + __fft_type3) \ + bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<std::complex<__type>> &input, \ + DeviceMemory<std::complex<__type>> *output) { \ + return DoFftWithDirectionInternal( \ + stream, plan, dynload::cufftExec##__fft_type1, input, output); \ + } \ + bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<__type> &input, \ + DeviceMemory<std::complex<__type>> *output) { \ + return DoFftInternal(stream, plan, dynload::cufftExec##__fft_type2, input, \ + output); \ + } \ + bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<std::complex<__type>> &input, \ + DeviceMemory<__type> *output) { \ + return DoFftInternal(stream, plan, dynload::cufftExec##__fft_type3, input, \ + output); \ + } + +PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(float, C2C, R2C, C2R) +PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D) + +#undef PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +namespace gpu = ::perftools::gputools; + +REGISTER_MODULE_INITIALIZER(register_cufft, { + gpu::port::Status status = + gpu::PluginRegistry::Instance() + ->RegisterFactory<gpu::PluginRegistry::FftFactory>( + gpu::cuda::kCudaPlatformId, gpu::cuda::kCuFftPlugin, "cuFFT", + [](gpu::internal::StreamExecutorInterface + *parent) -> gpu::fft::FftSupport * { + gpu::cuda::CUDAExecutor *cuda_executor = + dynamic_cast<gpu::cuda::CUDAExecutor *>(parent); + if (cuda_executor == nullptr) { + LOG(ERROR) + << "Attempting to initialize an instance of the cuFFT " + << "support library with a non-CUDA StreamExecutor"; + return nullptr; + } + + return new gpu::cuda::CUDAFft(cuda_executor); + }); + if (!status.ok()) { + LOG(ERROR) << "Unable to register cuFFT factory: " + << status.error_message(); + } + + // Prime the cuFFT DSO. The loader will log more information. + auto statusor = gpu::internal::CachedDsoLoader::GetCufftDsoHandle(); + if (!statusor.ok()) { + LOG(INFO) << "Unable to load cuFFT DSO."; + } + + gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId, + gpu::PluginKind::kFft, + gpu::cuda::kCuFftPlugin); +}); diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h new file mode 100644 index 0000000000..2577c2952e --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_fft.h @@ -0,0 +1,95 @@ +// CUDA-specific support for FFT functionality -- this wraps the cuFFT library +// capabilities, and is only included into CUDA implementation code -- it will +// not introduce cuda headers into other code. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_ + +#include "tensorflow/stream_executor/fft.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "third_party/gpus/cuda/include/cufft.h" + +namespace perftools { +namespace gputools { + +class Stream; + +namespace cuda { + +class CUDAExecutor; + +// Opaque and unique indentifier for the cuFFT plugin. +extern const PluginId kCuFftPlugin; + +class CUDAFftPlan : public fft::Plan { + public: + // Constructor creating 1d FFT plan. + CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, fft::Type type); + // Constructor creating 2d FFT plan. + CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, fft::Type type); + // Constructor creating 3d FFT plan. 
+ CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, uint64 num_z, + fft::Type type); + // Constructor creating batched FFT plan. + CUDAFftPlan(CUDAExecutor *parent, int rank, uint64 *elem_count, + uint64 *input_embed, uint64 input_stride, uint64 input_distance, + uint64 *output_embed, uint64 output_stride, + uint64 output_distance, fft::Type type, int batch_count); + ~CUDAFftPlan() override; + + // Get FFT direction in cuFFT based on FFT type. + int GetFftDirection() const; + cufftHandle GetPlan() const { return plan_; } + + private: + CUDAExecutor *parent_; + cufftHandle plan_; + fft::Type fft_type_; +}; + +// FFT support for CUDA platform via cuFFT library. +// +// This satisfies the platform-agnostic FftSupport interface. +// +// Note that the cuFFT handle that this encapsulates is implicitly tied to the +// context (and, as a result, the device) that the parent CUDAExecutor is tied +// to. This simply happens as an artifact of creating the cuFFT handle when a +// CUDA context is active. +// +// Thread-safe. The CUDA context associated with all operations is the CUDA +// context of parent_, so all context is explicit. +class CUDAFft : public fft::FftSupport { + public: + explicit CUDAFft(CUDAExecutor *parent) : parent_(parent) {} + ~CUDAFft() override {} + + TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES + + private: + CUDAExecutor *parent_; + + // Two helper functions that execute dynload::cufftExec?2?. + + // This is for complex to complex FFT, when the direction is required. + template <typename FuncT, typename InputT, typename OutputT> + bool DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan, + FuncT cufft_exec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output); + + // This is for complex to real or real to complex FFT, when the direction + // is implied. 
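  // For reference, the overload-to-cuFFT-entry-point mapping produced by the
  // PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT expansion in cuda_fft.cc is:
  //
  //   complex<float>  -> complex<float>  : cufftExecC2C  (direction passed)
  //   float           -> complex<float>  : cufftExecR2C
  //   complex<float>  -> float           : cufftExecC2R
  //   complex<double> -> complex<double> : cufftExecZ2Z  (direction passed)
  //   double          -> complex<double> : cufftExecD2Z
  //   complex<double> -> double          : cufftExecZ2D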
+ template <typename FuncT, typename InputT, typename OutputT> + bool DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufft_exec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output); + + SE_DISALLOW_COPY_AND_ASSIGN(CUDAFft); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc new file mode 100644 index 0000000000..77f16e2a6e --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc @@ -0,0 +1,1082 @@ +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" + +#include <unistd.h> + +#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_event.h" +#include "tensorflow/stream_executor/cuda/cuda_platform.h" +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/cuda/cuda_timer.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/kernel_cache_config.h" +#include "tensorflow/stream_executor/lib/casts.h" +#include "tensorflow/stream_executor/lib/env.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/mathutil.h" +#include "tensorflow/stream_executor/lib/path.h" +#include "tensorflow/stream_executor/lib/process_state.h" +#include "tensorflow/stream_executor/lib/ptr_util.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/lib/str_util.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/timer.h" +#include "tensorflow/stream_executor/lib/numbers.h" + +#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_ +#error \ + "No driver calls in this file, wrap driver functionality in cuda_driver.cc." +#endif + +#ifdef __CUDA_RUNTIME_H__ +#error \ + "CUDA runtime being included into CUDA GPU executor; should be driver only." +#endif + +extern bool FLAGS_check_gpu_leaks; +tensorflow::int32 FLAGS_register_occupancy_warning_threshold; +bool FLAGS_prefer_cubin_to_ptx = true; + +namespace perftools { +namespace gputools { +namespace rng { +class RngSupport; +} // namespace rng +} // namespace gputools +} // namespace perftools + +namespace perftools { +namespace gputools { +namespace cuda { + +// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver. +// It has been observed that loading both PTX and cubins into the driver library +// can cause it to crash, but loading only CUBINs avoids those crashes; +// therefore, it's useful to have this hook to hack in uniform CUBIN-ation of +// PTX code. +// +// As this is an implementation-detail workaround, the usage is to declare this +// variable with extern linkage and populate it from another translation unit. 
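// For example, another translation unit might install a hook like this
// (sketch only; CompilePtxToCubin is a hypothetical helper):
//
//   namespace perftools { namespace gputools { namespace cuda {
//   extern std::function<string(const string &)> g_cubinate;
//   }  // namespace cuda
//   }  // namespace gputools
//   }  // namespace perftools
//
//   static bool InstallCubinateHook() {
//     perftools::gputools::cuda::g_cubinate = [](const string &ptx) {
//       return CompilePtxToCubin(ptx);  // hypothetical PTX -> CUBIN compiler
//     };
//     return true;
//   }
//   static bool hook_installed = InstallCubinateHook();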
+std::function<string(const string &)> g_cubinate; + +static CUDAEvent *AsCUDAEvent(Event *event) { + DCHECK(event != nullptr); + return static_cast<CUDAEvent *>(event->implementation()); +} + +// Given a platform-independent stream datatype, returns the internal CUDA +// platform implementation pointer. +static CUDAStream *AsCUDAStream(Stream *stream) { + DCHECK(stream != nullptr); + return static_cast<CUDAStream *>(stream->implementation()); +} + +// Given a platform-independent stream datatype, returns the platform +// implementation's internal value, suitable for passing directly to libcuda +// APIs. +CUstream AsCUDAStreamValue(Stream *stream) { + DCHECK(stream != nullptr); + return AsCUDAStream(stream)->cuda_stream(); +} + +// Given a platform-independent timer datatype, returns the internal CUDA +// platform implementation pointer. +static CUDATimer *AsCUDATimer(Timer *timer) { + DCHECK(timer != nullptr); + return static_cast<CUDATimer *>(timer->implementation()); +} + +// Given const GPU memory, returns a libcuda device pointer datatype, suitable +// for passing directly to libcuda APIs. +// +// N.B. we must lose constness in order to pass a suitable type to the existing +// libcuda APIs, so the caller should take care to only pass the result of const +// GPU memory conversions to libcuda functions which will honor constness. +static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase &gpu_mem) { + return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque()); +} + +// See description on const version above. +static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) { + return AsCudaDevicePtr(*gpu_mem); +} + +static CUcontext GetCudaContext(Stream *stream) { + return static_cast<CUDAExecutor *>(stream->parent()->implementation()) + ->cuda_context(); +} + +CUcontext ExtractCudaContext(CUDAExecutor *cuda_exec) { + CHECK(cuda_exec != nullptr); + return cuda_exec->cuda_context(); +} + +CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec) { + return static_cast<CUDAExecutor *>(stream_exec->implementation()); +} + +CUDAExecutor::~CUDAExecutor() { + for (auto &it : disk_modules_) { + CUDADriver::UnloadModule(context_, it.second); + } + for (auto &it : in_memory_modules_) { + CUDADriver::UnloadModule(context_, it.second); + } + if (context_ != nullptr) { + CUDADriver::DestroyContext(context_); + } +} + +port::Status CUDAExecutor::Init(int device_ordinal, + DeviceOptions device_options) { + device_ordinal_ = device_ordinal; + + auto status = CUDADriver::Init(); + if (!status.ok()) { + return status; + } + + status = CUDADriver::GetDevice(device_ordinal_, &device_); + if (!status.ok()) { + return status; + } + + status = CUDADriver::CreateContext(device_, device_options, &context_); + if (!status.ok()) { + return status; + } + + return CUDADriver::GetComputeCapability(&cc_major_, &cc_minor_, device_); +} + +bool CUDAExecutor::FindOnDiskForComputeCapability( + port::StringPiece filename, port::StringPiece canonical_suffix, + string *found_filename) const { + if (cc_major_ == 0 && cc_minor_ == 0) { + return false; + } + + // TODO(22689637): Eliminate unnecessary ToString()s when all dependencies + // have been migrated. 
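  // The candidate name is the base filename with ".cc<major><minor>" and the
  // canonical suffix appended; e.g. filename "foo.ptx" with canonical_suffix
  // ".ptx" on a compute capability 3.5 device yields "foo.ptx.cc35.ptx".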
+ string cc_specific = port::StrCat(filename.ToString(), ".cc", cc_major_, + cc_minor_, canonical_suffix.ToString()); + if (port::FileExists(cc_specific)) { + VLOG(2) << "found compute-capability-specific file, using that: " + << cc_specific; + *found_filename = cc_specific; + return true; + } + + VLOG(2) << "could not find compute-capability specific file at: " + << cc_specific; + if (port::FileExists(filename.ToString())) { + *found_filename = filename.ToString(); + return true; + } + + return false; +} + +// Returns the path to the running executable. +// N.B. Derived from //knowledge/smalltalk/background_kb.cc +// Arg: strip_exe: if true, remove the name of the executable itself from the +// returned string. Example: calling this from /usr/bin/foo +// would return /usr/bin. +static string GetBinaryDir(bool strip_exe) { + char exe_path[PATH_MAX] = {0}; + CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1)); + // Make sure it's null-terminated: + exe_path[sizeof(exe_path) - 1] = 0; + + if (strip_exe) { + // The exe is the last component of the path, so remove one component. + string ret = exe_path; + std::vector<string> components = port::Split(exe_path, '/'); + components.pop_back(); + return port::Join(components, "/"); + } + return exe_path; +} + +// Returns the location of the runfiles directory. +// This is the directory which "bazel run" sets as the current working directory +// before the program starts. +// N.B. This doesn't have to be running under "bazel run" in order to get the +// appropriate runfiles directory. +static string GetRunfilesDir() { + return port::StrCat(GetBinaryDir(false), ".runfiles"); +} + +bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) { + CUDAKernel *cuda_kernel = AsCUDAKernel(kernel); + CUmodule module = nullptr; + const string *kernelname; + + const OnDiskKernelLoaderSpec *on_disk_spec = nullptr; + bool has_ptx = spec.has_cuda_ptx_on_disk(); + bool has_cubin = spec.has_cuda_cubin_on_disk(); + if (has_cubin && (!has_ptx || FLAGS_prefer_cubin_to_ptx)) { + on_disk_spec = &spec.cuda_cubin_on_disk(); + } else if (has_ptx) { + on_disk_spec = &spec.cuda_ptx_on_disk(); + } + + if (on_disk_spec != nullptr) { + } else if (spec.has_cuda_ptx_in_memory()) { + kernelname = &spec.cuda_ptx_in_memory().kernelname(); + + if (cc_major_ == 0 && cc_minor_ == 0) { + return false; + } + + // Note that the orignal ptx may be compressed, and the ptx we get below is + // the decompressed result. To cache the module we should use the original + // ptx (compressed one) as the key. This is because for the same compressed + // ptx, we may get different decompressed ptx wrt the pointer value. 
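  // (in_memory_modules_ is keyed by const char * and compares pointer
  // values, so the key must be a pointer that stays stable across lookups;
  // the original text provides that stability.)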
+ const char *ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_); + const char *orig_ptx = + spec.cuda_ptx_in_memory().original_text(cc_major_, cc_minor_); + if (ptx == nullptr || orig_ptx == nullptr) { + ptx = spec.cuda_ptx_in_memory().default_text(); + orig_ptx = spec.cuda_ptx_in_memory().original_default_text(); + } + if (ptx == nullptr || orig_ptx == nullptr) { + LOG(FATAL) << "could not load ptx for kernel " << kernelname; + return false; + } + + mutex_lock lock{in_memory_modules_mu_}; + module = in_memory_modules_[orig_ptx]; + + if (module == nullptr) { + if (g_cubinate == nullptr) { + if (!CUDADriver::LoadPtx(context_, ptx, &module)) { + return false; + } + } else { + string cubin = g_cubinate(ptx); + auto load_status = + CUDADriver::LoadCubin(context_, cubin.c_str(), &module); + if (!load_status.ok()) { + LOG(ERROR) << "failed to load cubin via hook: " << load_status; + return false; + } + } + in_memory_modules_[orig_ptx] = module; + } + } else if (spec.has_cuda_cubin_in_memory()) { + kernelname = &spec.cuda_cubin_in_memory().kernelname(); + const char *cubin = spec.cuda_cubin_in_memory().bytes(); + mutex_lock lock{in_memory_modules_mu_}; + module = in_memory_modules_[cubin]; + + if (module == nullptr) { + auto load_status = CUDADriver::LoadCubin(context_, cubin, &module); + if (!load_status.ok()) { + LOG(ERROR) << "failed to load CUBIN: " << load_status; + return false; + } + + in_memory_modules_[cubin] = module; + } + } else { + LOG(WARNING) << "no method of loading CUDA kernel provided"; + return false; + } + + VLOG(2) << "getting function " << kernelname << " from module " << module; + if (!CUDADriver::GetModuleFunction(context_, module, kernelname->c_str(), + cuda_kernel->cuda_function_ptr())) { + return false; + } + + // We have to trust the kernel loader spec arity because there doesn't appear + // to be a way to reflect on the number of expected arguments w/the CUDA API. 
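  // A mismatch only surfaces later: Launch() CHECK-fails if the argument
  // count passed at launch time differs from the arity recorded here.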
+ cuda_kernel->set_arity(spec.arity()); + + KernelMetadata kernel_metadata; + if (!GetKernelMetadata(cuda_kernel, &kernel_metadata)) { + LOG(WARNING) << "Unable to get metadata for kernel " << kernelname; + } + kernel->set_metadata(kernel_metadata); + kernel->set_name(*kernelname); + return true; +} + +bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel, + KernelMetadata *kernel_metadata) { + int value; + if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS, + *cuda_kernel->cuda_function_ptr(), + &value)) { + return false; + } + kernel_metadata->set_registers_per_thread(value); + + if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + *cuda_kernel->cuda_function_ptr(), + &value)) { + return false; + } + kernel_metadata->set_shared_memory_bytes(value); + + return true; +} + +bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &kernel, + const std::vector<KernelArg> &args) { + CHECK_EQ(kernel.Arity(), args.size()); + CUstream custream = AsCUDAStreamValue(stream); + const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel); + CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue(); + + std::vector<void *> addrs; + addrs.reserve(args.size()); + int shmem_bytes = 0; + for (size_t i = 0; i < args.size(); i++) { + switch (args[i].type) { + case KernelArg::kNormal: + addrs.push_back(const_cast<void *>( + static_cast<const void *>(args[i].data.begin()))); + break; + case KernelArg::kSharedMemory: + shmem_bytes += args[i].bytes; + break; + default: + LOG(ERROR) << "Invalid kernel arg type passed (" << args[i].type + << ") for arg " << i; + return false; + } + } + + // Only perform/print the occupancy check 1x. + launched_kernels_mu_.lock(); + if (launched_kernels_.find(cufunc) == launched_kernels_.end()) { + OccupancyCheck(kernel, thread_dims, block_dims); + // TODO(rspringer): Remove elements from launched_kernels_...if we ever + // expose a kernel/module deallocation method. + launched_kernels_.insert(cufunc); + } + launched_kernels_mu_.unlock(); + + if (cuda_kernel->GetPreferredCacheConfig() != + KernelCacheConfig::kNoPreference) { + CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig()); + } + + if (!CUDADriver::LaunchKernel( + GetCudaContext(stream), cufunc, block_dims.x, block_dims.y, + block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z, + shmem_bytes, custream, addrs.data(), nullptr /* = extra */)) { + LOG(ERROR) << "failed to launch CUDA kernel with args: " << args.size() + << "; thread dim: " << thread_dims.ToString() + << "; block dim: " << block_dims.ToString(); + return false; + } + + return true; +} + +// This is a non-essential operation; if there's a failure, proceed without +// logging an error. It's nearly certain that in case of failures, we'd never +// get here in the first place; these are very low-impact routines. 
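// As a rough worked example (ignoring warp and register allocation
// granularity): with a 64K-register-per-SM budget, a kernel using 64
// registers per thread in 256-thread blocks fits at most
// 65536 / (64 * 256) = 4 resident blocks per SM; trimming it to 48 registers
// per thread would raise that ceiling to 5.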
+void CUDAExecutor::OccupancyCheck(const KernelBase &kernel, + const ThreadDim &thread_dims, + const BlockDim &block_dims) { + VLOG(2) << "Computing kernel occupancy for kernel " + << kernel.demangled_name(); + VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y + << ", " << thread_dims.z << ")"; + + int regs_per_thread; + if (!kernel.metadata().registers_per_thread(®s_per_thread)) { + return; + } + + int smem_per_block; + if (!kernel.metadata().shared_memory_bytes(&smem_per_block)) { + return; + } + + const DeviceDescription &device_description = + kernel.parent()->GetDeviceDescription(); + + uint64 blocks_per_sm = CalculateOccupancy( + device_description, regs_per_thread, smem_per_block, thread_dims); + VLOG(2) << "Resident blocks per SM is " << blocks_per_sm; + + // To increase occupancy, there must be a sufficient number of blocks + // available to spread across the sm's at this new improved occupancy level. + int multiprocessor_count = device_description.core_count(); + int block_count = block_dims.x * block_dims.y * block_dims.z; + int available_blocks_per_sm = + port::MathUtil::CeilOfRatio(block_count, multiprocessor_count); + if (available_blocks_per_sm <= static_cast<int64>(blocks_per_sm)) { + VLOG(2) << "Occupancy is limited by number of blocks available per sm."; + return; + } + + uint64 improved_regs_per_thread = CalculateRegisterLimitForTargetOccupancy( + device_description, smem_per_block, thread_dims, blocks_per_sm + 1); + if (improved_regs_per_thread != 0) { + VLOG(2) << "Reducing register usage from " << regs_per_thread + << " to " << improved_regs_per_thread + << " could increase resident blocks per SM by one."; + + uint64 reg_reduction = regs_per_thread - improved_regs_per_thread; + if (reg_reduction <= + static_cast<uint64>(FLAGS_register_occupancy_warning_threshold)) { + LOG(INFO) << "Notice: occupancy would increase if register usage was" + << " reduced from " << regs_per_thread + << " to " << improved_regs_per_thread + << " registers per thread for kernel: " + << kernel.demangled_name(); + } + } else { + VLOG(2) << "Resident blocks per SM cannot be increased by reducing " + "register usage."; + } +} + +void *CUDAExecutor::Allocate(uint64 size) { + return CUDADriver::DeviceAllocate(context_, size); +} + +void *CUDAExecutor::AllocateSubBuffer(DeviceMemoryBase *mem, + uint64 offset_bytes, uint64 size_bytes) { + // offset and size are in bytes, so char* works as the pointer type. + return reinterpret_cast<char *>(mem->opaque()) + offset_bytes; +} + +void CUDAExecutor::Deallocate(DeviceMemoryBase *mem) { + // CUDA "sub-buffers" are just pointer + offset, so no dealloc is necessary. 
+ if (!mem->is_sub_buffer()) { + CUDADriver::DeviceDeallocate(context_, mem->opaque()); + } +} + +bool CUDAExecutor::HostMemoryRegister(void *location, uint64 size) { + if (location == nullptr || size == 0) { + LOG(WARNING) << "attempting to register null or zero-sized memory: " + << location << "; size " << size; + } + VLOG(2) << "registering " << location << " size " << size; + return CUDADriver::HostRegister(context_, location, size); +} + +bool CUDAExecutor::HostMemoryUnregister(void *location) { + VLOG(2) << "unregistering " << location; + return CUDADriver::HostUnregister(context_, location); +} + +bool CUDAExecutor::SynchronizeAllActivity() { + return CUDADriver::SynchronizeContext(context_); +} + +bool CUDAExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) { + if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 && + size % 4 == 0) { + return CUDADriver::SynchronousMemsetUint32( + context_, AsCudaDevicePtr(location), 0x0, size / 4); + } + return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location), + 0x0, size); +} + +bool CUDAExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) { + if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 && + size % 4 == 0) { + // cudaMemset reinterprets "value" as a uint8. + uint8 byte_value = static_cast<uint8>(value); + uint32 pattern = (byte_value << 24) | (byte_value << 16) | + (byte_value << 8) | byte_value; + return CUDADriver::SynchronousMemsetUint32( + context_, AsCudaDevicePtr(location), pattern, size / 4); + } + return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location), + value, size); +} + +bool CUDAExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + return CUDADriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst), + host_src, size); +} + +bool CUDAExecutor::SynchronousMemcpy(void *host_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) { + return CUDADriver::SynchronousMemcpyD2H(context_, host_dst, + AsCudaDevicePtr(gpu_src), size); +} + +bool CUDAExecutor::SynchronousMemcpyDeviceToDevice( + DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) { + return CUDADriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst), + AsCudaDevicePtr(gpu_src), size); +} + +bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) { + return Memset32(stream, location, 0x0, size); +} + +bool CUDAExecutor::Memset32(Stream *stream, DeviceMemoryBase *location, + uint32 pattern, uint64 size) { + VLOG(2) << "enqueueing memset32 operation onto stream " << stream + << " at location " << location << " with size " << size + << " and pattern " << std::hex << pattern; + CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 && + size % 4 == 0); + return CUDADriver::AsynchronousMemsetUint32( + context_, AsCudaDevicePtr(location), pattern, size / 4, + AsCUDAStreamValue(stream)); +} + +bool CUDAExecutor::Memcpy(Stream *stream, void *host_dst, + const DeviceMemoryBase &gpu_src, uint64 size) { + return CUDADriver::AsynchronousMemcpyD2H(context_, host_dst, + AsCudaDevicePtr(gpu_src), size, + AsCUDAStreamValue(stream)); +} + +bool CUDAExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + return CUDADriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst), + host_src, size, + AsCUDAStreamValue(stream)); +} + +bool CUDAExecutor::MemcpyDeviceToDevice(Stream *stream, + DeviceMemoryBase *gpu_dst, + const 
DeviceMemoryBase &gpu_src, + uint64 size) { + return CUDADriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst), + AsCudaDevicePtr(gpu_src), size, + AsCUDAStreamValue(stream)); +} + +bool CUDAExecutor::HostCallback(Stream *stream, + std::function<void()> callback) { + auto callback_ptr = new std::function<void()>(callback); + return CUDADriver::AddStreamCallback(context_, AsCUDAStreamValue(stream), + InternalHostCallback, callback_ptr); +} + +/* static */ void CUDAExecutor::InternalHostCallback(CUstream stream, + CUresult status, + void *data) { + std::function<void()> *callback = + reinterpret_cast<std::function<void()> *>(data); + (*callback)(); + delete callback; +} + +port::Status CUDAExecutor::AllocateEvent(Event *event) { + return AsCUDAEvent(event)->Init(); +} + +port::Status CUDAExecutor::DeallocateEvent(Event *event) { + return AsCUDAEvent(event)->Destroy(); +} + +port::Status CUDAExecutor::RecordEvent(Stream *stream, Event *event) { + return AsCUDAEvent(event)->Record(AsCUDAStream(stream)); +} + +port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) { + if (CUDADriver::WaitStreamOnEvent(context_, + AsCUDAStream(stream)->cuda_stream(), + AsCUDAEvent(event)->cuda_event())) { + return port::Status::OK(); + } else { + return port::Status{ + port::error::INTERNAL, + port::Printf("error recording waiting for CUDA event on stream %p", + stream)}; + } +} + +Event::Status CUDAExecutor::PollForEventStatus(Event *event) { + return AsCUDAEvent(event)->PollForStatus(); +} + +bool CUDAExecutor::AllocateStream(Stream *stream) { + return AsCUDAStream(stream)->Init(); +} + +void CUDAExecutor::DeallocateStream(Stream *stream) { + CUDAStream *cuda_stream = AsCUDAStream(stream); + if (!cuda_stream->IsIdle()) { + LOG(ERROR) << "Deallocating stream with pending work"; + } + cuda_stream->Destroy(); +} + +bool CUDAExecutor::AllocateTimer(Timer *timer) { + return AsCUDATimer(timer)->Init(); +} + +void CUDAExecutor::DeallocateTimer(Timer *timer) { + AsCUDATimer(timer)->Destroy(); +} + +bool CUDAExecutor::CreateStreamDependency(Stream *dependent, Stream *other) { + CUevent other_completed_event; + bool ok = + AsCUDAStream(other)->GetOrCreateCompletedEvent(&other_completed_event); + if (!ok) { + LOG(ERROR) << "failed to get completion event from other; " + "therefore, failed to create inter-stream dependency"; + return false; + } + + ok = CUDADriver::RecordEvent(context_, other_completed_event, + AsCUDAStreamValue(other)) + .ok(); + if (!ok) { + LOG(ERROR) << "failed to record completion event; " + "therefore, failed to create inter-stream dependency"; + return false; + } + + return CUDADriver::WaitStreamOnEvent(context_, AsCUDAStreamValue(dependent), + other_completed_event); +} + +bool CUDAExecutor::StartTimer(Stream *stream, Timer *timer) { + return AsCUDATimer(timer)->Start(AsCUDAStream(stream)); +} + +bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) { + return AsCUDATimer(timer)->Stop(AsCUDAStream(stream)); +} + +bool CUDAExecutor::BlockHostUntilDone(Stream *stream) { + return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream)); +} + +blas::BlasSupport *CUDAExecutor::CreateBlas() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::BlasFactory> status = + registry->GetFactory<PluginRegistry::BlasFactory>(kCudaPlatformId, + plugin_config_.blas()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve BLAS factory: " + << status.status().error_message(); + return nullptr; + } + + return 
status.ValueOrDie()(this); +} + +dnn::DnnSupport *CUDAExecutor::CreateDnn() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::DnnFactory> status = + registry->GetFactory<PluginRegistry::DnnFactory>(kCudaPlatformId, + plugin_config_.dnn()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve DNN factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +fft::FftSupport *CUDAExecutor::CreateFft() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::FftFactory> status = + registry->GetFactory<PluginRegistry::FftFactory>(kCudaPlatformId, + plugin_config_.fft()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve FFT factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +rng::RngSupport *CUDAExecutor::CreateRng() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::RngFactory> status = + registry->GetFactory<PluginRegistry::RngFactory>(kCudaPlatformId, + plugin_config_.rng()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve RNG factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +// TODO(rspringer): Remove in b/18544742. +bool CUDAExecutor::SupportsDnn() const { + return true; +} + +bool CUDAExecutor::CanEnablePeerAccessTo(StreamExecutorInterface *other) { + CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other); + return CUDADriver::CanEnablePeerAccess(context_, cuda_other->context_); +} + +port::Status CUDAExecutor::EnablePeerAccessTo(StreamExecutorInterface *other) { + CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other); + return CUDADriver::EnablePeerAccess(context_, cuda_other->context_); +} + +SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() { + port::StatusOr<CUsharedconfig> cuda_config = + CUDADriver::ContextGetSharedMemConfig(context_); + if (!cuda_config.ok()) { + // Don't log; the failed call will log necessary output. 
+ return SharedMemoryConfig::kDefault; + } + + switch (cuda_config.ValueOrDie()) { + case CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: + return SharedMemoryConfig::kDefault; + case CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: + return SharedMemoryConfig::kFourByte; + case CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: + return SharedMemoryConfig::kEightByte; + default: + LOG(FATAL) << "Invalid shared memory configuration returned: " + << cuda_config.ValueOrDie(); + } +} + +port::Status CUDAExecutor::SetDeviceSharedMemoryConfig( + SharedMemoryConfig config) { + CUsharedconfig cuda_config; + switch (config) { + case SharedMemoryConfig::kDefault: + cuda_config = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE; + break; + case SharedMemoryConfig::kFourByte: + cuda_config = CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE; + break; + case SharedMemoryConfig::kEightByte: + cuda_config = CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE; + break; + default: + LOG(FATAL) << "Invalid shared memory configuration specified: " + << static_cast<int>(config); + } + return CUDADriver::ContextSetSharedMemConfig(context_, cuda_config); +} + +bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const { + return CUDADriver::GetDeviceMemoryInfo(context_, free, total); +} + +bool CUDAExecutor::GetSymbol(const string& symbol_name, void **mem, + size_t *bytes) { + { // give limited scope to mutex_lock + mutex_lock lock{disk_modules_mu_}; + for (auto &it : disk_modules_) { + if (CUDADriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(), + reinterpret_cast<CUdeviceptr *>(mem), + bytes)) { + return true; + } + } + } + + { // give limited scope to mutex_lock + mutex_lock lock{in_memory_modules_mu_}; + for (auto &it : in_memory_modules_) { + if (CUDADriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(), + reinterpret_cast<CUdeviceptr *>(mem), + bytes)) { + return true; + } + } + } + + LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name; + return false; +} + +bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const { + // The BlockDim name is a mismatch against these GRID_DIM_* queries because + // we use BlockDims to express the dimensions of blocks within a grid + // (as opposed to ThreadDim which expresses the dimensions of threads + // within a block). + int x, y, z; + if (!CUDADriver::GetGridLimits(&x, &y, &z, device_)) { + return false; + } + + block_dim_limit->x = x; + block_dim_limit->y = y; + block_dim_limit->z = z; + return true; +} + +KernelArg CUDAExecutor::DeviceMemoryToKernelArg( + const DeviceMemoryBase &gpu_mem) const { + const void* arg = gpu_mem.opaque(); + const uint8 *arg_ptr = reinterpret_cast<const uint8 *>(&arg); + + KernelArg kernel_arg; + kernel_arg.type = KernelArg::kNormal; + kernel_arg.data = port::InlinedVector<uint8, 4>(arg_ptr, arg_ptr + sizeof(arg)); + kernel_arg.bytes = sizeof(arg); + return kernel_arg; +} + +bool CUDAExecutor::SupportsBlas() const { return true; } + +bool CUDAExecutor::SupportsFft() const { return true; } + +bool CUDAExecutor::SupportsRng() const { return true; } + +void *CUDAExecutor::CudaContextHack() { return context_; } + +CUcontext CUDAExecutor::cuda_context() { return context_; } + +// Attemps to read the NUMA node corresponding to the GPU device's PCI bus out +// of SysFS. Returns -1 if it cannot. +// +// For anything more complicated/prod-focused than this, you'll likely want to +// turn to gsys' topology modeling. 
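// For example, for a (hypothetical) lowercased PCI bus ID "0000:84:00.0",
// this reads "/sys/bus/pci/devices/0000:84:00.0/numa_node"; a file containing
// "1" yields NUMA node 1, and a negative value in the file is coerced to node
// zero (see the comment on the negative-value path below).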
+static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) { + VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal; + static const int kUnknownNumaNode = -1; + + if (pci_bus_id.empty()) { + LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal; + return kUnknownNumaNode; + } + + string filename = + port::Printf("/sys/bus/pci/devices/%s/numa_node", pci_bus_id.c_str()); + + // We have to use fopen/fread here so that the device properties can be + // populated before InitGoogle procedure has been completed (at which point we + // could use the file::* utilities). + FILE *file = fopen(filename.c_str(), "r"); + if (file == nullptr) { + LOG(ERROR) << "could not open file to read NUMA node: " << filename; + return kUnknownNumaNode; + } + + string content; + char buf[32]; + size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file); + buf[did_read] = '\0'; + content = buf; + + int32 value; + if (port::safe_strto32(content, &value)) { + if (value < 0) { // See http://b/18228951 for details on this path. + LOG(INFO) << "successful NUMA node read from SysFS had negative value (" + << value << "), but there must be at least one NUMA node" + ", so returning NUMA node zero"; + return 0; + } + return value; + } + + LOG(WARNING) + << "could not convert SysFS file contents to integral NUMA node value: " + << content; + + return kUnknownNumaNode; +} + +// Set of compute capability specific device parameters that cannot be +// queried from the driver API. These values instead are baked into a +// lookup table indexed by compute capability version. +struct UnqueryableDeviceParams { + int cc_major; + int cc_minor; + uint64 blocks_per_core_limit; + uint64 registers_per_core_limit; + uint64 registers_per_thread_limit; + uint64 warp_alloc_granularity; + uint64 register_alloc_granularity; + uint64 shared_memory_alloc_granularity; +}; + +static const UnqueryableDeviceParams kAllUnqueryableDeviceParams[] = { + { + 3, 5, // compute capability (3.5) + 16, // blocks_per_core_limit + 64 * 1024, // registers_per_core_limit + 255, // registers_per_thread_limit + 4, // warp_alloc_granularity + 256, // register_alloc_granularity + 256 // shared_memory_alloc_granularity + } +}; + +DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const { + internal::DeviceDescriptionBuilder builder; + + { + int driver_version = 0; + (void)CUDADriver::GetDriverVersion(&driver_version); + string augmented_driver_version = port::Printf( + "%d (%s)", driver_version, + DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str()); + builder.set_driver_version(augmented_driver_version); + } + + { + string pci_bus_id = CUDADriver::GetPCIBusID(device_); + + // Lower the hex characters to match sysfs. + pci_bus_id = port::Lowercase(pci_bus_id); + builder.set_pci_bus_id(pci_bus_id); + + // Read the NUMA node corresponding to the PCI bus ID out of sysfs. 
+ int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_); + builder.set_numa_node(numa_node); + } + + CUdevprop prop; + if (CUDADriver::GetDeviceProperties(&prop, device_ordinal_)) { + builder.set_threads_per_block_limit(prop.maxThreadsPerBlock); + + ThreadDim thread_dim_limit; + thread_dim_limit.x = prop.maxThreadsDim[0]; + thread_dim_limit.y = prop.maxThreadsDim[1]; + thread_dim_limit.z = prop.maxThreadsDim[2]; + builder.set_thread_dim_limit(thread_dim_limit); + + float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6; + builder.set_clock_rate_ghz(clock_rate_ghz); + } + + { + bool ecc_enabled = false; + (void)CUDADriver::IsEccEnabled(device_, &ecc_enabled); + builder.set_ecc_enabled(ecc_enabled); + } + + { + uint64 device_memory_size = -1; + (void)CUDADriver::GetDeviceTotalMemory(device_, &device_memory_size); + builder.set_device_memory_size(device_memory_size); + } + + { + BlockDim block_dim_limit; + FillBlockDimLimit(&block_dim_limit); + builder.set_block_dim_limit(block_dim_limit); + } + + { + string device_name; + (void)CUDADriver::GetDeviceName(device_, &device_name); + builder.set_name(device_name); + } + + for (size_t i = 0; i < ARRAYSIZE(kAllUnqueryableDeviceParams); i++) { + const auto ¶ms = kAllUnqueryableDeviceParams[i]; + if (params.cc_major == cc_major_ && params.cc_minor == cc_minor_) { + builder.set_blocks_per_core_limit(params.blocks_per_core_limit); + builder.set_registers_per_core_limit(params.registers_per_core_limit); + builder.set_registers_per_thread_limit(params.registers_per_thread_limit); + builder.set_warp_alloc_granularity(params.warp_alloc_granularity); + builder.set_register_alloc_granularity(params.register_alloc_granularity); + builder.set_shared_memory_alloc_granularity( + params.shared_memory_alloc_granularity); + } + } + + builder.set_platform_version( + port::StrCat("Compute Capability ", cc_major_, ".", cc_minor_)); + + // TODO(leary) should be a way to query this from the driver, but this is + // unlikely to change for us any time soon. + builder.set_device_address_bits(64); + + builder.set_device_vendor("NVIDIA Corporation"); + builder.set_cuda_compute_capability(cc_major_, cc_minor_); + builder.set_shared_memory_per_core( + CUDADriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie()); + builder.set_shared_memory_per_block( + CUDADriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie()); + builder.set_core_count( + CUDADriver::GetMultiprocessorCount(device_).ValueOrDie()); + builder.set_threads_per_core_limit( + CUDADriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie()); + builder.set_registers_per_block_limit( + CUDADriver::GetMaxRegistersPerBlock(device_).ValueOrDie()); + builder.set_threads_per_warp( + CUDADriver::GetThreadsPerWarp(device_).ValueOrDie()); + + auto built = builder.Build(); + return built.release(); +} + +} // namespace cuda + +namespace gpu = ::perftools::gputools; + +void initialize_cuda_gpu_executor() { + port::StatusOr<void *> status = + gpu::internal::CachedDsoLoader::GetLibcudaDsoHandle(); + if (!status.ok()) { + gpu::cuda::Diagnostician::LogDriverVersionInformation(); + LOG(INFO) << "LD_LIBRARY_PATH: " << getenv("LD_LIBRARY_PATH"); + LOG(INFO) << "failed to find libcuda.so on this system: " + << status.status(); + } + + // TODO(b/22689637): Temporary until users are migrated off of PlatformKind. 
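  // The mapping below lets code that still identifies the platform by
  // PlatformKind::kCuda resolve the same platform id (kCudaPlatformId) that
  // the plugin factories are registered under.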
+ gpu::PluginRegistry::Instance()->MapPlatformKindToId( + gpu::PlatformKind::kCuda, gpu::cuda::kCudaPlatformId); + + *gpu::internal::MakeCUDAExecutorImplementation() = []( + const gpu::PluginConfig &config) { + return new gpu::cuda::CUDAExecutor{config}; + }; + + *gpu::internal::MakeCUDAKernelImplementation() = []() { + return new gpu::cuda::CUDAKernel; + }; + + *gpu::internal::MakeCUDAEventImplementation() = []( + gpu::StreamExecutor *parent) { + gpu::cuda::CUDAExecutor *cuda_executor = + static_cast<gpu::cuda::CUDAExecutor *>(parent->implementation()); + return new gpu::cuda::CUDAEvent{cuda_executor}; + }; + + *gpu::internal::MakeCUDAStreamImplementation() = []( + gpu::StreamExecutor *parent) { + gpu::cuda::CUDAExecutor *cuda_executor = + static_cast<gpu::cuda::CUDAExecutor *>(parent->implementation()); + return new gpu::cuda::CUDAStream{cuda_executor}; + }; + *gpu::internal::MakeCUDATimerImplementation() = []( + gpu::StreamExecutor *parent) { + gpu::cuda::CUDAExecutor *cuda_executor = + static_cast<gpu::cuda::CUDAExecutor *>(parent->implementation()); + return new gpu::cuda::CUDATimer{cuda_executor}; + }; +} + +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER( + cuda_gpu_executor, {perftools::gputools::initialize_cuda_gpu_executor();}); diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h new file mode 100644 index 0000000000..fda89b9738 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h @@ -0,0 +1,270 @@ +// The CUDA implementation of the StreamExecutorInterface functionality. +// CUDA inclusions are ideally confined to this implementation file. +// +// The notions from the StreamExecutor basically correspond to the CUDA streams +// programming model provided by the libcuda.so driver APIs, so we don't have +// to do much more than wrap the calls to the libraries appropriately. +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ + +#include <map> +#include <set> + +#include "tensorflow/stream_executor/cuda/cuda_kernel.h" +#include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace blas { +class BlasSupport; +} +namespace internal { +class RngSupport; +} // namespace internal +} // namespace gputools +} // namespace perftools + +namespace perftools { +namespace gputools { +namespace cuda { + +// CUDA-platform implementation of the platform-agnostic +// StreamExecutorInferface. +class CUDAExecutor : public internal::StreamExecutorInterface { + public: + // sub_platform indicates the subplatform used in this executor; it must + // be a CUDA type. + explicit CUDAExecutor(const PluginConfig &plugin_config) + : device_(0), + context_(nullptr), + device_ordinal_(0), + cc_major_(0), + cc_minor_(0), + plugin_config_(plugin_config) {} + + // See the corresponding StreamExecutor methods for method comments on the + // following overrides. 
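  // Note: a newly constructed CUDAExecutor is not usable until Init() has
  // returned OK; the constructor above only records the plugin configuration
  // and zero-initializes the device and context handles.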
+ + ~CUDAExecutor() override; + + port::Status Init(int device_ordinal, DeviceOptions device_options) override; + + bool GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) override; + + bool Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &k, + const std::vector<KernelArg> &args) override; + + void *Allocate(uint64 size) override; + + void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes, + uint64 size_bytes) override; + + void Deallocate(DeviceMemoryBase *mem) override; + + // CUDA allocation/registration functions are necessary because the driver + // internally sets up buffers for DMA operations (and page locks them). + // There's no external interface for us to otherwise control these DMA + // settings. + void *HostMemoryAllocate(uint64 size) override { + return CUDADriver::HostAllocate(context_, size); + } + + void HostMemoryDeallocate(void *location) override { + return CUDADriver::HostDeallocate(context_, location); + } + + bool HostMemoryRegister(void *location, uint64 size) override; + + bool HostMemoryUnregister(void *location) override; + + bool SynchronizeAllActivity() override; + + bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override; + + bool SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) override; + + bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + + bool SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) override; + bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern, + uint64 size) override; + + bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + + bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool HostCallback(Stream *stream, std::function<void()> callback) override; + + bool AllocateStream(Stream *stream) override; + + void DeallocateStream(Stream *stream) override; + + bool CreateStreamDependency(Stream *dependent, Stream *other) override; + + bool AllocateTimer(Timer *timer) override; + + void DeallocateTimer(Timer *timer) override; + + bool StartTimer(Stream *stream, Timer *timer) override; + + bool StopTimer(Stream *stream, Timer *timer) override; + + port::Status AllocateEvent(Event *event) override; + + port::Status DeallocateEvent(Event *event) override; + + port::Status RecordEvent(Stream *stream, Event *event) override; + + port::Status WaitForEvent(Stream *stream, Event *event) override; + + Event::Status PollForEventStatus(Event *event) override; + + bool BlockHostUntilDone(Stream *stream) override; + + int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); } + + port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override; + + bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override; + + SharedMemoryConfig GetDeviceSharedMemoryConfig() override; + + port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override; + + bool DeviceMemoryUsage(int64 *free, int64 *total) const override; + + // Search for the symbol and returns a 
device pointer and size. + // Returns false if symbol does not exist. + bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override; + + DeviceDescription *PopulateDeviceDescription() const override; + + // Populates the block_dim_limit by querying the device driver API. If an + // error occurs at any point while asking the driver for block dim limits, it + // will be only partially populated as a result, and an error will be logged. + bool FillBlockDimLimit(BlockDim *block_dim_limit) const; + + KernelArg DeviceMemoryToKernelArg( + const DeviceMemoryBase &gpu_mem) const override; + + bool SupportsBlas() const override; + + blas::BlasSupport *CreateBlas() override; + + bool SupportsFft() const override; + + fft::FftSupport *CreateFft() override; + + bool SupportsRng() const override; + + rng::RngSupport *CreateRng() override; + + bool SupportsDnn() const override; + + dnn::DnnSupport *CreateDnn() override; + + void *CudaContextHack() override; + + CUcontext cuda_context(); + + private: + // Attempts to find a more specific version of the file indicated by + // filename by looking for compute-capability-specific suffixed versions; i.e. + // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if + // we're on a compute capability 3.0 machine. + bool FindOnDiskForComputeCapability(port::StringPiece filename, + port::StringPiece canonical_suffix, + string *found_filename) const; + + // Host callback landing routine invoked by CUDA. + // data: User-provided callback provided to HostCallback() above, captured + // as a std::function<void()>. Allocated/initialized inside + // HostCallback() and owned and deleted by this call. + static void InternalHostCallback(CUstream stream, CUresult status, + void *data); + + // Collects metadata for the specified kernel. + bool GetKernelMetadata(CUDAKernel *cuda_kernel, + KernelMetadata *kernel_metadata); + + // Determines if the given kernel's occupancy could be improved by only + // slightly reducing its register usage. If so, a message is emitted to the + // INFO log. The warning threshold is controlled by the flag + // register_occupancy_warning_threshold. + void OccupancyCheck(const KernelBase &kernel, const ThreadDim &thread_dims, + const BlockDim &block_dims); + + // Guards the on-disk-module mapping. + mutex disk_modules_mu_; + + // Mapping from filename to CUmodule, if it was already retrieved. + // Multiple CUfunctions are usually obtained from a single CUmodule so we + // attempt to hit in this mapping first, before retrieving it. + std::map<string, CUmodule> disk_modules_ GUARDED_BY(disk_modules_mu_); + + // Guards the in-memory-module mapping. + mutex in_memory_modules_mu_; + + std::map<const char *, CUmodule> in_memory_modules_ + GUARDED_BY(in_memory_modules_mu_); + + // Guards the launched kernel set. + mutex launched_kernels_mu_; + + // Keeps track of the set of launched kernels. Currently used to suppress the + // occupancy check on subsequent launches. + std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_); + + // Handle for the CUDA device being operated on. Immutable + // post-initialization. + CUdevice device_; + + // Handle for session with the library/driver. Immutable post-initialization. + CUcontext context_; + + // The device ordinal value that this executor was initialized with; recorded + // for use in getting device metadata. Immutable post-initialization. + int device_ordinal_; + + // The major verion of the compute capability for device_. 
+  int cc_major_;
+
+  // The minor version of the compute capability for device_.
+  int cc_minor_;
+
+  // The plugin configuration associated with this instance.
+  PluginConfig plugin_config_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor);
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_helpers.h b/tensorflow/stream_executor/cuda/cuda_helpers.h
new file mode 100644
index 0000000000..2c5311cb3b
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_helpers.h
@@ -0,0 +1,95 @@
+// Common helper functions used for dealing with CUDA API datatypes.
+//
+// These are typically placed here for use by multiple source components (for
+// example, BLAS and executor components).
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
+
+#include <stddef.h>
+#include <complex>
+
+#include "third_party/gpus/cuda/include/cuComplex.h"
+#include "third_party/gpus/cuda/include/cuda.h"
+
+namespace perftools {
+namespace gputools {
+
+class Stream;
+template <typename ElemT>
+class DeviceMemory;
+
+namespace cuda {
+
+// Converts a const DeviceMemory reference to its underlying typed pointer in
+// CUDA device memory.
+template <typename T>
+const T *CUDAMemory(const DeviceMemory<T> &mem) {
+  return static_cast<const T *>(mem.opaque());
+}
+
+// Converts a (non-const) DeviceMemory pointer reference to its underlying
+// typed pointer in CUDA device memory.
+template <typename T>
+T *CUDAMemoryMutable(DeviceMemory<T> *mem) {
+  return static_cast<T *>(mem->opaque());
+}
+
+CUstream AsCUDAStreamValue(Stream *stream);
+
+static_assert(sizeof(std::complex<float>) == sizeof(cuComplex),
+              "std::complex<float> and cuComplex should have the same size");
+static_assert(offsetof(cuComplex, x) == 0,
+              "The real part of cuComplex should appear first.");
+static_assert(sizeof(std::complex<double>) == sizeof(cuDoubleComplex),
+              "std::complex<double> and cuDoubleComplex should have the same "
+              "size");
+static_assert(offsetof(cuDoubleComplex, x) == 0,
+              "The real part of cuDoubleComplex should appear first.");
+
+// Type traits to get CUDA complex types from std::complex<>.
+
+template <typename T>
+struct CUDAComplexT {
+  typedef T type;
+};
+
+template <>
+struct CUDAComplexT<std::complex<float>> {
+  typedef cuComplex type;
+};
+
+template <>
+struct CUDAComplexT<std::complex<double>> {
+  typedef cuDoubleComplex type;
+};
+
+// Converts pointers of std::complex<> to pointers of
+// cuComplex/cuDoubleComplex. No type conversion for non-complex types.
+
+template <typename T>
+inline const typename CUDAComplexT<T>::type *CUDAComplex(const T *p) {
+  return reinterpret_cast<const typename CUDAComplexT<T>::type *>(p);
+}
+
+template <typename T>
+inline typename CUDAComplexT<T>::type *CUDAComplex(T *p) {
+  return reinterpret_cast<typename CUDAComplexT<T>::type *>(p);
+}
+
+// Converts values of std::complex<float/double> to values of
+// cuComplex/cuDoubleComplex.
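+// Editorial example (not in the original source): given
+//   std::complex<float> z{1.0f, -2.0f};
+// CUDAComplexValue(z) produces a cuComplex whose x field is 1.0f and whose y
+// field is -2.0f, relying on the layout static_asserts above.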
+inline cuComplex CUDAComplexValue(std::complex<float> val) { + return {val.real(), val.imag()}; +} + +inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) { + return {val.real(), val.imag()}; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h new file mode 100644 index 0000000000..e8ad3955e9 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_kernel.h @@ -0,0 +1,115 @@ +// The CUDA implementation of the StreamExecutorInterface functionality. +// CUDA inclusions are ideally confined to this implementation file. +// +// The notions from the StreamExecutor basically correspond to the CUDA streams +// programming model provided by the libcuda.so driver APIs, so we don't have +// to do much more than wrap the calls to the libraries appropriately. +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_ + +#include "tensorflow/stream_executor/kernel_cache_config.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/lib/casts.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "third_party/gpus/cuda/include/cuda.h" + +#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_ +#error \ + "No driver calls in this file, wrap driver functionality in cuda_driver.cc." +#endif + +#ifdef __CUDA_RUNTIME_H__ +#error \ + "CUDA runtime being included into CUDA GPU executor; should be driver only." +#endif + +namespace perftools { +namespace gputools { +namespace cuda { + +// Wraps a CUfunction to implement the platform-independent KernelInterface. +class CUDAKernel : public internal::KernelInterface { + public: + CUDAKernel() : cuda_function_(nullptr), arity_(0), + preferred_cache_config_(KernelCacheConfig::kNoPreference) {} + + // Note that the function is unloaded when the module is unloaded, and the + // module that the function is contained in is owned by the CUDAExecutor. + ~CUDAKernel() override {} + + // As arity cannot be reflected upon using the CUDA API, the arity is + // explicitly set during the CUDAExecutor::GetKernel initialization process. + void set_arity(unsigned arity) { arity_ = arity; } + unsigned Arity() const override { return arity_; } + + // Returns the CUfunction value for passing to the CUDA API. + CUfunction AsCUDAFunctionValue() const { + DCHECK(cuda_function_ != nullptr); + return const_cast<CUfunction>(cuda_function_); + } + + // Returns the slot that the CUfunction is stored within for this object, + // for the CUDA API which wants to load into a CUfunction*. + CUfunction *cuda_function_ptr() { return &cuda_function_; } + + // CUDA supports setting the preferred cache configuration of a CUfunction + // (more-or-less equivalent to a CUDAKernel). We support this via the below + // functions; users can set a preference, and that is applied when the kernel + // is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to + // load the kernel & set the preference when the user calls the setter below; + // either approach is valid. + // Sets the current kernel cache configuration preference. 
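+  // Editorial illustration (not in the original header): a hypothetical
+  // caller that knows its kernel is shared-memory heavy might write
+  //   cuda_kernel->SetPreferredCacheConfig(KernelCacheConfig::kPreferShared);
+  // and the preference is then applied when CUDAExecutor::Launch loads the
+  // kernel, as described above.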
+  void SetPreferredCacheConfig(KernelCacheConfig config) override {
+    preferred_cache_config_ = config;
+  }
+
+  // Returns the current kernel cache configuration preference.
+  KernelCacheConfig GetPreferredCacheConfig() const override {
+    return preferred_cache_config_;
+  }
+
+  // Returns the current kernel cache configuration preference as a
+  // CUfunc_cache.
+  CUfunc_cache GetCUDACacheConfig() const {
+    switch (preferred_cache_config_) {
+      case KernelCacheConfig::kNoPreference:
+        return CU_FUNC_CACHE_PREFER_NONE;
+      case KernelCacheConfig::kPreferShared:
+        return CU_FUNC_CACHE_PREFER_SHARED;
+      case KernelCacheConfig::kPreferL1:
+        return CU_FUNC_CACHE_PREFER_L1;
+      case KernelCacheConfig::kPreferEqual:
+        return CU_FUNC_CACHE_PREFER_EQUAL;
+      default:
+        LOG(FATAL) << "Unknown KernelCacheConfig "
+                   << static_cast<int32>(preferred_cache_config_);
+    }
+  }
+
+ private:
+  CUfunction cuda_function_;  // Wrapped CUDA kernel handle.
+  unsigned arity_;            // Number of formal parameters the kernel takes.
+
+  // Preferred (but not required) cache configuration for this kernel.
+  KernelCacheConfig preferred_cache_config_;
+};
+
+// Given a platform-independent kernel datatype, returns the (const) internal
+// CUDA platform implementation pointer.
+inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
+  return static_cast<const CUDAKernel *>(kernel->implementation());
+}
+
+// Given a platform-independent kernel datatype, returns the (non-const)
+// internal CUDA platform implementation pointer.
+inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
+  return static_cast<CUDAKernel *>(kernel->implementation());
+}
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
new file mode 100644
index 0000000000..ef88b89eda
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -0,0 +1,172 @@
+#include "tensorflow/stream_executor/cuda/cuda_platform.h"
+
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+PLATFORM_DEFINE_ID(kCudaPlatformId);
+
+CudaPlatform::CudaPlatform()
+    : name_("CUDA"), min_numa_node_(0), limit_numa_node_(0) {}
+
+CudaPlatform::~CudaPlatform() {}
+
+// Due to legacy issues in user code, we can't currently call InspectNumaNodes
+// at module initialization time, because non-GPU programs still include this
+// plugin via various methods, so instead, it has to be init-on-reference.
+void CudaPlatform::InspectNumaNodes() {
+  // To get NUMA node information, we need to create all executors, so we can
+  // examine their device descriptions to see their bus assignments.
+  static bool initialized = false;
+  static mutex numa_mutex(LINKER_INITIALIZED);
+  mutex_lock lock(numa_mutex);
+  if (initialized) {
+    return;
+  }
+
+  StreamExecutorConfig config;
+  for (int i = 0; i < VisibleDeviceCount(); i++) {
+    config.ordinal = i;
+    StreamExecutor* exec = GetExecutor(config).ValueOrDie();
+    if (i == 0) {
+      // NUMA nodes may not start at 0, so set the minimum node based on the
+      // first executor we see.
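+      // Editorial worked example (not in the original source): with three
+      // visible devices on NUMA nodes {1, 1, 3}, min_numa_node_ ends up 1 and
+      // limit_numa_node_ ends up 4, so BusCount() reports 3 and DeviceToBus()
+      // maps the devices to bus ordinals {0, 0, 2}.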
+ min_numa_node_ = exec->GetDeviceDescription().numa_node(); + limit_numa_node_ = min_numa_node_ + 1; + } else { + min_numa_node_ = + std::min(min_numa_node_, exec->GetDeviceDescription().numa_node()); + limit_numa_node_ = std::max(limit_numa_node_, + exec->GetDeviceDescription().numa_node() + 1); + } + } + initialized = true; +} + +int CudaPlatform::BusCount() { + InspectNumaNodes(); + return limit_numa_node_ - min_numa_node_; +} + +int CudaPlatform::DeviceToBus(int device_ordinal) { + StreamExecutorConfig config; + config.ordinal = device_ordinal; + StreamExecutor* exec = GetExecutor(config).ValueOrDie(); + return exec->GetDeviceDescription().numa_node() - min_numa_node_; +} + +port::StatusOr<StreamExecutor*> CudaPlatform::FirstExecutorForBus( + int bus_ordinal) { + InspectNumaNodes(); + CHECK_LT(bus_ordinal, BusCount()) << "bus ordinal out of available range"; + for (int i = 0; i < VisibleDeviceCount(); i++) { + if (DeviceToBus(i) == bus_ordinal) { + StreamExecutorConfig config; + config.ordinal = i; + return GetExecutor(config).ValueOrDie(); + } + } + + return port::Status{ + port::error::NOT_FOUND, + port::Printf("Executor for bus %d not found.", bus_ordinal)}; +} + +Platform::Id CudaPlatform::id() const { return kCudaPlatformId; } + +int CudaPlatform::VisibleDeviceCount() const { + // Throw away the result - it logs internally, and this [containing] function + // isn't in the path of user control. It's safe to call this > 1x. + if (!cuda::CUDADriver::Init().ok()) { + return -1; + } + + return CUDADriver::GetDeviceCount(); +} + +const string& CudaPlatform::Name() const { return name_; } + +port::StatusOr<StreamExecutor*> CudaPlatform::ExecutorForDevice(int ordinal) { + StreamExecutorConfig config; + config.ordinal = ordinal; + config.plugin_config = PluginConfig(); + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr<StreamExecutor*> CudaPlatform::ExecutorForDeviceWithPluginConfig( + int device_ordinal, const PluginConfig& plugin_config) { + StreamExecutorConfig config; + config.ordinal = device_ordinal; + config.plugin_config = plugin_config; + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr<StreamExecutor*> CudaPlatform::GetExecutor( + const StreamExecutorConfig& config) { + mutex_lock lock(mu_); + + port::StatusOr<StreamExecutor*> status = executor_cache_.Get(config); + if (status.ok()) { + return status.ValueOrDie(); + } + + port::StatusOr<std::unique_ptr<StreamExecutor>> executor = + GetUncachedExecutor(config); + if (!executor.ok()) { + return executor.status(); + } + + StreamExecutor* naked_executor = executor.ValueOrDie().get(); + executor_cache_.Insert(config, executor.ConsumeValueOrDie()); + return naked_executor; +} + +port::StatusOr<std::unique_ptr<StreamExecutor>> +CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) { + auto executor = port::MakeUnique<StreamExecutor>(PlatformKind::kCuda, + config.plugin_config); + auto init_status = executor->Init(config.ordinal, config.device_options); + if (!init_status.ok()) { + return port::Status{ + port::error::INTERNAL, + port::Printf( + "failed initializing StreamExecutor for CUDA device ordinal %d: %s", + config.ordinal, init_status.ToString().c_str())}; + } + + return std::move(executor); +} + +void CudaPlatform::RegisterTraceListener( + std::unique_ptr<TraceListener> listener) { + LOG(FATAL) << "not yet implemented: register CUDA trace listener"; +} + +void 
CudaPlatform::UnregisterTraceListener(TraceListener* listener) { + LOG(FATAL) << "not yet implemented: unregister CUDA trace listener"; +} + +} // namespace cuda + +static void InitializeCudaPlatform() { + // Disabling leak checking, MultiPlatformManager does not destroy its + // registered platforms. + + std::unique_ptr<cuda::CudaPlatform> platform(new cuda::CudaPlatform); + SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform))); +} + +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER(cuda_platform, + perftools::gputools::InitializeCudaPlatform()); diff --git a/tensorflow/stream_executor/cuda/cuda_platform.h b/tensorflow/stream_executor/cuda/cuda_platform.h new file mode 100644 index 0000000000..966d7343f7 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_platform.h @@ -0,0 +1,98 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_ + +#include <memory> +#include "tensorflow/stream_executor/platform/port.h" +#include <vector> + +#include "tensorflow/stream_executor/executor_cache.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/trace_listener.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// Opaque and unique identifier for the CUDA platform plugin. +// This is needed so that plugins can refer to/identify this platform without +// instantiating a CudaPlatform object. +extern const Platform::Id kCudaPlatformId; + +// Cuda-specific platform plugin, registered as a singleton value via module +// initializer. +class CudaPlatform : public Platform { + public: + CudaPlatform(); + ~CudaPlatform() override; + + // CudaPlatform-specific functionality + // Returns the number of distinct buses / NUMA nodes on the machine. + int BusCount(); + + // Returns the bus/NUMA node for the specified device ordinal. + int DeviceToBus(int device_ordinal); + + // Returns the lowest-ordinal-number StreamExecutor on the specified bus. + port::StatusOr<StreamExecutor*> FirstExecutorForBus(int bus_ordinal); + + // Platform interface implementation: + // Returns the same value as kCudaPlatform above. + Platform::Id id() const override; + + // Returns -1 as a sentinel on internal failure (and logs the error). + int VisibleDeviceCount() const override; + + const string& Name() const override; + + port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override; + + port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig( + int ordinal, const PluginConfig& config) override; + + port::StatusOr<StreamExecutor*> GetExecutor( + const StreamExecutorConfig& config) override; + + port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor( + const StreamExecutorConfig& config) override; + + void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override; + + void UnregisterTraceListener(TraceListener* listener) override; + + private: + // Determines the number of NUMA nodes and the assignment of executor to each. + void InspectNumaNodes(); + + // This platform's name. 
+ string name_; + + // mutex that guards internal state. + mutable mutex mu_; + + // Cache of created executors. + ExecutorCache executor_cache_; + + // The smallest NUMA node value for any device managed by this machine + // manager. Used, along with limit_numa_node_, to convert NUMA nodes into bus + // ordinals. The NUMA node space occupied by GPUs is assumed to be dense./ + int min_numa_node_; + + // Larger than the NUMA node value for any device managed by this machine + // manager. + int limit_numa_node_; + + SE_DISALLOW_COPY_AND_ASSIGN(CudaPlatform); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc new file mode 100644 index 0000000000..ad48c8b59a --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_rng.cc @@ -0,0 +1,317 @@ +#include "tensorflow/stream_executor/cuda/cuda_rng.h" + +#include <dlfcn.h> + +#include "tensorflow/stream_executor/cuda/cuda_activation.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/cuda/cuda_helpers.h" +#include "tensorflow/stream_executor/cuda/cuda_platform.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/rng.h" +#include "third_party/gpus/cuda/include/curand.h" + +// Formats curandStatus_t to output prettified values into a log stream. +std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) { +#define OSTREAM_CURAND_STATUS(__name) \ + case CURAND_STATUS_##__name: \ + in << "CURAND_STATUS_" #__name; \ + return in; + + switch (status) { + OSTREAM_CURAND_STATUS(SUCCESS) + OSTREAM_CURAND_STATUS(VERSION_MISMATCH) + OSTREAM_CURAND_STATUS(NOT_INITIALIZED) + OSTREAM_CURAND_STATUS(ALLOCATION_FAILED) + OSTREAM_CURAND_STATUS(TYPE_ERROR) + OSTREAM_CURAND_STATUS(OUT_OF_RANGE) + OSTREAM_CURAND_STATUS(LENGTH_NOT_MULTIPLE) + OSTREAM_CURAND_STATUS(LAUNCH_FAILURE) + OSTREAM_CURAND_STATUS(PREEXISTING_FAILURE) + OSTREAM_CURAND_STATUS(INITIALIZATION_FAILED) + OSTREAM_CURAND_STATUS(ARCH_MISMATCH) + OSTREAM_CURAND_STATUS(INTERNAL_ERROR) + default: + in << "curandStatus_t(" << static_cast<int>(status) << ")"; + return in; + } +} + +namespace perftools { +namespace gputools { +namespace cuda { + +PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin); + +namespace dynload { + +#define PERFTOOLS_GPUTOOLS_CURAND_WRAP(__name) \ + struct DynLoadShim__##__name { \ + static const char *kName; \ + using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \ + static void *GetDsoHandle() { \ + static auto status = internal::CachedDsoLoader::GetCurandDsoHandle(); \ + return status.ValueOrDie(); \ + } \ + static FuncPointerT DynLoad() { \ + static void *f = dlsym(GetDsoHandle(), kName); \ + CHECK(f != nullptr) << "could not find " << kName \ + << " in curand DSO; dlerror: " << dlerror(); \ + return reinterpret_cast<FuncPointerT>(f); \ + } \ + template <typename... Args> \ + curandStatus_t operator()(CUDAExecutor * parent, Args... 
args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandCreateGenerator); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandDestroyGenerator); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetStream); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniform); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniformDouble); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetPseudoRandomGeneratorSeed); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetGeneratorOffset); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormal); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormalDouble); + +} // namespace dynload + +template <typename T> +string TypeString(); + +template <> +string TypeString<float>() { + return "float"; +} + +template <> +string TypeString<double>() { + return "double"; +} + +template <> +string TypeString<std::complex<float>>() { + return "std::complex<float>"; +} + +template <> +string TypeString<std::complex<double>>() { + return "std::complex<double>"; +} + +CUDARng::CUDARng(CUDAExecutor *parent) : parent_(parent), rng_(nullptr) {} + +CUDARng::~CUDARng() { + if (rng_ != nullptr) { + dynload::curandDestroyGenerator(parent_, rng_); + } +} + +bool CUDARng::Init() { + mutex_lock lock{mu_}; + CHECK(rng_ == nullptr); + + curandStatus_t ret = + dynload::curandCreateGenerator(parent_, &rng_, CURAND_RNG_PSEUDO_DEFAULT); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to create random number generator: " << ret; + return false; + } + + CHECK(rng_ != nullptr); + return true; +} + +bool CUDARng::SetStream(Stream *stream) { + curandStatus_t ret = + dynload::curandSetStream(parent_, rng_, AsCUDAStreamValue(stream)); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for random generation: " << ret; + return false; + } + + return true; +} + +// Returns true if std::complex stores its contents as two consecutive +// elements. Tests int, float and double, as the last two are independent +// specializations. +constexpr bool ComplexIsConsecutiveFloats() { + return sizeof(std::complex<int>) == 8 && sizeof(std::complex<float>) == 8 && + sizeof(std::complex<double>) == 16; +} + +template <typename T> +bool CUDARng::DoPopulateRandUniformInternal(Stream *stream, + DeviceMemory<T> *v) { + mutex_lock lock{mu_}; + static_assert(ComplexIsConsecutiveFloats(), + "std::complex values are not stored as consecutive values"); + + if (!SetStream(stream)) { + return false; + } + + // std::complex<T> is currently implemented as two consecutive T variables. 
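+  // Editorial note (not in the original source): because of that layout, a
+  // DeviceMemory<std::complex<float>> holding N logical elements can be
+  // filled by generating 2 * N floats (e.g. 128 complex values require 256
+  // generated floats), which is what the doubling below accomplishes.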
+ uint64 element_count = v->ElementCount(); + if (std::is_same<T, std::complex<float>>::value || + std::is_same<T, std::complex<double>>::value) { + element_count *= 2; + } + + curandStatus_t ret; + if (std::is_same<T, float>::value || + std::is_same<T, std::complex<float>>::value) { + ret = dynload::curandGenerateUniform( + parent_, rng_, reinterpret_cast<float *>(CUDAMemoryMutable(v)), + element_count); + } else { + ret = dynload::curandGenerateUniformDouble( + parent_, rng_, reinterpret_cast<double *>(CUDAMemoryMutable(v)), + element_count); + } + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to do uniform generation of " << v->ElementCount() + << " " << TypeString<T>() << "s at " << v->opaque() << ": " + << ret; + return false; + } + + return true; +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<float>> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<double>> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +template <typename ElemT, typename FuncT> +bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, + ElemT stddev, + DeviceMemory<ElemT> *v, + FuncT func) { + mutex_lock lock{mu_}; + + if (!SetStream(stream)) { + return false; + } + + uint64 element_count = v->ElementCount(); + curandStatus_t ret = + func(parent_, rng_, CUDAMemoryMutable(v), element_count, mean, stddev); + + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount() + << " floats at " << v->opaque() << ": " << ret; + return false; + } + + return true; +} + +bool CUDARng::DoPopulateRandGaussian(Stream *stream, float mean, float stddev, + DeviceMemory<float> *v) { + return DoPopulateRandGaussianInternal(stream, mean, stddev, v, + dynload::curandGenerateNormal); +} + +bool CUDARng::DoPopulateRandGaussian(Stream *stream, double mean, double stddev, + DeviceMemory<double> *v) { + return DoPopulateRandGaussianInternal(stream, mean, stddev, v, + dynload::curandGenerateNormalDouble); +} + +bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) { + mutex_lock lock{mu_}; + CHECK(rng_ != nullptr); + + if (!CheckSeed(seed, seed_bytes)) { + return false; + } + + if (!SetStream(stream)) { + return false; + } + + // Requires 8 bytes of seed data; checked in RngSupport::CheckSeed (above) + // (which itself requires 16 for API consistency with host RNG fallbacks). 
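+  // Editorial clarification (not in the original source): only the first
+  // sizeof(uint64) bytes of `seed` are consumed; they are reinterpreted in
+  // host byte order as the single 64-bit seed that cuRAND expects, after
+  // which the generator offset is reset to 0 below.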
+ curandStatus_t ret = dynload::curandSetPseudoRandomGeneratorSeed( + parent_, rng_, *(reinterpret_cast<const uint64 *>(seed))); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set rng seed: " << ret; + return false; + } + + ret = dynload::curandSetGeneratorOffset(parent_, rng_, 0); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to reset rng position: " << ret; + return false; + } + return true; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +namespace gpu = ::perftools::gputools; + +REGISTER_MODULE_INITIALIZER(register_curand, { + gpu::port::Status status = + gpu::PluginRegistry::Instance() + ->RegisterFactory<gpu::PluginRegistry::RngFactory>( + gpu::cuda::kCudaPlatformId, gpu::cuda::kCuRandPlugin, "cuRAND", + [](gpu::internal::StreamExecutorInterface + *parent) -> gpu::rng::RngSupport * { + gpu::cuda::CUDAExecutor *cuda_executor = + dynamic_cast<gpu::cuda::CUDAExecutor *>(parent); + if (cuda_executor == nullptr) { + LOG(ERROR) + << "Attempting to initialize an instance of the cuRAND " + << "support library with a non-CUDA StreamExecutor"; + return nullptr; + } + + gpu::cuda::CUDARng *rng = new gpu::cuda::CUDARng(cuda_executor); + if (!rng->Init()) { + // Note: Init() will log a more specific error. + delete rng; + return nullptr; + } + return rng; + }); + + if (!status.ok()) { + LOG(ERROR) << "Unable to register cuRAND factory: " + << status.error_message(); + } + + // Prime the cuRAND DSO. The loader will log more information. + auto statusor = gpu::internal::CachedDsoLoader::GetCurandDsoHandle(); + if (!statusor.ok()) { + LOG(INFO) << "Unable to load cuRAND DSO."; + } + + gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId, + gpu::PluginKind::kRng, + gpu::cuda::kCuRandPlugin); +}); diff --git a/tensorflow/stream_executor/cuda/cuda_rng.h b/tensorflow/stream_executor/cuda/cuda_rng.h new file mode 100644 index 0000000000..4e1b82969b --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_rng.h @@ -0,0 +1,89 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_ + +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/rng.h" + +typedef struct curandGenerator_st *curandGenerator_t; + +namespace perftools { +namespace gputools { + +class Stream; +template <typename ElemT> +class DeviceMemory; + +namespace cuda { + +// Opaque and unique identifier for the cuRAND plugin. +extern const PluginId kCuRandPlugin; + +class CUDAExecutor; + +// CUDA-platform implementation of the random number generation support +// interface. +// +// Thread-safe post-initialization. +class CUDARng : public rng::RngSupport { + public: + explicit CUDARng(CUDAExecutor *parent); + + // Retrieves a curand library generator handle. This is necessary for + // enqueuing random number generation work onto the device. + // TODO(leary) provide a way for users to select the RNG algorithm. + bool Init(); + + // Releases a curand library generator handle, if one was acquired. + ~CUDARng() override; + + // See rng::RngSupport for details on the following overrides. 
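+  //
+  // Editorial usage sketch (hypothetical call site, not in the original
+  // header), assuming a live Stream and a DeviceMemory<float> buffer:
+  //
+  //   CUDARng rng{cuda_executor};
+  //   if (rng.Init() && rng.DoPopulateRandUniform(&stream, &device_floats)) {
+  //     // Generation has been enqueued on `stream`; the values are uniform
+  //     // single-precision floats produced by cuRAND's default generator.
+  //   }
+  //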
+ bool DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) override; + bool DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) override; + bool DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<float>> *v) override; + bool DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<double>> *v) override; + bool DoPopulateRandGaussian(Stream *stream, float mean, float stddev, + DeviceMemory<float> *v) override; + bool DoPopulateRandGaussian(Stream *stream, double mean, double stddev, + DeviceMemory<double> *v) override; + + bool SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) override; + + private: + // Actually performs the work of generating random numbers - the public + // methods are thin wrappers to this interface. + template <typename T> + bool DoPopulateRandUniformInternal(Stream *stream, DeviceMemory<T> *v); + template <typename ElemT, typename FuncT> + bool DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, ElemT stddev, + DeviceMemory<ElemT> *v, FuncT func); + + // Sets the stream for the internal curand generator. + // + // This is a stateful operation, as the handle can only have one stream set at + // a given time, so it is usually performed right before enqueuing work to do + // with random number generation. + bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // mutex that guards the cuRAND handle for this device. + mutex mu_; + + // CUDAExecutor which instantiated this CUDARng. + // Immutable post-initialization. + CUDAExecutor *parent_; + + // cuRANDalibrary handle on the device. + curandGenerator_t rng_ GUARDED_BY(mu_); + + SE_DISALLOW_COPY_AND_ASSIGN(CUDARng); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_stream.cc b/tensorflow/stream_executor/cuda/cuda_stream.cc new file mode 100644 index 0000000000..e70579b55c --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_stream.cc @@ -0,0 +1,51 @@ +#include "tensorflow/stream_executor/cuda/cuda_stream.h" + +#include "tensorflow/stream_executor/lib/status.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +bool CUDAStream::Init() { + return CUDADriver::CreateStream(parent_->cuda_context(), &cuda_stream_); +} + +void CUDAStream::Destroy() { + { + mutex_lock lock{mu_}; + if (completed_event_ != nullptr) { + port::Status status = + CUDADriver::DestroyEvent(parent_->cuda_context(), &completed_event_); + if (!status.ok()) { + LOG(ERROR) << status.error_message(); + } + } + } + + CUDADriver::DestroyStream(parent_->cuda_context(), &cuda_stream_); +} + +bool CUDAStream::IsIdle() const { + return CUDADriver::IsStreamIdle(parent_->cuda_context(), cuda_stream_); +} + +bool CUDAStream::GetOrCreateCompletedEvent(CUevent *completed_event) { + mutex_lock lock{mu_}; + if (completed_event_ != nullptr) { + *completed_event = completed_event_; + return true; + } + + if (!CUDADriver::CreateEvent(parent_->cuda_context(), &completed_event_, + CUDADriver::EventFlags::kDisableTiming) + .ok()) { + return false; + } + + *completed_event = completed_event_; + return true; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_stream.h b/tensorflow/stream_executor/cuda/cuda_stream.h new file mode 100644 index 0000000000..f6db64a1bf --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_stream.h @@ -0,0 +1,74 @@ +// Defines the CUDAStream type - 
the CUDA-specific implementation of the generic +// StreamExecutor Stream interface. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_ + +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +class CUDAExecutor; + +// Wraps a CUstream in order to satisfy the platform-independent +// StreamInterface. +// +// Thread-safe post-initialization. +class CUDAStream : public internal::StreamInterface { + public: + explicit CUDAStream(CUDAExecutor *parent) + : parent_(parent), cuda_stream_(nullptr), completed_event_(nullptr) {} + + // Note: teardown is handled by a parent's call to DeallocateStream. + ~CUDAStream() override {} + + void *CudaStreamHack() override { return cuda_stream_; } + void **CudaStreamMemberHack() override { + return reinterpret_cast<void **>(&cuda_stream_); + } + + // Explicitly initialize the CUDA resources associated with this stream, used + // by StreamExecutor::AllocateStream(). + bool Init(); + + // Explicitly destroy the CUDA resources associated with this stream, used by + // StreamExecutor::DeallocateStream(). + void Destroy(); + + // Returns true if no work is pending or executing on the stream. + bool IsIdle() const; + + // Retrieves an event which indicates that all work enqueued into the stream + // has completed. Ownership of the event is not transferred to the caller, the + // event is owned by this stream. + bool GetOrCreateCompletedEvent(CUevent *completed_event); + + // Returns the CUstream value for passing to the CUDA API. + // + // Precond: this CUDAStream has been allocated (otherwise passing a nullptr + // into the NVIDIA library causes difficult-to-understand faults). + CUstream cuda_stream() const { + DCHECK(cuda_stream_ != nullptr); + return const_cast<CUstream>(cuda_stream_); + } + + CUDAExecutor *parent() const { return parent_; } + + private: + mutex mu_; // mutex that guards the completion event. + CUDAExecutor *parent_; // Executor that spawned this stream. + CUstream cuda_stream_; // Wrapped CUDA stream handle. + + // Event that indicates this stream has completed. 
+ CUevent completed_event_ GUARDED_BY(mu_); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/cuda/cuda_timer.cc new file mode 100644 index 0000000000..ad5e13ab6b --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_timer.cc @@ -0,0 +1,73 @@ +#include "tensorflow/stream_executor/cuda/cuda_timer.h" + +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/lib/status.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +bool CUDATimer::Init() { + CHECK(start_event_ == nullptr && stop_event_ == nullptr); + CUcontext context = parent_->cuda_context(); + if (!CUDADriver::CreateEvent(context, &start_event_, + CUDADriver::EventFlags::kDefault) + .ok()) { + return false; + } + + if (!CUDADriver::CreateEvent(context, &stop_event_, + CUDADriver::EventFlags::kDefault) + .ok()) { + port::Status status = CUDADriver::DestroyEvent(context, &start_event_); + if (!status.ok()) { + LOG(ERROR) << status; + } + return false; + } + + CHECK(start_event_ != nullptr && stop_event_ != nullptr); + return true; +} + +void CUDATimer::Destroy() { + CUcontext context = parent_->cuda_context(); + port::Status status = CUDADriver::DestroyEvent(context, &start_event_); + if (!status.ok()) { + LOG(ERROR) << status; + } + + status = CUDADriver::DestroyEvent(context, &stop_event_); + if (!status.ok()) { + LOG(ERROR) << status; + } +} + +float CUDATimer::GetElapsedMilliseconds() const { + CHECK(start_event_ != nullptr && stop_event_ != nullptr); + // TODO(leary) provide a way to query timer resolution? + // CUDA docs say a resolution of about 0.5us + float elapsed_milliseconds = NAN; + (void)CUDADriver::GetEventElapsedTime(parent_->cuda_context(), + &elapsed_milliseconds, start_event_, + stop_event_); + return elapsed_milliseconds; +} + +bool CUDATimer::Start(CUDAStream *stream) { + return CUDADriver::RecordEvent(parent_->cuda_context(), start_event_, + stream->cuda_stream()) + .ok(); +} + +bool CUDATimer::Stop(CUDAStream *stream) { + return CUDADriver::RecordEvent(parent_->cuda_context(), stop_event_, + stream->cuda_stream()) + .ok(); +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h new file mode 100644 index 0000000000..e49e212403 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_timer.h @@ -0,0 +1,69 @@ +// Defines the CUDATimer type - the CUDA-specific implementation of the generic +// StreamExecutor Timer interface. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_ + +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +class CUDAExecutor; +class CUDAStream; + +// Wraps a pair of CUevents in order to satisfy the platform-independent +// TimerInferface -- both a start and a stop event are present which may be +// recorded in a stream. 
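+//
+// Editorial usage sketch (not in the original header); error handling and the
+// StreamExecutor-level teardown described below are omitted:
+//
+//   CUDATimer timer{cuda_executor};
+//   timer.Init();
+//   timer.Start(cuda_stream);
+//   ... enqueue work on the stream ...
+//   timer.Stop(cuda_stream);
+//   uint64 usecs = timer.Microseconds();  // valid once both events complete
+//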
+class CUDATimer : public internal::TimerInterface {
+ public:
+  explicit CUDATimer(CUDAExecutor *parent)
+      : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
+
+  // Note: teardown is explicitly handled in this API by a call to
+  // StreamExecutor::DeallocateTimer(), which invokes Destroy().
+  ~CUDATimer() override {}
+
+  // Allocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::AllocateTimer().
+  bool Init();
+
+  // Deallocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::DeallocateTimer().
+  void Destroy();
+
+  // Records the "timer start" event at the current point in the stream.
+  bool Start(CUDAStream *stream);
+
+  // Records the "timer stop" event at the current point in the stream.
+  bool Stop(CUDAStream *stream);
+
+  // Returns the elapsed time, in milliseconds, between the start and stop
+  // events.
+  float GetElapsedMilliseconds() const;
+
+  // See perftools::gputools::Timer::Microseconds().
+  // TODO(leary) make this into an error code interface...
+  uint64 Microseconds() const override {
+    return GetElapsedMilliseconds() * 1e3;
+  }
+
+  // See perftools::gputools::Timer::Nanoseconds().
+  uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
+
+ private:
+  CUDAExecutor *parent_;
+  CUevent start_event_;  // Event recorded to indicate the "start" timestamp
+                         // executing in a stream.
+  CUevent stop_event_;   // Event recorded to indicate the "stop" timestamp
+                         // executing in a stream.
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
diff --git a/tensorflow/stream_executor/cuda/multi_op_activation.h b/tensorflow/stream_executor/cuda/multi_op_activation.h
new file mode 100644
index 0000000000..ba2bcd3a91
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/multi_op_activation.h
@@ -0,0 +1,16 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_MULTI_OP_ACTIVATION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_MULTI_OP_ACTIVATION_H_
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+// Type-safe boolean wrapper: denotes whether a ScopedActivateExecutorContext
+// may have other ScopedActivateExecutorContexts nested within it.
+enum class MultiOpActivation { kNo = false, kYes = true };
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_MULTI_OP_ACTIVATION_H_