From f41959ccb2d9d4c722fe8fc3351401d53bcf4900 Mon Sep 17 00:00:00 2001 From: Manjunath Kudlur Date: Fri, 6 Nov 2015 16:27:58 -0800 Subject: TensorFlow: Initial commit of TensorFlow library. TensorFlow is an open source software library for numerical computation using data flow graphs. Base CL: 107276108 --- tensorflow/stream_executor/cuda/cuda_blas.cc | 2184 ++++++++++++++++++++++++++ 1 file changed, 2184 insertions(+) create mode 100644 tensorflow/stream_executor/cuda/cuda_blas.cc (limited to 'tensorflow/stream_executor/cuda/cuda_blas.cc') diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc new file mode 100644 index 0000000000..ef1036bca3 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -0,0 +1,2184 @@ +#include "tensorflow/stream_executor/cuda/cuda_blas.h" + +#include + +#include + +#include "tensorflow/stream_executor/cuda/cuda_activation.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/cuda/cuda_helpers.h" +#include "tensorflow/stream_executor/cuda/cuda_platform.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/status_macros.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include "third_party/gpus/cuda/include/cublas_v2.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin); + +namespace dynload { + +#define PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name) \ + struct DynLoadShim__##__name { \ + static const char *kName; \ + using FuncPointerT = std::add_pointer::type; \ + static void *GetDsoHandle() { \ + static auto status = internal::CachedDsoLoader::GetCublasDsoHandle(); \ + return status.ValueOrDie(); \ + } \ + static FuncPointerT DynLoad() { \ + static void *f = dlsym(GetDsoHandle(), kName); \ + CHECK(f != nullptr) << "could not find " << kName \ + << " in cuBLAS DSO; dlerror: " << dlerror(); \ + return reinterpret_cast(f); \ + } \ + template \ + cublasStatus_t operator()(CUDAExecutor * parent, Args... args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +#define PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(__name) \ + PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name) + +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSnrm2) \ + __macro(cublasDnrm2) \ + __macro(cublasScnrm2) \ + __macro(cublasDznrm2) \ + __macro(cublasSdot) \ + __macro(cublasDdot) \ + __macro(cublasCdotu) \ + __macro(cublasCdotc) \ + __macro(cublasZdotu) \ + __macro(cublasZdotc) \ + __macro(cublasSscal) \ + __macro(cublasDscal) \ + __macro(cublasCscal) \ + __macro(cublasCsscal) \ + __macro(cublasZscal) \ + __macro(cublasZdscal) \ + __macro(cublasSaxpy) \ + __macro(cublasDaxpy) \ + __macro(cublasCaxpy) \ + __macro(cublasZaxpy) \ + __macro(cublasScopy) \ + __macro(cublasDcopy) \ + __macro(cublasCcopy) \ + __macro(cublasZcopy) \ + __macro(cublasSswap) \ + __macro(cublasDswap) \ + __macro(cublasCswap) \ + __macro(cublasZswap) \ + __macro(cublasIsamax) \ + __macro(cublasIdamax) \ + __macro(cublasIcamax) \ + __macro(cublasIzamax) \ + __macro(cublasIsamin) \ + __macro(cublasIdamin) \ + __macro(cublasIcamin) \ + __macro(cublasIzamin) \ + __macro(cublasSasum) \ + __macro(cublasDasum) \ + __macro(cublasScasum) \ + __macro(cublasDzasum) \ + __macro(cublasSrot) \ + __macro(cublasDrot) \ + __macro(cublasCrot) \ + __macro(cublasCsrot) \ + __macro(cublasZrot) \ + __macro(cublasZdrot) \ + __macro(cublasSrotg) \ + __macro(cublasDrotg) \ + __macro(cublasCrotg) \ + __macro(cublasZrotg) \ + __macro(cublasSrotm) \ + __macro(cublasDrotm) \ + __macro(cublasSrotmg) \ + __macro(cublasDrotmg) \ + __macro(cublasSgemv) \ + __macro(cublasDgemv) \ + __macro(cublasCgemv) \ + __macro(cublasZgemv) \ + __macro(cublasSgbmv) \ + __macro(cublasDgbmv) \ + __macro(cublasCgbmv) \ + __macro(cublasZgbmv) \ + __macro(cublasStrmv) \ + __macro(cublasDtrmv) \ + __macro(cublasCtrmv) \ + __macro(cublasZtrmv) \ + __macro(cublasStbmv) \ + __macro(cublasDtbmv) \ + __macro(cublasCtbmv) \ + __macro(cublasZtbmv) \ + __macro(cublasStpmv) \ + __macro(cublasDtpmv) \ + __macro(cublasCtpmv) \ + __macro(cublasZtpmv) \ + __macro(cublasStrsv) \ + __macro(cublasDtrsv) \ + __macro(cublasCtrsv) \ + __macro(cublasZtrsv) \ + __macro(cublasStpsv) \ + __macro(cublasDtpsv) \ + __macro(cublasCtpsv) \ + __macro(cublasZtpsv) \ + __macro(cublasStbsv) \ + __macro(cublasDtbsv) \ + __macro(cublasCtbsv) \ + __macro(cublasZtbsv) \ + __macro(cublasSsymv) \ + __macro(cublasDsymv) \ + __macro(cublasCsymv) \ + __macro(cublasZsymv) \ + __macro(cublasChemv) \ + __macro(cublasZhemv) \ + __macro(cublasSsbmv) \ + __macro(cublasDsbmv) \ + __macro(cublasChbmv) \ + __macro(cublasZhbmv) \ + __macro(cublasSspmv) \ + __macro(cublasDspmv) \ + __macro(cublasChpmv) \ + __macro(cublasZhpmv) \ + __macro(cublasSger) \ + __macro(cublasDger) \ + __macro(cublasCgeru) \ + __macro(cublasCgerc) \ + __macro(cublasZgeru) \ + __macro(cublasZgerc) \ + __macro(cublasSsyr) \ + __macro(cublasDsyr) \ + __macro(cublasCsyr) \ + __macro(cublasZsyr) \ + __macro(cublasCher) \ + __macro(cublasZher) \ + __macro(cublasSspr) \ + __macro(cublasDspr) \ + __macro(cublasChpr) \ + __macro(cublasZhpr) \ + __macro(cublasSsyr2) \ + __macro(cublasDsyr2) \ + __macro(cublasCsyr2) \ + __macro(cublasZsyr2) \ + __macro(cublasCher2) \ + __macro(cublasZher2) \ + __macro(cublasSspr2) \ + __macro(cublasDspr2) \ + __macro(cublasChpr2) \ + __macro(cublasZhpr2) \ + __macro(cublasSgemm) \ + __macro(cublasDgemm) \ + __macro(cublasCgemm) \ + __macro(cublasZgemm) \ + __macro(cublasSsyrk) \ + __macro(cublasDsyrk) \ + __macro(cublasCsyrk) \ + __macro(cublasZsyrk) \ + __macro(cublasCherk) \ + __macro(cublasZherk) \ + __macro(cublasSsyr2k) \ + __macro(cublasDsyr2k) \ + __macro(cublasCsyr2k) \ + __macro(cublasZsyr2k) \ + __macro(cublasCher2k) \ + __macro(cublasZher2k) \ + __macro(cublasSsyrkx) \ + __macro(cublasDsyrkx) \ + __macro(cublasCsyrkx) \ + __macro(cublasZsyrkx) \ + __macro(cublasCherkx) \ + __macro(cublasZherkx) \ + __macro(cublasSsymm) \ + __macro(cublasDsymm) \ + __macro(cublasCsymm) \ + __macro(cublasZsymm) \ + __macro(cublasChemm) \ + __macro(cublasZhemm) \ + __macro(cublasStrsm) \ + __macro(cublasDtrsm) \ + __macro(cublasCtrsm) \ + __macro(cublasZtrsm) \ + __macro(cublasStrmm) \ + __macro(cublasDtrmm) \ + __macro(cublasCtrmm) \ + __macro(cublasZtrmm) \ + __macro(cublasSgeam) \ + __macro(cublasDgeam) \ + __macro(cublasCgeam) \ + __macro(cublasZgeam) \ + __macro(cublasSdgmm) \ + __macro(cublasDdgmm) \ + __macro(cublasCdgmm) \ + __macro(cublasZdgmm) + +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasCreate) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasDestroy) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetStream) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetPointerMode) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasGetPointerMode) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasDgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasCgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasZgemmBatched) +CUBLAS_BLAS_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP) + +} // namespace dynload + +static string ToString(cublasStatus_t status) { + switch (status) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + default: + return port::StrCat(""); + } +} + +// cuBLAS has interfaces that permit pointers to be passed from either the host +// memory space or the device memory space; however, you must instruct it as to +// which address space those pointers are in with cublasSetPointerMode. +// +// This helper sets the cuBLAS pointer mode to a desired value for a cuBLAS call +// you are about to perform in a given scope. +// +// The prior cuBLAS pointer mode is retained and restored when this object goes +// out of scope. +class ScopedCublasPointerMode { + public: + // Note that, because the setting of the cublas pointer mode is fallible, + // construction of this scoped datatype must be paired with a call to + // Init(). + // + // Parameters: + // handle: The cublas library handle to act upon in setting the pointer mode. + explicit ScopedCublasPointerMode(CUDAExecutor *parent, cublasHandle_t handle) + : parent_(parent), handle_(handle), ok_(false) {} + + // Attempts the switch to the requested scoped pointer mode, new_mode. + // + // Note that when false is returned, an appropriate error has already been + // logged. + bool Init(cublasPointerMode_t new_mode) { + cublasStatus_t ret = + dynload::cublasGetPointerMode_v2(parent_, handle_, &old_mode_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to get old cublas pointer mode: " << ToString(ret); + return ok_ = false; + } + + ret = dynload::cublasSetPointerMode_v2(parent_, handle_, new_mode); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set new cublas pointer mode: " << ToString(ret); + return ok_ = false; + } + + return ok_ = true; + } + + // Switches back to the prior pointer mode, if the switch operation was + // successful in the first place. + ~ScopedCublasPointerMode() { + if (ok_) { + cublasStatus_t ret = + dynload::cublasSetPointerMode_v2(parent_, handle_, old_mode_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set former cublas pointer mode: " + << ToString(ret); + } + } + } + + private: + CUDAExecutor *parent_; // Executor establishing this pointer mode for. + cublasHandle_t handle_; // Handle to the cuBLAS instance of interest. + cublasPointerMode_t old_mode_; // Prior cuBLAS pointer mode, to be restored. + bool ok_; // Whether the change was successful. +}; + +bool CUDABlas::Init() { + cublasStatus_t ret = dynload::cublasCreate_v2(parent_, &blas_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to create cublas handle: " << ToString(ret); + return false; + } + + return true; +} + +CUDABlas::CUDABlas(cuda::CUDAExecutor *parent) + : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {} + +CUDABlas::~CUDABlas() { + if (blas_ != nullptr) { + dynload::cublasDestroy_v2(parent_, blas_); + } +} + +bool CUDABlas::SetStream(Stream *stream) { + CHECK(stream != nullptr); + CHECK(AsCUDAStreamValue(stream) != nullptr); + CHECK(blas_ != nullptr); + cublasStatus_t ret = + dynload::cublasSetStream_v2(parent_, blas_, AsCUDAStreamValue(stream)); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cuBLAS calls: " << ToString(ret); + return false; + } + + return true; +} + +namespace { + +// Helper functions transforming blas arguments into cuBLAS arguments. + +cublasOperation_t CUDABlasTranspose(blas::Transpose trans) { + switch (trans) { + case blas::Transpose::kNoTranspose: + return CUBLAS_OP_N; + case blas::Transpose::kTranspose: + return CUBLAS_OP_T; + case blas::Transpose::kConjugateTranspose: + return CUBLAS_OP_C; + default: + LOG(FATAL) << "Invalid value of blas::Transpose."; + } +} + +cublasFillMode_t CUDABlasUpperLower(blas::UpperLower uplo) { + switch (uplo) { + case blas::UpperLower::kUpper: + return CUBLAS_FILL_MODE_UPPER; + case blas::UpperLower::kLower: + return CUBLAS_FILL_MODE_LOWER; + default: + LOG(FATAL) << "Invalid value of blas::UpperLower."; + } +} + +cublasDiagType_t CUDABlasDiagonal(blas::Diagonal diag) { + switch (diag) { + case blas::Diagonal::kUnit: + return CUBLAS_DIAG_UNIT; + case blas::Diagonal::kNonUnit: + return CUBLAS_DIAG_NON_UNIT; + default: + LOG(FATAL) << "Invalid value of blas::Diagonal."; + } +} + +cublasSideMode_t CUDABlasSide(blas::Side side) { + switch (side) { + case blas::Side::kLeft: + return CUBLAS_SIDE_LEFT; + case blas::Side::kRight: + return CUBLAS_SIDE_RIGHT; + default: + LOG(FATAL) << "Invalid value of blas::Side."; + } +} + +} // namespace + +template +bool CUDABlas::DoBlasInternal(FuncT cublas_func, Stream *stream, + bool pointer_mode_host, Args... args) { + mutex_lock lock{mu_}; + + CHECK(blas_ != nullptr); + if (!SetStream(stream)) { + return false; + } + + ScopedCublasPointerMode pointer_mode{parent_, blas_}; + if (!pointer_mode.Init(pointer_mode_host ? CUBLAS_POINTER_MODE_HOST + : CUBLAS_POINTER_MODE_DEVICE)) { + return false; + } + + cublasStatus_t ret = cublas_func(parent_, blas_, args...); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to run cuBLAS routine " << cublas_func.kName << ": " + << ToString(ret); + return false; + } + + return true; +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + DeviceMemory *result) { + return DoBlasInternal(dynload::cublasSasum, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + DeviceMemory *result) { + return DoBlasInternal(dynload::cublasDasum, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasScasum, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasDzasum, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha, + const DeviceMemory &x, int incx, + DeviceMemory *y, int incy) { + return DoBlasInternal(dynload::cublasSaxpy, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha, + const DeviceMemory &x, int incx, + DeviceMemory *y, int incy) { + return DoBlasInternal(dynload::cublasDaxpy, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, + std::complex alpha, + const DeviceMemory> &x, int incx, + DeviceMemory> *y, int incy) { + return DoBlasInternal(dynload::cublasCaxpy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, + std::complex alpha, + const DeviceMemory> &x, int incx, + DeviceMemory> *y, int incy) { + return DoBlasInternal(dynload::cublasZaxpy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + DeviceMemory *y, int incy) { + return DoBlasInternal(dynload::cublasScopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + DeviceMemory *y, int incy) { + return DoBlasInternal(dynload::cublasDcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + DeviceMemory> *y, int incy) { + return DoBlasInternal(dynload::cublasCcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + DeviceMemory> *y, int incy) { + return DoBlasInternal(dynload::cublasZcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + const DeviceMemory &y, int incy, + DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasSdot, stream, false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + const DeviceMemory &y, int incy, + DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasDdot, stream, false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *result) { + return DoBlasInternal( + dynload::cublasCdotc, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *result) { + return DoBlasInternal( + dynload::cublasZdotc, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *result) { + return DoBlasInternal( + dynload::cublasCdotu, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *result) { + return DoBlasInternal( + dynload::cublasZdotu, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + DeviceMemory *result) { + return DoBlasInternal(dynload::cublasSnrm2, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + DeviceMemory *result) { + return DoBlasInternal(dynload::cublasDnrm2, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasScnrm2, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasDznrm2, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory *x, int incx, + DeviceMemory *y, int incy, float c, float s) { + return DoBlasInternal( + dynload::cublasSrot, stream, true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory *x, int incx, + DeviceMemory *y, int incy, double c, + double s) { + return DoBlasInternal( + dynload::cublasDrot, stream, true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory> *x, int incx, + DeviceMemory> *y, int incy, + float c, float s) { + return DoBlasInternal(dynload::cublasCsrot, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory> *x, int incx, + DeviceMemory> *y, int incy, + double c, double s) { + return DoBlasInternal(dynload::cublasZdrot, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory *a, + DeviceMemory *b, DeviceMemory *c, + DeviceMemory *s) { + return DoBlasInternal(dynload::cublasSrotg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(a), + CUDAMemoryMutable(b), CUDAMemoryMutable(c), + CUDAMemoryMutable(s)); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory *a, + DeviceMemory *b, DeviceMemory *c, + DeviceMemory *s) { + return DoBlasInternal(dynload::cublasDrotg, stream, + false /* = pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAMemoryMutable(b), + CUDAMemoryMutable(c), CUDAMemoryMutable(s)); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory> *a, + DeviceMemory> *b, + DeviceMemory *c, + DeviceMemory> *s) { + return DoBlasInternal( + dynload::cublasCrotg, stream, false /* = pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)), + CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s))); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory> *a, + DeviceMemory> *b, + DeviceMemory *c, + DeviceMemory> *s) { + return DoBlasInternal( + dynload::cublasZrotg, stream, false /* = pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)), + CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s))); +} + +bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count, + DeviceMemory *x, int incx, + DeviceMemory *y, int incy, + const DeviceMemory ¶m) { + return DoBlasInternal(dynload::cublasSrotm, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, + CUDAMemory(param)); +} + +bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count, + DeviceMemory *x, int incx, + DeviceMemory *y, int incy, + const DeviceMemory ¶m) { + return DoBlasInternal(dynload::cublasDrotm, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, + CUDAMemory(param)); +} + +bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory *d1, + DeviceMemory *d2, DeviceMemory *x1, + const DeviceMemory &y1, + DeviceMemory *param) { + return DoBlasInternal(dynload::cublasSrotmg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(d1), + CUDAMemoryMutable(d2), CUDAMemoryMutable(x1), + CUDAMemory(y1), CUDAMemoryMutable(param)); +} + +bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory *d1, + DeviceMemory *d2, DeviceMemory *x1, + const DeviceMemory &y1, + DeviceMemory *param) { + return DoBlasInternal(dynload::cublasDrotmg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(d1), + CUDAMemoryMutable(d2), CUDAMemoryMutable(x1), + CUDAMemory(y1), CUDAMemoryMutable(param)); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha, + DeviceMemory *x, int incx) { + return DoBlasInternal(dynload::cublasSscal, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha, + DeviceMemory *x, int incx) { + return DoBlasInternal(dynload::cublasDscal, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha, + DeviceMemory> *x, int incx) { + return DoBlasInternal( + dynload::cublasCsscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha, + DeviceMemory> *x, int incx) { + return DoBlasInternal( + dynload::cublasZdscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, + std::complex alpha, + DeviceMemory> *x, int incx) { + return DoBlasInternal( + dynload::cublasCscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, + std::complex alpha, + DeviceMemory> *x, int incx) { + return DoBlasInternal( + dynload::cublasZscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory *x, int incx, + DeviceMemory *y, int incy) { + return DoBlasInternal(dynload::cublasSswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory *x, int incx, + DeviceMemory *y, int incy) { + return DoBlasInternal(dynload::cublasDswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory> *x, int incx, + DeviceMemory> *y, int incy) { + return DoBlasInternal(dynload::cublasCswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory> *x, int incx, + DeviceMemory> *y, int incy) { + return DoBlasInternal(dynload::cublasZswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + DeviceMemory *result) { + return DoBlasInternal(dynload::cublasIsamax, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + DeviceMemory *result) { + return DoBlasInternal(dynload::cublasIdamax, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasIcamax, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, + int incx, DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasIzamax, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasIsamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory &x, int incx, + DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasIdamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, int incx, + DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasIcamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory> &x, + int incx, DeviceMemory *result) { + return DoBlasInternal( + dynload::cublasIzamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, float alpha, + const DeviceMemory &a, int lda, + const DeviceMemory &x, int incx, float beta, + DeviceMemory *y, int incy) { + return DoBlasInternal( + dynload::cublasSgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, double alpha, + const DeviceMemory &a, int lda, + const DeviceMemory &x, int incx, double beta, + DeviceMemory *y, int incy) { + return DoBlasInternal( + dynload::cublasDgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &x, int incx, + std::complex beta, + DeviceMemory> *y, int incy) { + return DoBlasInternal( + dynload::cublasCgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &x, int incx, + std::complex beta, + DeviceMemory> *y, int incy) { + return DoBlasInternal( + dynload::cublasZgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, float alpha, const DeviceMemory &a, + int lda, const DeviceMemory &x, int incx, + float beta, DeviceMemory *y, int incy) { + return DoBlasInternal( + dynload::cublasSgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, double alpha, const DeviceMemory &a, + int lda, const DeviceMemory &x, int incx, + double beta, DeviceMemory *y, int incy) { + return DoBlasInternal( + dynload::cublasDgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &x, int incx, + std::complex beta, + DeviceMemory> *y, int incy) { + return DoBlasInternal( + dynload::cublasCgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &x, int incx, + std::complex beta, + DeviceMemory> *y, int incy) { + return DoBlasInternal( + dynload::cublasZgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha, + const DeviceMemory &x, int incx, + const DeviceMemory &y, int incy, + DeviceMemory *a, int lda) { + return DoBlasInternal( + dynload::cublasSger, stream, true /* = pointer_mode_host */, m, n, &alpha, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha, + const DeviceMemory &x, int incx, + const DeviceMemory &y, int incy, + DeviceMemory *a, int lda) { + return DoBlasInternal( + dynload::cublasDger, stream, true /* = pointer_mode_host */, m, n, &alpha, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *a, int lda) { + return DoBlasInternal( + dynload::cublasCgerc, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *a, int lda) { + return DoBlasInternal( + dynload::cublasZgerc, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *a, int lda) { + return DoBlasInternal( + dynload::cublasCgeru, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *a, int lda) { + return DoBlasInternal( + dynload::cublasZgeru, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &x, int incx, + std::complex beta, + DeviceMemory> *y, int incy) { + return DoBlasInternal( + dynload::cublasChbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &x, int incx, + std::complex beta, + DeviceMemory> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &x, int incx, + std::complex beta, + DeviceMemory> *y, int incy) { + return DoBlasInternal( + dynload::cublasChemv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &x, int incx, + std::complex beta, + DeviceMemory> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhemv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, + const DeviceMemory> &x, int incx, + DeviceMemory> *a, int lda) { + return DoBlasInternal( + dynload::cublasCher, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, + const DeviceMemory> &x, int incx, + DeviceMemory> *a, int lda) { + return DoBlasInternal( + dynload::cublasZher, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex alpha, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *a, int lda) { + return DoBlasInternal( + dynload::cublasCher2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex alpha, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *a, int lda) { + return DoBlasInternal( + dynload::cublasZher2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex alpha, + const DeviceMemory> &ap, + const DeviceMemory> &x, int incx, + std::complex beta, + DeviceMemory> *y, int incy) { + return DoBlasInternal( + dynload::cublasChpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex alpha, + const DeviceMemory> &ap, + const DeviceMemory> &x, int incx, + std::complex beta, + DeviceMemory> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, + const DeviceMemory> &x, int incx, + DeviceMemory> *ap) { + return DoBlasInternal( + dynload::cublasChpr, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, + const DeviceMemory> &x, int incx, + DeviceMemory> *ap) { + return DoBlasInternal( + dynload::cublasZhpr, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex alpha, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *ap) { + return DoBlasInternal( + dynload::cublasChpr2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex alpha, + const DeviceMemory> &x, int incx, + const DeviceMemory> &y, int incy, + DeviceMemory> *ap) { + return DoBlasInternal( + dynload::cublasZhpr2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, float alpha, const DeviceMemory &a, + int lda, const DeviceMemory &x, int incx, + float beta, DeviceMemory *y, int incy) { + return DoBlasInternal( + dynload::cublasSsbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, double alpha, const DeviceMemory &a, + int lda, const DeviceMemory &x, int incx, + double beta, DeviceMemory *y, int incy) { + return DoBlasInternal( + dynload::cublasDsbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory &ap, + const DeviceMemory &x, int incx, float beta, + DeviceMemory *y, int incy) { + return DoBlasInternal(dynload::cublasSspmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap), + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory &ap, + const DeviceMemory &x, int incx, double beta, + DeviceMemory *y, int incy) { + return DoBlasInternal(dynload::cublasDspmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap), + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory &x, int incx, + DeviceMemory *ap) { + return DoBlasInternal(dynload::cublasSspr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory &x, int incx, + DeviceMemory *ap) { + return DoBlasInternal(dynload::cublasDspr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory &x, int incx, + const DeviceMemory &y, int incy, + DeviceMemory *ap) { + return DoBlasInternal(dynload::cublasSspr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory &x, int incx, + const DeviceMemory &y, int incy, + DeviceMemory *ap) { + return DoBlasInternal(dynload::cublasDspr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory &a, int lda, + const DeviceMemory &x, int incx, float beta, + DeviceMemory *y, int incy) { + return DoBlasInternal(dynload::cublasSsymv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory &a, int lda, + const DeviceMemory &x, int incx, double beta, + DeviceMemory *y, int incy) { + return DoBlasInternal(dynload::cublasDsymv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory &x, int incx, + DeviceMemory *a, int lda) { + return DoBlasInternal(dynload::cublasSsyr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory &x, int incx, + DeviceMemory *a, int lda) { + return DoBlasInternal(dynload::cublasDsyr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory &x, int incx, + const DeviceMemory &y, int incy, + DeviceMemory *a, int lda) { + return DoBlasInternal(dynload::cublasSsyr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory &x, int incx, + const DeviceMemory &y, int incy, + DeviceMemory *a, int lda) { + return DoBlasInternal(dynload::cublasDsyr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory &a, int lda, + DeviceMemory *x, int incx) { + return DoBlasInternal(dynload::cublasStbmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory &a, int lda, + DeviceMemory *x, int incx) { + return DoBlasInternal(dynload::cublasDtbmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory> &a, + int lda, DeviceMemory> *x, + int incx) { + return DoBlasInternal( + dynload::cublasCtbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory> &a, + int lda, DeviceMemory> *x, + int incx) { + return DoBlasInternal( + dynload::cublasZtbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory &a, int lda, + DeviceMemory *x, int incx) { + return DoBlasInternal(dynload::cublasStbsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory &a, int lda, + DeviceMemory *x, int incx) { + return DoBlasInternal(dynload::cublasDtbsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory> &a, + int lda, DeviceMemory> *x, + int incx) { + return DoBlasInternal( + dynload::cublasCtbsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory> &a, + int lda, DeviceMemory> *x, + int incx) { + return DoBlasInternal( + dynload::cublasZtbsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory &ap, DeviceMemory *x, + int incx) { + return DoBlasInternal( + dynload::cublasStpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory &ap, + DeviceMemory *x, int incx) { + return DoBlasInternal( + dynload::cublasDtpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory> &ap, + DeviceMemory> *x, int incx) { + return DoBlasInternal(dynload::cublasCtpmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory> &ap, + DeviceMemory> *x, int incx) { + return DoBlasInternal(dynload::cublasZtpmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory &ap, DeviceMemory *x, + int incx) { + return DoBlasInternal( + dynload::cublasStpsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory &ap, + DeviceMemory *x, int incx) { + return DoBlasInternal( + dynload::cublasDtpsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory> &ap, + DeviceMemory> *x, int incx) { + return DoBlasInternal(dynload::cublasCtpsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory> &ap, + DeviceMemory> *x, int incx) { + return DoBlasInternal(dynload::cublasZtpsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory &a, int lda, + DeviceMemory *x, int incx) { + return DoBlasInternal(dynload::cublasStrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory &a, int lda, + DeviceMemory *x, int incx) { + return DoBlasInternal(dynload::cublasDtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory> &a, int lda, + DeviceMemory> *x, int incx) { + return DoBlasInternal(dynload::cublasCtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory> &a, int lda, + DeviceMemory> *x, int incx) { + return DoBlasInternal(dynload::cublasZtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory &a, int lda, + DeviceMemory *x, int incx) { + return DoBlasInternal(dynload::cublasStrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory &a, int lda, + DeviceMemory *x, int incx) { + return DoBlasInternal(dynload::cublasDtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory> &a, int lda, + DeviceMemory> *x, int incx) { + return DoBlasInternal(dynload::cublasCtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory> &a, int lda, + DeviceMemory> *x, int incx) { + return DoBlasInternal(dynload::cublasZtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + float alpha, const DeviceMemory &a, int lda, + const DeviceMemory &b, int ldb, float beta, + DeviceMemory *c, int ldc) { + VLOG(1) << port::Printf( + "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu " + "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f " + "c=%p ldc=%d", + static_cast(transa), static_cast(transb), m, n, k, alpha, + a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc); + if (transa == blas::Transpose::kNoTranspose) { + if (lda < static_cast(m)) { + LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); " + "precondition violation"; + } + } else { + if (lda < static_cast(k)) { + LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k + << ") (transpose case); precondition violation"; + } + } + if (transb == blas::Transpose::kNoTranspose) { + if (ldb < static_cast(k)) { + LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k + << ") (no transpose case); precondition violation"; + } + } else { + if (ldb < static_cast(n)) { + LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); " + "precondition violation"; + } + } + return DoBlasInternal( + dynload::cublasSgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + double alpha, const DeviceMemory &a, int lda, + const DeviceMemory &b, int ldb, double beta, + DeviceMemory *c, int ldc) { + return DoBlasInternal( + dynload::cublasDgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &b, int ldb, + std::complex beta, + DeviceMemory> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &b, int ldb, + std::complex beta, + DeviceMemory> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +template +port::Status CUDABlas::DoBlasGemmBatchedInternal( + FuncT cublas_func, Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha, + const port::ArraySlice *> &a_array, int lda, + const port::ArraySlice *> &b_array, int ldb, T beta, + const port::ArraySlice *> &c_array, int ldc, + int batch_count) { + std::vector a_ptr_vec, b_ptr_vec, c_ptr_vec; + for (int i = 0; i < batch_count; ++i) { + a_ptr_vec.push_back(static_cast(a_array[i]->opaque())); + b_ptr_vec.push_back(static_cast(b_array[i]->opaque())); + c_ptr_vec.push_back(static_cast(c_array[i]->opaque())); + } + + typedef typename CUDAComplexT::type CUDA_T; + SE_ASSIGN_OR_RETURN( + std::unique_ptr> a_ptr_array, + stream->AllocateTemporaryArray(batch_count)); + SE_ASSIGN_OR_RETURN( + std::unique_ptr> b_ptr_array, + stream->AllocateTemporaryArray(batch_count)); + SE_ASSIGN_OR_RETURN( + std::unique_ptr> c_ptr_array, + stream->AllocateTemporaryArray(batch_count)); + + if (!stream->ThenMemcpy(a_ptr_array->mutable_device_memory(), + a_ptr_vec.data(), batch_count * sizeof(T *)) + .ok() || + !stream->ThenMemcpy(b_ptr_array->mutable_device_memory(), + b_ptr_vec.data(), batch_count * sizeof(T *)) + .ok() || + !stream->ThenMemcpy(c_ptr_array->mutable_device_memory(), + c_ptr_vec.data(), batch_count * sizeof(T *)) + .ok()) { + return port::Status(port::error::INTERNAL, + "failed to copy memory from host to device in " + "CUDABlas::DoBlasGemmBatched"); + } + + bool ok = DoBlasInternal( + cublas_func, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), + const_cast(CUDAMemory(a_ptr_array->device_memory())), + lda, + const_cast(CUDAMemory(b_ptr_array->device_memory())), + ldb, CUDAComplex(&beta), + const_cast(CUDAMemory(c_ptr_array->device_memory())), ldc, + batch_count); + + if (ok) { + return port::Status::OK(); + } + return port::Status(port::error::INTERNAL, + "failed BLAS call, see log for details"); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, float alpha, + const port::ArraySlice *> &a_array, int lda, + const port::ArraySlice *> &b_array, int ldb, float beta, + const port::ArraySlice *> &c_array, int ldc, + int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, double alpha, + const port::ArraySlice *> &a_array, int lda, + const port::ArraySlice *> &b_array, int ldb, + double beta, const port::ArraySlice *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex alpha, + const port::ArraySlice> *> &a_array, + int lda, + const port::ArraySlice> *> &b_array, + int ldb, std::complex beta, + const port::ArraySlice> *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex alpha, + const port::ArraySlice> *> &a_array, + int lda, + const port::ArraySlice> *> &b_array, + int ldb, std::complex beta, + const port::ArraySlice> *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasZgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &b, int ldb, + std::complex beta, + DeviceMemory> *c, int ldc) { + return DoBlasInternal( + dynload::cublasChemm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &b, int ldb, + std::complex beta, + DeviceMemory> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZhemm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, + const DeviceMemory> &a, int lda, + float beta, DeviceMemory> *c, + int ldc) { + return DoBlasInternal(dynload::cublasCherk, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, + const DeviceMemory> &a, int lda, + double beta, DeviceMemory> *c, + int ldc) { + return DoBlasInternal(dynload::cublasZherk, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &b, int ldb, + float beta, DeviceMemory> *c, + int ldc) { + return DoBlasInternal(dynload::cublasCher2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, &beta, + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &b, int ldb, + double beta, DeviceMemory> *c, + int ldc) { + return DoBlasInternal(dynload::cublasZher2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, &beta, + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + float alpha, const DeviceMemory &a, int lda, + const DeviceMemory &b, int ldb, float beta, + DeviceMemory *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + double alpha, const DeviceMemory &a, int lda, + const DeviceMemory &b, int ldb, double beta, + DeviceMemory *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &b, int ldb, + std::complex beta, + DeviceMemory> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &b, int ldb, + std::complex beta, + DeviceMemory> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, const DeviceMemory &a, int lda, + float beta, DeviceMemory *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, const DeviceMemory &a, int lda, + double beta, DeviceMemory *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex alpha, + const DeviceMemory> &a, int lda, + std::complex beta, + DeviceMemory> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex alpha, + const DeviceMemory> &a, int lda, + std::complex beta, + DeviceMemory> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, const DeviceMemory &a, int lda, + const DeviceMemory &b, int ldb, float beta, + DeviceMemory *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsyr2k, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, const DeviceMemory &a, int lda, + const DeviceMemory &b, int ldb, double beta, + DeviceMemory *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsyr2k, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &b, int ldb, + std::complex beta, + DeviceMemory> *c, int ldc) { + return DoBlasInternal(dynload::cublasCsyr2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex alpha, + const DeviceMemory> &a, int lda, + const DeviceMemory> &b, int ldb, + std::complex beta, + DeviceMemory> *c, int ldc) { + return DoBlasInternal(dynload::cublasZsyr2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, float alpha, + const DeviceMemory &a, int lda, + DeviceMemory *b, int ldb) { + return DoBlasInternal( + dynload::cublasStrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda, + CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, double alpha, + const DeviceMemory &a, int lda, + DeviceMemory *b, int ldb) { + return DoBlasInternal( + dynload::cublasDtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda, + CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &a, int lda, + DeviceMemory> *b, int ldb) { + return DoBlasInternal( + dynload::cublasCtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb, + CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &a, int lda, + DeviceMemory> *b, int ldb) { + return DoBlasInternal( + dynload::cublasZtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb, + CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, float alpha, + const DeviceMemory &a, int lda, + DeviceMemory *b, int ldb) { + return DoBlasInternal(dynload::cublasStrsm, stream, + true /* = pointer_mode_host */, CUDABlasSide(side), + CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, double alpha, + const DeviceMemory &a, int lda, + DeviceMemory *b, int ldb) { + return DoBlasInternal(dynload::cublasDtrsm, stream, + true /* = pointer_mode_host */, CUDABlasSide(side), + CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &a, int lda, + DeviceMemory> *b, int ldb) { + return DoBlasInternal( + dynload::cublasCtrsm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex alpha, + const DeviceMemory> &a, int lda, + DeviceMemory> *b, int ldb) { + return DoBlasInternal( + dynload::cublasZtrsm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +} // namespace cuda + +namespace gpu = ::perftools::gputools; + +void initialize_cublas() { + gpu::port::Status status = + gpu::PluginRegistry::Instance() + ->RegisterFactory( + gpu::cuda::kCudaPlatformId, gpu::cuda::kCuBlasPlugin, "cuBLAS", + [](gpu::internal::StreamExecutorInterface + *parent) -> gpu::blas::BlasSupport * { + gpu::cuda::CUDAExecutor *cuda_executor = + dynamic_cast(parent); + if (cuda_executor == nullptr) { + LOG(ERROR) + << "Attempting to initialize an instance of the cuBLAS " + << "support library with a non-CUDA StreamExecutor"; + return nullptr; + } + + gpu::cuda::CUDABlas *blas = + new gpu::cuda::CUDABlas(cuda_executor); + if (!blas->Init()) { + // Note: Init() will log a more specific error. + delete blas; + return nullptr; + } + return blas; + }); + + if (!status.ok()) { + LOG(ERROR) << "Unable to register cuBLAS factory: " + << status.error_message(); + } + + // Prime the cuBLAS DSO. The loader will log more information. + auto statusor = gpu::internal::CachedDsoLoader::GetCublasDsoHandle(); + if (!statusor.ok()) { + LOG(INFO) << "Unable to load cuBLAS DSO."; + } + + gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId, + gpu::PluginKind::kBlas, + gpu::cuda::kCuBlasPlugin); +} + +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER(register_cublas, + { perftools::gputools::initialize_cublas(); }); -- cgit v1.2.3