Diffstat (limited to 'tensorflow/stream_executor/cuda')
27 files changed, 9108 insertions, 0 deletions
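Note: nearly everything added below follows one pattern. Each cuBLAS entry point is resolved lazily out of the cuBLAS DSO with dlsym (see the PERFTOOLS_GPUTOOLS_CUBLAS_WRAP shim near the top of cuda_blas.cc), and every wrapped call activates the owning executor's CUDA context before forwarding its arguments. A minimal, self-contained sketch of that lazy-resolution idea follows; the names here (LoadSymbolOnce, the libcublas.so path in the usage comment) are hypothetical and none of the StreamExecutor plumbing is shown.

  #include <dlfcn.h>
  #include <cstdio>

  // Resolve `symbol_name` from `dso_name` once and cache the result; the real
  // shim caches the DSO handle via CachedDsoLoader and CHECK-fails on error.
  template <typename FuncT>
  FuncT LoadSymbolOnce(const char *dso_name, const char *symbol_name) {
    static void *dso = dlopen(dso_name, RTLD_LAZY | RTLD_LOCAL);  // cached once
    static void *sym = dso != nullptr ? dlsym(dso, symbol_name) : nullptr;
    if (sym == nullptr) {
      std::fprintf(stderr, "could not resolve %s: %s\n", symbol_name, dlerror());
    }
    return reinterpret_cast<FuncT>(sym);
  }

  // Hypothetical usage:
  //   auto create = LoadSymbolOnce<cublasStatus_t (*)(cublasHandle_t *)>(
  //       "libcublas.so", "cublasCreate_v2");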
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.cc b/tensorflow/stream_executor/cuda/cuda_activation.cc
new file mode 100644
index 0000000000..32d2c0d424
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_activation.cc
@@ -0,0 +1,30 @@
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+CUcontext ExtractCudaContext(CUDAExecutor *cuda_exec);
+CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec);
+
+ScopedActivateExecutorContext::ScopedActivateExecutorContext(
+    CUDAExecutor *cuda_exec, MultiOpActivation moa)
+    : cuda_exec_(cuda_exec),
+      driver_scoped_activate_context_(
+          new ScopedActivateContext{ExtractCudaContext(cuda_exec), moa}) {}
+
+ScopedActivateExecutorContext::ScopedActivateExecutorContext(
+    StreamExecutor *stream_exec, MultiOpActivation moa)
+    : ScopedActivateExecutorContext(ExtractCudaExecutor(stream_exec), moa) {}
+
+ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
+  delete static_cast<ScopedActivateContext *>(driver_scoped_activate_context_);
+}
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
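The header that follows (cuda_activation.h) declares the RAII guard defined above: constructing a ScopedActivateExecutorContext makes the executor's CUDA context current, and its destructor tears the scoped activation back down. The cuBLAS wrappers later in this diff build one around every forwarded call. A hedged usage sketch, with a hypothetical call site that is not part of this commit:

  #include "tensorflow/stream_executor/cuda/cuda_activation.h"

  // Hypothetical call site: make stream_exec's CUDA context current for the
  // duration of this scope. The StreamExecutor* overload fails fatally if the
  // executor is not CUDA-backed (per the header comment below).
  void DoDriverWork(perftools::gputools::StreamExecutor *stream_exec) {
    perftools::gputools::cuda::ScopedActivateExecutorContext activation{
        stream_exec};
    // ... CUDA driver or cuBLAS work that needs the context to be current ...
  }  // the scoped activation is undone when `activation` is destroyed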
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.h b/tensorflow/stream_executor/cuda/cuda_activation.h
new file mode 100644
index 0000000000..4181d13d0a
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_activation.h
@@ -0,0 +1,53 @@
+// This file contains APIs that assume a StreamExecutor is backed by CUDA.
+// It reaches into the CUDA implementation to activate an underlying CUDA
+// context.
+//
+// Having this file separate from cuda_gpu_executor.h means that dependent
+// code does not also have to depend on cuda.h.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
+
+#include "tensorflow/stream_executor/cuda/multi_op_activation.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+
+class StreamExecutor;
+
+namespace cuda {
+
+class CUDAExecutor;
+class ScopedActivateContext;
+
+// Activates a CUDA context within an enclosing scope.
+class ScopedActivateExecutorContext {
+ public:
+  // Form that takes a CUDA executor implementation.
+  explicit ScopedActivateExecutorContext(
+      CUDAExecutor* cuda_exec, MultiOpActivation moa = MultiOpActivation::kNo);
+
+  // Form that takes a pImpl executor and extracts a CUDA implementation --
+  // fatal failure if it is not CUDA inside.
+  explicit ScopedActivateExecutorContext(
+      StreamExecutor* stream_exec,
+      MultiOpActivation moa = MultiOpActivation::kNo);
+
+  ~ScopedActivateExecutorContext();
+
+ private:
+  // The CUDA executor implementation whose context is activated.
+  CUDAExecutor* cuda_exec_;
+
+  // The cuda.h-using datatype that we wrap.
+  ScopedActivateContext* driver_scoped_activate_context_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
new file mode 100644
index 0000000000..ef1036bca3
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -0,0 +1,2184 @@
+#include "tensorflow/stream_executor/cuda/cuda_blas.h"
+
+#include <dlfcn.h>
+
+#include <complex>
+
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
+#include "tensorflow/stream_executor/cuda/cuda_platform.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/status_macros.h"
+#include "tensorflow/stream_executor/lib/strcat.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "third_party/gpus/cuda/include/cublas_v2.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
+
+namespace dynload {
+
+#define PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name) \
+  struct DynLoadShim__##__name { \
+    static const char *kName; \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \
+    static void *GetDsoHandle() { \
+      static auto status = internal::CachedDsoLoader::GetCublasDsoHandle(); \
+      return status.ValueOrDie(); \
+    } \
+    static FuncPointerT DynLoad() { \
+      static void *f = dlsym(GetDsoHandle(), kName); \
+      CHECK(f != nullptr) << "could not find " << kName \
+                          << " in cuBLAS DSO; dlerror: " << dlerror(); \
+      return reinterpret_cast<FuncPointerT>(f); \
+    } \
+    template <typename... Args> \
+    cublasStatus_t operator()(CUDAExecutor * parent, Args...
args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +#define PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(__name) \ + PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name) + +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSnrm2) \ + __macro(cublasDnrm2) \ + __macro(cublasScnrm2) \ + __macro(cublasDznrm2) \ + __macro(cublasSdot) \ + __macro(cublasDdot) \ + __macro(cublasCdotu) \ + __macro(cublasCdotc) \ + __macro(cublasZdotu) \ + __macro(cublasZdotc) \ + __macro(cublasSscal) \ + __macro(cublasDscal) \ + __macro(cublasCscal) \ + __macro(cublasCsscal) \ + __macro(cublasZscal) \ + __macro(cublasZdscal) \ + __macro(cublasSaxpy) \ + __macro(cublasDaxpy) \ + __macro(cublasCaxpy) \ + __macro(cublasZaxpy) \ + __macro(cublasScopy) \ + __macro(cublasDcopy) \ + __macro(cublasCcopy) \ + __macro(cublasZcopy) \ + __macro(cublasSswap) \ + __macro(cublasDswap) \ + __macro(cublasCswap) \ + __macro(cublasZswap) \ + __macro(cublasIsamax) \ + __macro(cublasIdamax) \ + __macro(cublasIcamax) \ + __macro(cublasIzamax) \ + __macro(cublasIsamin) \ + __macro(cublasIdamin) \ + __macro(cublasIcamin) \ + __macro(cublasIzamin) \ + __macro(cublasSasum) \ + __macro(cublasDasum) \ + __macro(cublasScasum) \ + __macro(cublasDzasum) \ + __macro(cublasSrot) \ + __macro(cublasDrot) \ + __macro(cublasCrot) \ + __macro(cublasCsrot) \ + __macro(cublasZrot) \ + __macro(cublasZdrot) \ + __macro(cublasSrotg) \ + __macro(cublasDrotg) \ + __macro(cublasCrotg) \ + __macro(cublasZrotg) \ + __macro(cublasSrotm) \ + __macro(cublasDrotm) \ + __macro(cublasSrotmg) \ + __macro(cublasDrotmg) \ + __macro(cublasSgemv) \ + __macro(cublasDgemv) \ + __macro(cublasCgemv) \ + __macro(cublasZgemv) \ + __macro(cublasSgbmv) \ + __macro(cublasDgbmv) \ + __macro(cublasCgbmv) \ + __macro(cublasZgbmv) \ + __macro(cublasStrmv) \ + __macro(cublasDtrmv) \ + __macro(cublasCtrmv) \ + __macro(cublasZtrmv) \ + __macro(cublasStbmv) \ + __macro(cublasDtbmv) \ + __macro(cublasCtbmv) \ + __macro(cublasZtbmv) \ + __macro(cublasStpmv) \ + __macro(cublasDtpmv) \ + __macro(cublasCtpmv) \ + __macro(cublasZtpmv) \ + __macro(cublasStrsv) \ + __macro(cublasDtrsv) \ + __macro(cublasCtrsv) \ + __macro(cublasZtrsv) \ + __macro(cublasStpsv) \ + __macro(cublasDtpsv) \ + __macro(cublasCtpsv) \ + __macro(cublasZtpsv) \ + __macro(cublasStbsv) \ + __macro(cublasDtbsv) \ + __macro(cublasCtbsv) \ + __macro(cublasZtbsv) \ + __macro(cublasSsymv) \ + __macro(cublasDsymv) \ + __macro(cublasCsymv) \ + __macro(cublasZsymv) \ + __macro(cublasChemv) \ + __macro(cublasZhemv) \ + __macro(cublasSsbmv) \ + __macro(cublasDsbmv) \ + __macro(cublasChbmv) \ + __macro(cublasZhbmv) \ + __macro(cublasSspmv) \ + __macro(cublasDspmv) \ + __macro(cublasChpmv) \ + __macro(cublasZhpmv) \ + __macro(cublasSger) \ + __macro(cublasDger) \ + __macro(cublasCgeru) \ + __macro(cublasCgerc) \ + __macro(cublasZgeru) \ + __macro(cublasZgerc) \ + __macro(cublasSsyr) \ + __macro(cublasDsyr) \ + __macro(cublasCsyr) \ + __macro(cublasZsyr) \ + __macro(cublasCher) \ + __macro(cublasZher) \ + __macro(cublasSspr) \ + __macro(cublasDspr) \ + __macro(cublasChpr) \ + __macro(cublasZhpr) \ + __macro(cublasSsyr2) \ + __macro(cublasDsyr2) \ + __macro(cublasCsyr2) \ + __macro(cublasZsyr2) \ + __macro(cublasCher2) \ + __macro(cublasZher2) \ + __macro(cublasSspr2) \ + __macro(cublasDspr2) \ + __macro(cublasChpr2) \ + __macro(cublasZhpr2) \ + __macro(cublasSgemm) \ + __macro(cublasDgemm) \ + __macro(cublasCgemm) \ + 
__macro(cublasZgemm) \ + __macro(cublasSsyrk) \ + __macro(cublasDsyrk) \ + __macro(cublasCsyrk) \ + __macro(cublasZsyrk) \ + __macro(cublasCherk) \ + __macro(cublasZherk) \ + __macro(cublasSsyr2k) \ + __macro(cublasDsyr2k) \ + __macro(cublasCsyr2k) \ + __macro(cublasZsyr2k) \ + __macro(cublasCher2k) \ + __macro(cublasZher2k) \ + __macro(cublasSsyrkx) \ + __macro(cublasDsyrkx) \ + __macro(cublasCsyrkx) \ + __macro(cublasZsyrkx) \ + __macro(cublasCherkx) \ + __macro(cublasZherkx) \ + __macro(cublasSsymm) \ + __macro(cublasDsymm) \ + __macro(cublasCsymm) \ + __macro(cublasZsymm) \ + __macro(cublasChemm) \ + __macro(cublasZhemm) \ + __macro(cublasStrsm) \ + __macro(cublasDtrsm) \ + __macro(cublasCtrsm) \ + __macro(cublasZtrsm) \ + __macro(cublasStrmm) \ + __macro(cublasDtrmm) \ + __macro(cublasCtrmm) \ + __macro(cublasZtrmm) \ + __macro(cublasSgeam) \ + __macro(cublasDgeam) \ + __macro(cublasCgeam) \ + __macro(cublasZgeam) \ + __macro(cublasSdgmm) \ + __macro(cublasDdgmm) \ + __macro(cublasCdgmm) \ + __macro(cublasZdgmm) + +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasCreate) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasDestroy) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetStream) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetPointerMode) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasGetPointerMode) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasDgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasCgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasZgemmBatched) +CUBLAS_BLAS_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP) + +} // namespace dynload + +static string ToString(cublasStatus_t status) { + switch (status) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + default: + return port::StrCat("<invalid cublas status: ", status, ">"); + } +} + +// cuBLAS has interfaces that permit pointers to be passed from either the host +// memory space or the device memory space; however, you must instruct it as to +// which address space those pointers are in with cublasSetPointerMode. +// +// This helper sets the cuBLAS pointer mode to a desired value for a cuBLAS call +// you are about to perform in a given scope. +// +// The prior cuBLAS pointer mode is retained and restored when this object goes +// out of scope. +class ScopedCublasPointerMode { + public: + // Note that, because the setting of the cublas pointer mode is fallible, + // construction of this scoped datatype must be paired with a call to + // Init(). + // + // Parameters: + // handle: The cublas library handle to act upon in setting the pointer mode. + explicit ScopedCublasPointerMode(CUDAExecutor *parent, cublasHandle_t handle) + : parent_(parent), handle_(handle), ok_(false) {} + + // Attempts the switch to the requested scoped pointer mode, new_mode. + // + // Note that when false is returned, an appropriate error has already been + // logged. 
+ bool Init(cublasPointerMode_t new_mode) { + cublasStatus_t ret = + dynload::cublasGetPointerMode_v2(parent_, handle_, &old_mode_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to get old cublas pointer mode: " << ToString(ret); + return ok_ = false; + } + + ret = dynload::cublasSetPointerMode_v2(parent_, handle_, new_mode); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set new cublas pointer mode: " << ToString(ret); + return ok_ = false; + } + + return ok_ = true; + } + + // Switches back to the prior pointer mode, if the switch operation was + // successful in the first place. + ~ScopedCublasPointerMode() { + if (ok_) { + cublasStatus_t ret = + dynload::cublasSetPointerMode_v2(parent_, handle_, old_mode_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set former cublas pointer mode: " + << ToString(ret); + } + } + } + + private: + CUDAExecutor *parent_; // Executor establishing this pointer mode for. + cublasHandle_t handle_; // Handle to the cuBLAS instance of interest. + cublasPointerMode_t old_mode_; // Prior cuBLAS pointer mode, to be restored. + bool ok_; // Whether the change was successful. +}; + +bool CUDABlas::Init() { + cublasStatus_t ret = dynload::cublasCreate_v2(parent_, &blas_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to create cublas handle: " << ToString(ret); + return false; + } + + return true; +} + +CUDABlas::CUDABlas(cuda::CUDAExecutor *parent) + : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {} + +CUDABlas::~CUDABlas() { + if (blas_ != nullptr) { + dynload::cublasDestroy_v2(parent_, blas_); + } +} + +bool CUDABlas::SetStream(Stream *stream) { + CHECK(stream != nullptr); + CHECK(AsCUDAStreamValue(stream) != nullptr); + CHECK(blas_ != nullptr); + cublasStatus_t ret = + dynload::cublasSetStream_v2(parent_, blas_, AsCUDAStreamValue(stream)); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cuBLAS calls: " << ToString(ret); + return false; + } + + return true; +} + +namespace { + +// Helper functions transforming blas arguments into cuBLAS arguments. + +cublasOperation_t CUDABlasTranspose(blas::Transpose trans) { + switch (trans) { + case blas::Transpose::kNoTranspose: + return CUBLAS_OP_N; + case blas::Transpose::kTranspose: + return CUBLAS_OP_T; + case blas::Transpose::kConjugateTranspose: + return CUBLAS_OP_C; + default: + LOG(FATAL) << "Invalid value of blas::Transpose."; + } +} + +cublasFillMode_t CUDABlasUpperLower(blas::UpperLower uplo) { + switch (uplo) { + case blas::UpperLower::kUpper: + return CUBLAS_FILL_MODE_UPPER; + case blas::UpperLower::kLower: + return CUBLAS_FILL_MODE_LOWER; + default: + LOG(FATAL) << "Invalid value of blas::UpperLower."; + } +} + +cublasDiagType_t CUDABlasDiagonal(blas::Diagonal diag) { + switch (diag) { + case blas::Diagonal::kUnit: + return CUBLAS_DIAG_UNIT; + case blas::Diagonal::kNonUnit: + return CUBLAS_DIAG_NON_UNIT; + default: + LOG(FATAL) << "Invalid value of blas::Diagonal."; + } +} + +cublasSideMode_t CUDABlasSide(blas::Side side) { + switch (side) { + case blas::Side::kLeft: + return CUBLAS_SIDE_LEFT; + case blas::Side::kRight: + return CUBLAS_SIDE_RIGHT; + default: + LOG(FATAL) << "Invalid value of blas::Side."; + } +} + +} // namespace + +template <typename FuncT, typename... Args> +bool CUDABlas::DoBlasInternal(FuncT cublas_func, Stream *stream, + bool pointer_mode_host, Args... 
args) { + mutex_lock lock{mu_}; + + CHECK(blas_ != nullptr); + if (!SetStream(stream)) { + return false; + } + + ScopedCublasPointerMode pointer_mode{parent_, blas_}; + if (!pointer_mode.Init(pointer_mode_host ? CUBLAS_POINTER_MODE_HOST + : CUBLAS_POINTER_MODE_DEVICE)) { + return false; + } + + cublasStatus_t ret = cublas_func(parent_, blas_, args...); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to run cuBLAS routine " << cublas_func.kName << ": " + << ToString(ret); + return false; + } + + return true; +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal(dynload::cublasSasum, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal(dynload::cublasDasum, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal( + dynload::cublasScasum, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal( + dynload::cublasDzasum, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSaxpy, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDaxpy, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal(dynload::cublasCaxpy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal(dynload::cublasZaxpy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasScopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool 
CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal(dynload::cublasCcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal(dynload::cublasZcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *result) { + return DoBlasInternal( + dynload::cublasSdot, stream, false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *result) { + return DoBlasInternal( + dynload::cublasDdot, stream, false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *result) { + return DoBlasInternal( + dynload::cublasCdotc, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *result) { + return DoBlasInternal( + dynload::cublasZdotc, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *result) { + return DoBlasInternal( + dynload::cublasCdotu, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *result) { + return DoBlasInternal( + dynload::cublasZdotu, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, 
+ const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal(dynload::cublasSnrm2, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal(dynload::cublasDnrm2, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal( + dynload::cublasScnrm2, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal( + dynload::cublasDznrm2, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy, float c, float s) { + return DoBlasInternal( + dynload::cublasSrot, stream, true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy, double c, + double s) { + return DoBlasInternal( + dynload::cublasDrot, stream, true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<float>> *x, int incx, + DeviceMemory<std::complex<float>> *y, int incy, + float c, float s) { + return DoBlasInternal(dynload::cublasCsrot, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<double>> *x, int incx, + DeviceMemory<std::complex<double>> *y, int incy, + double c, double s) { + return DoBlasInternal(dynload::cublasZdrot, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a, + DeviceMemory<float> *b, DeviceMemory<float> *c, + DeviceMemory<float> *s) { + return DoBlasInternal(dynload::cublasSrotg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(a), + CUDAMemoryMutable(b), CUDAMemoryMutable(c), + CUDAMemoryMutable(s)); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a, + DeviceMemory<double> *b, DeviceMemory<double> *c, + DeviceMemory<double> *s) { + return DoBlasInternal(dynload::cublasDrotg, stream, + false /* = pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAMemoryMutable(b), + CUDAMemoryMutable(c), CUDAMemoryMutable(s)); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a, + DeviceMemory<std::complex<float>> *b, + DeviceMemory<float> *c, + DeviceMemory<std::complex<float>> *s) { + return DoBlasInternal( + dynload::cublasCrotg, stream, false /* = 
pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)), + CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s))); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a, + DeviceMemory<std::complex<double>> *b, + DeviceMemory<double> *c, + DeviceMemory<std::complex<double>> *s) { + return DoBlasInternal( + dynload::cublasZrotg, stream, false /* = pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)), + CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s))); +} + +bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy, + const DeviceMemory<float> &param) { + return DoBlasInternal(dynload::cublasSrotm, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, + CUDAMemory(param)); +} + +bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy, + const DeviceMemory<double> &param) { + return DoBlasInternal(dynload::cublasDrotm, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, + CUDAMemory(param)); +} + +bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1, + DeviceMemory<float> *d2, DeviceMemory<float> *x1, + const DeviceMemory<float> &y1, + DeviceMemory<float> *param) { + return DoBlasInternal(dynload::cublasSrotmg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(d1), + CUDAMemoryMutable(d2), CUDAMemoryMutable(x1), + CUDAMemory(y1), CUDAMemoryMutable(param)); +} + +bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1, + DeviceMemory<double> *d2, DeviceMemory<double> *x1, + const DeviceMemory<double> &y1, + DeviceMemory<double> *param) { + return DoBlasInternal(dynload::cublasDrotmg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(d1), + CUDAMemoryMutable(d2), CUDAMemoryMutable(x1), + CUDAMemory(y1), CUDAMemoryMutable(param)); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasSscal, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDscal, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal( + dynload::cublasCsscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal( + dynload::cublasZdscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, + std::complex<float> alpha, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal( + dynload::cublasCscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool
CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, + std::complex<double> alpha, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal( + dynload::cublasZscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<float>> *x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal(dynload::cublasCswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<double>> *x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal(dynload::cublasZswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal(dynload::cublasIsamax, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal(dynload::cublasIdamax, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIcamax, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIzamax, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIsamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIdamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const 
DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIcamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIzamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal( + dynload::cublasSgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal( + dynload::cublasDgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasCgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, float alpha, const DeviceMemory<float> &a, + int lda, const DeviceMemory<float> &x, int incx, + float beta, DeviceMemory<float> *y, int incy) { + return DoBlasInternal( + dynload::cublasSgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) { + return DoBlasInternal( + dynload::cublasDgemv, stream, true /* = pointer_mode_host */, 
+ CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasCgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda) { + return DoBlasInternal( + dynload::cublasSger, stream, true /* = pointer_mode_host */, m, n, &alpha, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda) { + return DoBlasInternal( + dynload::cublasDger, stream, true /* = pointer_mode_host */, m, n, &alpha, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCgerc, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZgerc, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCgeru, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool 
CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZgeru, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasChbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasChemv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhemv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCher, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZher, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, 
&alpha, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCher2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZher2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &ap, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasChpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &ap, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *ap) { + return DoBlasInternal( + dynload::cublasChpr, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *ap) { + return DoBlasInternal( + dynload::cublasZhpr, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *ap) { + return DoBlasInternal( + dynload::cublasChpr2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, 
CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *ap) { + return DoBlasInternal( + dynload::cublasZhpr2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, float alpha, const DeviceMemory<float> &a, + int lda, const DeviceMemory<float> &x, int incx, + float beta, DeviceMemory<float> *y, int incy) { + return DoBlasInternal( + dynload::cublasSsbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) { + return DoBlasInternal( + dynload::cublasDsbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &ap, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSspmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap), + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &ap, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDspmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap), + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *ap) { + return DoBlasInternal(dynload::cublasSspr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *ap) { + return DoBlasInternal(dynload::cublasDspr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *ap) { + return DoBlasInternal(dynload::cublasSspr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, 
+ const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *ap) { + return DoBlasInternal(dynload::cublasDspr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSsymv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDsymv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *a, int lda) { + return DoBlasInternal(dynload::cublasSsyr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *a, int lda) { + return DoBlasInternal(dynload::cublasDsyr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda) { + return DoBlasInternal(dynload::cublasSsyr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda) { + return DoBlasInternal(dynload::cublasDsyr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStbmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDtbmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), 
incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasCtbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasZtbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStbsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDtbsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasCtbsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasZtbsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, DeviceMemory<float> *x, + int incx) { + return DoBlasInternal( + dynload::cublasStpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal( + dynload::cublasDtpmv, 
stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtpmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtpmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, DeviceMemory<float> *x, + int incx) { + return DoBlasInternal( + dynload::cublasStpsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal( + dynload::cublasDtpsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtpsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtpsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return 
DoBlasInternal(dynload::cublasDtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + VLOG(1) << port::Printf( + "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu " + "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f " + "c=%p ldc=%d", + static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha, + a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc); 
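+  // cuBLAS assumes column-major storage, so the leading dimension of A must be
+  // at least m when A is not transposed and at least k when it is (similarly,
+  // ldb must be at least k or at least n, respectively). The checks below only
+  // log a warning when these preconditions are violated; they do not fail the
+  // call.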
+ if (transa == blas::Transpose::kNoTranspose) { + if (lda < static_cast<int64>(m)) { + LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); " + "precondition violation"; + } + } else { + if (lda < static_cast<int64>(k)) { + LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k + << ") (transpose case); precondition violation"; + } + } + if (transb == blas::Transpose::kNoTranspose) { + if (ldb < static_cast<int64>(k)) { + LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k + << ") (no transpose case); precondition violation"; + } + } else { + if (ldb < static_cast<int64>(n)) { + LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); " + "precondition violation"; + } + } + return DoBlasInternal( + dynload::cublasSgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +template <typename T, typename FuncT> +port::Status CUDABlas::DoBlasGemmBatchedInternal( + FuncT cublas_func, Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha, + const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda, + const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta, + const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc, + int batch_count) { + std::vector<T *> a_ptr_vec, b_ptr_vec, c_ptr_vec; + for (int i = 0; i < batch_count; ++i) { + a_ptr_vec.push_back(static_cast<T *>(a_array[i]->opaque())); + b_ptr_vec.push_back(static_cast<T *>(b_array[i]->opaque())); + c_ptr_vec.push_back(static_cast<T *>(c_array[i]->opaque())); + } + + typedef typename 
CUDAComplexT<T>::type CUDA_T; + SE_ASSIGN_OR_RETURN( + std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_ptr_array, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + SE_ASSIGN_OR_RETURN( + std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_ptr_array, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + SE_ASSIGN_OR_RETURN( + std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_ptr_array, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + + if (!stream->ThenMemcpy(a_ptr_array->mutable_device_memory(), + a_ptr_vec.data(), batch_count * sizeof(T *)) + .ok() || + !stream->ThenMemcpy(b_ptr_array->mutable_device_memory(), + b_ptr_vec.data(), batch_count * sizeof(T *)) + .ok() || + !stream->ThenMemcpy(c_ptr_array->mutable_device_memory(), + c_ptr_vec.data(), batch_count * sizeof(T *)) + .ok()) { + return port::Status(port::error::INTERNAL, + "failed to copy memory from host to device in " + "CUDABlas::DoBlasGemmBatched"); + } + + bool ok = DoBlasInternal( + cublas_func, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), + const_cast<const CUDA_T **>(CUDAMemory(a_ptr_array->device_memory())), + lda, + const_cast<const CUDA_T **>(CUDAMemory(b_ptr_array->device_memory())), + ldb, CUDAComplex(&beta), + const_cast<CUDA_T **>(CUDAMemory(c_ptr_array->device_memory())), ldc, + batch_count); + + if (ok) { + return port::Status::OK(); + } + return port::Status(port::error::INTERNAL, + "failed BLAS call, see log for details"); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, float alpha, + const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda, + const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta, + const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc, + int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, double alpha, + const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda, + const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb, + double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex<float> alpha, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a_array, + int lda, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array, + int ldb, std::complex<float> beta, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex<double> alpha, + const 
port::ArraySlice<DeviceMemory<std::complex<double>> *> &a_array, + int lda, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array, + int ldb, std::complex<double> beta, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasZgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasChemm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZhemm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + float beta, DeviceMemory<std::complex<float>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasCherk, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + double beta, DeviceMemory<std::complex<double>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasZherk, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + float beta, DeviceMemory<std::complex<float>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasCher2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, &beta, + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int 
lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + double beta, DeviceMemory<std::complex<double>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasZher2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, &beta, + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + float beta, DeviceMemory<float> *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + double beta, DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream 
*stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsyr2k, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsyr2k, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal(dynload::cublasCsyr2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal(dynload::cublasZsyr2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + 
DeviceMemory<float> *b, int ldb) { + return DoBlasInternal( + dynload::cublasStrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda, + CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *b, int ldb) { + return DoBlasInternal( + dynload::cublasDtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda, + CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasCtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb, + CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasZtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb, + CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *b, int ldb) { + return DoBlasInternal(dynload::cublasStrsm, stream, + true /* = pointer_mode_host */, CUDABlasSide(side), + CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *b, int ldb) { + return DoBlasInternal(dynload::cublasDtrsm, stream, + true /* = pointer_mode_host */, CUDABlasSide(side), + CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasCtrsm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), 
CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasZtrsm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +} // namespace cuda + +namespace gpu = ::perftools::gputools; + +void initialize_cublas() { + gpu::port::Status status = + gpu::PluginRegistry::Instance() + ->RegisterFactory<gpu::PluginRegistry::BlasFactory>( + gpu::cuda::kCudaPlatformId, gpu::cuda::kCuBlasPlugin, "cuBLAS", + [](gpu::internal::StreamExecutorInterface + *parent) -> gpu::blas::BlasSupport * { + gpu::cuda::CUDAExecutor *cuda_executor = + dynamic_cast<gpu::cuda::CUDAExecutor *>(parent); + if (cuda_executor == nullptr) { + LOG(ERROR) + << "Attempting to initialize an instance of the cuBLAS " + << "support library with a non-CUDA StreamExecutor"; + return nullptr; + } + + gpu::cuda::CUDABlas *blas = + new gpu::cuda::CUDABlas(cuda_executor); + if (!blas->Init()) { + // Note: Init() will log a more specific error. + delete blas; + return nullptr; + } + return blas; + }); + + if (!status.ok()) { + LOG(ERROR) << "Unable to register cuBLAS factory: " + << status.error_message(); + } + + // Prime the cuBLAS DSO. The loader will log more information. + auto statusor = gpu::internal::CachedDsoLoader::GetCublasDsoHandle(); + if (!statusor.ok()) { + LOG(INFO) << "Unable to load cuBLAS DSO."; + } + + gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId, + gpu::PluginKind::kBlas, + gpu::cuda::kCuBlasPlugin); +} + +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER(register_cublas, + { perftools::gputools::initialize_cublas(); }); diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h new file mode 100644 index 0000000000..1dfec2ebc5 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -0,0 +1,100 @@ +// CUDA-specific support for BLAS functionality -- this wraps the cuBLAS library +// capabilities, and is only included into CUDA implementation code -- it will +// not introduce cuda headers into other code. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_ + +#include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/lib/stringpiece.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/plugin_registry.h" + +typedef struct cublasContext *cublasHandle_t; + +namespace perftools { +namespace gputools { + +class Stream; + +namespace cuda { + +// Opaque and unique identifier for the cuBLAS plugin. +extern const PluginId kCuBlasPlugin; + +class CUDAExecutor; + +// BLAS plugin for CUDA platform via cuBLAS library. +// +// This satisfies the platform-agnostic BlasSupport interface. 
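+//
+// Callers do not normally instantiate this class directly: the plugin registry
+// constructs one per CUDA StreamExecutor (see initialize_cublas() in
+// cuda_blas.cc), and BLAS work typically reaches it through a Stream, e.g.
+// (illustrative only):
+//
+//   stream.ThenBlasGemm(blas::Transpose::kNoTranspose,
+//                       blas::Transpose::kNoTranspose, m, n, k, alpha, a, lda,
+//                       b, ldb, beta, &c, ldc);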
+// +// Note that the cuBLAS handle that this encapsulates is implicitly tied to the +// context (and, as a result, the device) that the parent CUDAExecutor is tied +// to. This simply happens as an artifact of creating the cuBLAS handle when a +// CUDA context is active. +// +// Thread-safe post-initialization. +class CUDABlas : public blas::BlasSupport { + public: + explicit CUDABlas(CUDAExecutor *parent); + + // Allocates a cuBLAS handle. + bool Init(); + + // Releases the cuBLAS handle, if present. + ~CUDABlas() override; + + TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES + + private: + // Tells cuBLAS to enqueue the BLAS operation onto a particular Stream. + // + // cuBLAS is stateful, and can only be associated with one stream (in order to + // enqueue dispatch) at a given time. As a result, this generally must be + // invoked before calling into cuBLAS. + bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // A helper function that calls the real cuBLAS function together with error + // handling. + // + // cublas_func: cuBLAS function pointer. + // cublas_name: cuBLAS function name. + // stream: Stream to enqueue the BLAS operation onto. + // pointer_mode_host: Indicates whether the pointer to a scalar value is from + // host (true) or device (false). + // args: Arguments of cuBLAS function. + template <typename FuncT, typename... Args> + bool DoBlasInternal(FuncT cublas_func, Stream *stream, bool pointer_mode_host, + Args... args); + + // A helper function to implement DoBlasGemmBatched interfaces for generic + // types. + template <typename T, typename FuncT> + port::Status DoBlasGemmBatchedInternal( + FuncT cublas_func, Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha, + const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda, + const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta, + const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc, + int batch_count); + + // Mutex that guards the cuBLAS handle for this device. + mutex mu_; + + // CUDAExecutor which instantiated this CUDABlas. + // Immutable post-initialization. + CUDAExecutor *parent_; + + // cuBLAS library handle on the device. 
+ cublasHandle_t blas_ GUARDED_BY(mu_); + + SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc new file mode 100644 index 0000000000..c01c9978a1 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc @@ -0,0 +1,260 @@ +#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" + +#include <dirent.h> +#include <limits.h> +#include <link.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/sysmacros.h> +#include <unistd.h> +#include <algorithm> +#include <memory> +#include <vector> + +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/inlined_vector.h" +#include "tensorflow/stream_executor/lib/numbers.h" +#include "tensorflow/stream_executor/lib/process_state.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/str_util.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/stringpiece.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/platform/logging.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +static const char *kDriverVersionPath = "/proc/driver/nvidia/version"; + +string DriverVersionToString(DriverVersion version) { + return port::Printf("%d.%d", std::get<0>(version), std::get<1>(version)); +} + +string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) { + if (!version.ok()) { + return version.status().ToString(); + } + + return DriverVersionToString(version.ValueOrDie()); +} + +port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) { + std::vector<string> pieces = port::Split(value, '.'); + if (pieces.size() != 2) { + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("expected %%d.%%d form for driver version; got \"%s\"", + value.c_str())}; + } + + int major; + int minor; + if (!port::safe_strto32(pieces[0], &major)) { + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("could not parse major version number \"%s\" as an " + "integer from string \"%s\"", + pieces[0].c_str(), value.c_str())}; + } + if (!port::safe_strto32(pieces[1], &minor)) { + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("could not parse minor version number \"%s\" as an " + "integer from string \"%s\"", + pieces[1].c_str(), value.c_str())}; + } + + DriverVersion result{major, minor}; + VLOG(2) << "version string \"" << value << "\" made value " + << DriverVersionToString(result); + return result; +} + +// -- class Diagnostician + +string Diagnostician::GetDevNodePath(int dev_node_ordinal) { + return port::StrCat("/dev/nvidia", dev_node_ordinal); +} + +void Diagnostician::LogDiagnosticInformation() { + if (access(kDriverVersionPath, F_OK) != 0) { + LOG(INFO) << "kernel driver does not appear to be running on this host " + << "(" << port::Hostname() << "): " + << "/proc/driver/nvidia/version does not exist"; + return; + } + auto dev0_path = GetDevNodePath(0); + if (access(dev0_path.c_str(), F_OK) != 0) { + LOG(INFO) << "no NVIDIA GPU device is present: " << dev0_path + << " does not exist"; + return; + } + + LOG(INFO) << "retrieving CUDA diagnostic information for host: " + << port::Hostname(); + + 
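+  // Both the kernel driver and a device node are present, so log the driver
+  // and library versions to help diagnose libcuda/kernel module mismatches.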
+ LogDriverVersionInformation(); +} + +/* static */ void Diagnostician::LogDriverVersionInformation() { + LOG(INFO) << "hostname: " << port::Hostname(); + + if (VLOG_IS_ON(1)) { + const char *value = getenv("LD_LIBRARY_PATH"); + string library_path = value == nullptr ? "" : value; + VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\""; + + std::vector<string> pieces = port::Split(library_path, ':'); + for (auto piece : pieces) { + if (piece.empty()) { + continue; + } + DIR *dir = opendir(piece.c_str()); + if (dir == nullptr) { + VLOG(1) << "could not open \"" << piece << "\""; + continue; + } + while (dirent *entity = readdir(dir)) { + VLOG(1) << piece << " :: " << entity->d_name; + } + closedir(dir); + } + } + + port::StatusOr<DriverVersion> dso_version = FindDsoVersion(); + LOG(INFO) << "libcuda reported version is: " + << DriverVersionStatusToString(dso_version); + + port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion(); + LOG(INFO) << "kernel reported version is: " + << DriverVersionStatusToString(kernel_version); + if (kernel_version.ok() && dso_version.ok()) { + WarnOnDsoKernelMismatch(dso_version, kernel_version); + } +} + +// Iterates through loaded DSOs with DlIteratePhdrCallback to find the +// driver-interfacing DSO version number. Returns it as a string. +port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() { + port::StatusOr<DriverVersion> result{port::Status{ + port::error::NOT_FOUND, + "was unable to find libcuda.so DSO loaded into this program"}}; + + // Callback used when iterating through DSOs. Looks for the driver-interfacing + // DSO and yields its version number into the callback data, when found. + auto iterate_phdr = + [](struct dl_phdr_info *info, size_t size, void *data) -> int { + if (strstr(info->dlpi_name, "libcuda.so")) { + VLOG(1) << "found DLL info with name: " << info->dlpi_name; + char resolved_path[PATH_MAX] = {0}; + if (realpath(info->dlpi_name, resolved_path) == nullptr) { + return 0; + } + VLOG(1) << "found DLL info with resolved path: " << resolved_path; + const char *slash = rindex(resolved_path, '/'); + if (slash == nullptr) { + return 0; + } + const char *so_suffix = ".so."; + const char *dot = strstr(slash, so_suffix); + if (dot == nullptr) { + return 0; + } + string dso_version = dot + strlen(so_suffix); + // TODO(b/22689637): Eliminate the explicit namespace if possible. + auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64"); + auto result = static_cast<port::StatusOr<DriverVersion> *>(data); + *result = StringToDriverVersion(stripped_dso_version); + return 1; + } + return 0; + }; + + dl_iterate_phdr(iterate_phdr, &result); + + return result; +} + +port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion( + const string &driver_version_file_contents) { + static const char *kDriverFilePrelude = "Kernel Module "; + size_t offset = driver_version_file_contents.find(kDriverFilePrelude); + if (offset == string::npos) { + return port::Status{ + port::error::NOT_FOUND, + port::StrCat("could not find kernel module information in " + "driver version file contents: \"", + driver_version_file_contents, "\"")}; + } + + string version_and_rest = driver_version_file_contents.substr( + offset + strlen(kDriverFilePrelude), string::npos); + size_t space_index = version_and_rest.find(" "); + auto kernel_version = version_and_rest.substr(0, space_index); + // TODO(b/22689637): Eliminate the explicit namespace if possible. 
+ auto stripped_kernel_version = + port::StripSuffixString(kernel_version, ".ld64"); + return StringToDriverVersion(stripped_kernel_version); +} + +void Diagnostician::WarnOnDsoKernelMismatch( + port::StatusOr<DriverVersion> dso_version, + port::StatusOr<DriverVersion> kernel_version) { + if (kernel_version.ok() && dso_version.ok() && + dso_version.ValueOrDie() == kernel_version.ValueOrDie()) { + LOG(INFO) << "kernel version seems to match DSO: " + << DriverVersionToString(kernel_version.ValueOrDie()); + } else { + LOG(ERROR) << "kernel version " + << DriverVersionStatusToString(kernel_version) + << " does not match DSO version " + << DriverVersionStatusToString(dso_version) + << " -- cannot find working devices in this configuration"; + } +} + + +port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() { + FILE *driver_version_file = fopen(kDriverVersionPath, "r"); + if (driver_version_file == nullptr) { + return port::Status{ + port::error::PERMISSION_DENIED, + port::StrCat("could not open driver version path for reading: ", + kDriverVersionPath)}; + } + + static const int kContentsSize = 1024; + port::InlinedVector<char, 4> contents(kContentsSize); + size_t retcode = + fread(contents.begin(), 1, kContentsSize - 2, driver_version_file); + if (retcode < kContentsSize - 1) { + contents[retcode] = '\0'; + } + contents[kContentsSize - 1] = '\0'; + + if (retcode != 0) { + LOG(INFO) << "driver version file contents: \"\"\"" << contents.begin() + << "\"\"\""; + fclose(driver_version_file); + return FindKernelModuleVersion(string{contents.begin()}); + } + + auto status = + port::Status{port::error::INTERNAL, + port::StrCat("failed to read driver version file contents: ", + kDriverVersionPath, "; ferror: ", + ferror(driver_version_file))}; + fclose(driver_version_file); + return status; +} + + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.h b/tensorflow/stream_executor/cuda/cuda_diagnostics.h new file mode 100644 index 0000000000..005b3dc310 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.h @@ -0,0 +1,85 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_ + +#include <tuple> + +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// e.g. DriverVersion{331, 79} +using DriverVersion = std::tuple<int, int>; + +// Converts a parsed driver version to string form. +string DriverVersionToString(DriverVersion version); + +// Converts a parsed driver version or status value to natural string form. +string DriverVersionStatusToString(port::StatusOr<DriverVersion> version); + +// Converts a string of a form like "331.79" to a DriverVersion{331, 79}. +port::StatusOr<DriverVersion> StringToDriverVersion(const string &value); + +class Diagnostician { + public: + // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is + // not initializing). + // + // Note: if we're running on a machine that has no GPUs, we don't want to + // produce very much log spew beyond saying, "looks like there's no CUDA + // kernel + // module running". + // + // Note: we use non-Google-File:: API here because we may be called before + // InitGoogle has completed. 
+ static void LogDiagnosticInformation(); + + // Given the driver version file contents, finds the kernel module version and + // returns it as a DriverVersion. + // + // This is solely used for more informative log messages when the user is + // running on a machine that happens to have a libcuda/kernel driver mismatch. + static port::StatusOr<DriverVersion> FindKernelModuleVersion( + const string &driver_version_file_contents); + + // Extracts the kernel driver version from the current host. + static port::StatusOr<DriverVersion> FindKernelDriverVersion(); + + // Iterates through loaded DSOs with DlIteratePhdrCallback to find the + // driver-interfacing DSO version number. Returns it as a DriverVersion. + static port::StatusOr<DriverVersion> FindDsoVersion(); + + // Logs information about the kernel driver version and userspace driver + // library version. + static void LogDriverVersionInformation(); + + private: + // Logs information about the loaded nvidia-related kernel modules. + static void LogKernelModuleInformation(); + + // Given the DSO version number and the driver version file contents, extracts + // the driver version and compares, warning the user in the case of + // incompatibility. + // + // This is solely used for more informative log messages when the user is + // running on a machine that happens to have a libcuda/kernel driver mismatch. + static void WarnOnDsoKernelMismatch( + port::StatusOr<DriverVersion> dso_version, + port::StatusOr<DriverVersion> kernel_version); + + // Logs information about the dev nodes present on this machine: their + // existence, permissions, accessibility from this uid/gid. + static void LogDevNodeDiagnosticInformation(); + + static string GetDevNodePath(int dev_node_ordinal); + + SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc new file mode 100644 index 0000000000..6e4403512b --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -0,0 +1,1074 @@ +#include "tensorflow/stream_executor/cuda/cuda_dnn.h" + +#include <dlfcn.h> +#include <functional> + +#include "tensorflow/stream_executor/dnn.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/lib/env.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/threadpool.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/cuda/cuda_activation.h" +#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/cuda/cuda_platform.h" +#include "third_party/gpus/cuda/include/cudnn.h" + +namespace { + +// Converts (via narrowing) a WideT value to a NarrowT value, and checks that +// the value is unchanged by the conversion. 
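+// For example, CheckedNarrowing<int64, int>(batch_descriptor.count())
+// CHECK-fails if the batch count does not fit in an int.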
+template <typename WideT, typename NarrowT> +NarrowT CheckedNarrowing(const WideT& wide) { + NarrowT narrow = wide; + CHECK_EQ(narrow, wide) + << "checked narrowing failed; values not equal post-conversion"; + return narrow; +} + +} // namespace + +namespace perftools { +namespace gputools { + +using dnn::BatchDescriptor; +using dnn::FilterDescriptor; +using dnn::ConvolutionDescriptor; +using dnn::PoolingDescriptor; + +namespace cuda { + +PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin); + +extern CUstream AsCUDAStreamValue(Stream* stream); + +string ToString(cudnnStatus_t status) { + switch (status) { + case CUDNN_STATUS_SUCCESS: + return "CUDNN_STATUS_SUCCESS"; + case CUDNN_STATUS_NOT_INITIALIZED: + return "CUDNN_STATUS_NOT_INITIALIZED"; + case CUDNN_STATUS_ALLOC_FAILED: + return "CUDNN_STATUS_ALLOC_FAILED"; + case CUDNN_STATUS_BAD_PARAM: + return "CUDNN_STATUS_BAD_PARAM"; + case CUDNN_STATUS_INTERNAL_ERROR: + return "CUDNN_STATUS_INTERNAL_ERROR"; + case CUDNN_STATUS_INVALID_VALUE: + return "CUDNN_STATUS_INVALID_VALUE"; + case CUDNN_STATUS_ARCH_MISMATCH: + return "CUDNN_STATUS_ARCH_MISMATCH"; + case CUDNN_STATUS_MAPPING_ERROR: + return "CUDNN_STATUS_MAPPING_ERROR"; + case CUDNN_STATUS_EXECUTION_FAILED: + return "CUDNN_STATUS_EXECUTION_FAILED"; + case CUDNN_STATUS_NOT_SUPPORTED: + return "CUDNN_STATUS_NOT_SUPPORTED"; + case CUDNN_STATUS_LICENSE_ERROR: + return "CUDNN_STATUS_LICENSE_ERROR"; + default: + return port::StrCat("<unknown cudnn status: ", static_cast<int>(status), + ">"); + } +} + +namespace dynload { + +static port::ThreadPool* InitCudnnThreadpool() { + port::ThreadPool* cudnn_threadpool_; + port::ThreadOptions options; + // TBD(keveman): Conservatively setting the stack size and guard size to 2MB, + // until we can get some guarantees from NVIDIA on the minimum stack space + // they will work with. + options.stack_size = 2 * 1024 * 1024; + options.guard_size = 2 * 1024 * 1024; + cudnn_threadpool_ = new port::ThreadPool(port::Env::Default(), options, + "cudnn_threadpool", 1); + CHECK(cudnn_threadpool_); + return cudnn_threadpool_; +} + +static mutex cudnn_threadpool_mu(LINKER_INITIALIZED); +static port::ThreadPool* GetCudaThreadpool() { + mutex_lock lock(cudnn_threadpool_mu); + static port::ThreadPool* cudnn_threadpool = InitCudnnThreadpool(); + return cudnn_threadpool; +} + +#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name) \ + struct DynLoadShim__##__name { \ + static const char* kName; \ + typedef std::add_pointer<decltype(::__name)>::type FuncPointerT; \ + static void* GetDsoHandle() { \ + static auto result = internal::CachedDsoLoader::GetCudnnDsoHandle(); \ + return result.ValueOrDie(); \ + } \ + static FuncPointerT DynLoad() { \ + static void* f = dlsym(GetDsoHandle(), kName); \ + if (f == nullptr) { \ + LOG(FATAL) << "could not find " << kName \ + << " in cudnn DSO; dlerror: " << dlerror(); \ + } \ + return reinterpret_cast<FuncPointerT>(f); \ + } \ + template <typename... Args> \ + void CallWrapper(CUDAExecutor* parent, port::Notification* n, \ + cudnnStatus_t* retval, const Args&... args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + *retval = DynLoad()(args...); \ + n->Notify(); \ + } \ + template <typename... Args> \ + cudnnStatus_t operator()(CUDAExecutor* parent, Args... 
args) { \ + port::Notification n; \ + cudnnStatus_t retval; \ + auto call_func_closure = \ + std::bind(&DynLoadShim__##__name::CallWrapper<Args...>, this, \ + parent, &n, &retval, args...); \ + GetCudaThreadpool()->Schedule(call_func_closure); \ + n.WaitForNotification(); \ + return retval; \ + } \ + } __name; \ + const char* DynLoadShim__##__name::kName = #__name; + +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor) __macro( \ + cudnnGetConvolutionNdForwardOutputDim) \ + __macro(cudnnGetConvolutionForwardAlgorithm) __macro( \ + cudnnCreateTensorDescriptor) __macro(cudnnDestroyTensorDescriptor) \ + __macro(cudnnCreateFilterDescriptor) \ + __macro(cudnnSetFilter4dDescriptor) \ + __macro(cudnnSetPooling2dDescriptor) \ + __macro(cudnnDestroyFilterDescriptor) \ + __macro(cudnnCreateConvolutionDescriptor) \ + __macro(cudnnCreatePoolingDescriptor) \ + __macro(cudnnAddTensor) \ + __macro(cudnnDestroyPoolingDescriptor) + +CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH + +// clang-format off +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetConvolution2dDescriptor) \ + __macro(cudnnDestroyConvolutionDescriptor) \ + __macro(cudnnCreate) \ + __macro(cudnnDestroy) \ + __macro(cudnnSetStream) \ + __macro(cudnnActivationForward) \ + __macro(cudnnConvolutionForward) \ + __macro(cudnnConvolutionBackwardData) \ + __macro(cudnnConvolutionBackwardFilter) \ + __macro(cudnnGetConvolutionForwardWorkspaceSize) \ + __macro(cudnnTransformTensor) \ + __macro(cudnnPoolingForward) \ + __macro(cudnnPoolingBackward) +// clang-format on + +CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH + +} // namespace dynload + +namespace { + +cudnnHandle_t ToHandle(void* opaque_handle) { + return static_cast<cudnnHandle_t>(opaque_handle); +} + +} // namespace + +CudnnSupport::CudnnSupport(CUDAExecutor* parent) + : parent_(parent), dnn_handle_(nullptr) {} + +CudnnSupport::~CudnnSupport() { + auto status = dynload::cudnnDestroy(parent_, ToHandle(dnn_handle_)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn handle: " << ToString(status); + } +} + +port::Status CudnnSupport::Init() { + auto status = dynload::cudnnCreate( + parent_, reinterpret_cast<cudnnHandle_t*>(&dnn_handle_)); + if (status == CUDNN_STATUS_SUCCESS) { + return port::Status::OK(); + } + + LOG(ERROR) << "could not create cudnn handle: " << ToString(status); + if (status == CUDNN_STATUS_NOT_INITIALIZED) { + // This is the error code that the driver returns when we're not running a + // sufficient CUDA driver -- cudnn requires 6.5+ compatibility, which + // starts with the 340.XX driver series. + auto result = cuda::Diagnostician::FindKernelDriverVersion(); + if (!result.ok()) { + LOG(ERROR) << "error retrieving driver version: " + << DriverVersionStatusToString(result); + } else { + const auto& version = result.ValueOrDie(); + LOG(INFO) << "running driver version: " << DriverVersionToString(version); + if (std::get<0>(version) < 340) { + LOG(ERROR) + << "cudnn library is only supported on 340.XX+ driver versions"; + } + } + } + return port::Status{port::error::INTERNAL, + port::StrCat("cudnn library could not create a handle: ", + ToString(status))}; +} + +// Turns a BatchDescriptor structure into a cudnn tensor handle within a scope. 
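+// Illustrative use (see DoConvolve below):
+//
+//   ScopedTensorDescriptor input_4d{parent_, batch_descriptor, CUDNN_DATA_FLOAT};
+//   // ... input_4d.handle() is then passed to the cudnn call ...
+//
+// The wrapped descriptor is destroyed when the scope ends.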
+class ScopedTensorDescriptor { + public: + ScopedTensorDescriptor(CUDAExecutor* parent, + const BatchDescriptor& batch_descriptor, + cudnnDataType_t elem_type) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreateTensorDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn tensor descriptor: " + << ToString(status); + } + + cudnnTensorFormat_t format; + switch (batch_descriptor.layout()) { + case dnn::DataLayout::kBatchYXDepth: + format = CUDNN_TENSOR_NHWC; + break; + case dnn::DataLayout::kBatchDepthYX: + format = CUDNN_TENSOR_NCHW; + break; + default: + LOG(FATAL) << "Unsupported tensor format " + << DataLayoutString(batch_descriptor.layout()); + break; + } + + status = dynload::cudnnSetTensor4dDescriptor( + parent_, handle_, format, elem_type, + CheckedNarrowing<int64, int>(batch_descriptor.count()), + CheckedNarrowing<int64, int>(batch_descriptor.feature_map_count()), + CheckedNarrowing<int64, int>(batch_descriptor.height()), + CheckedNarrowing<int64, int>(batch_descriptor.width())); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn tensor descriptor: " + << ToString(status); + } + } + + ~ScopedTensorDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyTensorDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn tensor descriptor: " + << ToString(status); + } + } + + cudnnTensorDescriptor_t handle() const { return handle_; } + + private: + CUDAExecutor* parent_; // Parent executor. Not owned. + cudnnTensorDescriptor_t handle_; // Owned. + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedTensorDescriptor); +}; + +// Turns a FilterDescriptor structure into a cudnn filter handle within a scope. +class ScopedFilterDescriptor { + public: + ScopedFilterDescriptor(CUDAExecutor* parent, + const FilterDescriptor& filter_descriptor, + cudnnDataType_t elem_type) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreateFilterDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn filter descriptor: " + << ToString(status); + } + + // TODO(b/23032134): Even if the filter layout is not supported, + // cudnnSetFilter4DDescriptor will return CUDNN_STATUS_SUCCESS because it + // does not take layout as an input. Maybe force cuDNN by giving wrong + // inputs intentionally? + switch (filter_descriptor.layout()) { + case dnn::FilterLayout::kOutputInputYX: + break; + default: + LOG(FATAL) << "Unsupported filter format " + << FilterLayoutString(filter_descriptor.layout()); + break; + } + + status = dynload::cudnnSetFilter4dDescriptor( + parent_, handle_, elem_type, + CheckedNarrowing<int64, int>( + filter_descriptor.output_feature_map_count()), + CheckedNarrowing<int64, int>( + filter_descriptor.input_feature_map_count()), + CheckedNarrowing<int64, int>(filter_descriptor.input_filter_height()), + CheckedNarrowing<int64, int>(filter_descriptor.input_filter_width())); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn filter descriptor: " + << ToString(status); + } + } + + ~ScopedFilterDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyFilterDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn filter descriptor: " + << ToString(status); + } + } + + cudnnFilterDescriptor_t handle() const { return handle_; } + + private: + // Parent executor object. Not owned. 
+ CUDAExecutor* parent_; + + // cudnn filter descriptor this object creates. Owned. + cudnnFilterDescriptor_t handle_; + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedFilterDescriptor); +}; + +// Turns a ConvolutionDescriptor structure into a cudnn convolution handle +// within a scope. +class ScopedConvolutionDescriptor { + public: + ScopedConvolutionDescriptor( + CUDAExecutor* parent, const ConvolutionDescriptor& convolution_descriptor) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreateConvolutionDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn convolution descriptor: " + << ToString(status); + } + + status = dynload::cudnnSetConvolution2dDescriptor( + parent_, handle_, CheckedNarrowing<int64, int>( + convolution_descriptor.zero_padding_height()), + CheckedNarrowing<int64, int>( + convolution_descriptor.zero_padding_width()), + CheckedNarrowing<int64, int>( + convolution_descriptor.vertical_filter_stride()), + CheckedNarrowing<int64, int>( + convolution_descriptor.horizontal_filter_stride()), + // TODO(leary) not sure what the following two params do. + 1 /* = upscale_input_x */, 1 /* = upscale_input_y */, + // NOTE(keveman): cuDNN supports convolution and cross correlation. + // However, almost all the use cases do cross correlation, so just hard + // coding it here. + CUDNN_CROSS_CORRELATION); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn convolution descriptor: " + << ToString(status); + } + } + + ~ScopedConvolutionDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyConvolutionDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn convolution descriptor: " + << ToString(status); + } + } + + cudnnConvolutionDescriptor_t handle() const { return handle_; } + + private: + CUDAExecutor* parent_; // Parent executor. Not owned. + cudnnConvolutionDescriptor_t handle_; // Owned. + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); +}; + +// Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle +// within a scope. +class ScopedPoolingDescriptor { + public: + ScopedPoolingDescriptor(CUDAExecutor* parent, + const PoolingDescriptor& pooling_descriptor) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreatePoolingDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn pooling descriptor: " + << ToString(status); + } + status = dynload::cudnnSetPooling2dDescriptor( + parent_, handle_, + (pooling_descriptor.mode() == dnn::PoolingMode::kMaximum + ? 
CUDNN_POOLING_MAX + : CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING), + CheckedNarrowing<int64, int>(pooling_descriptor.window_height()), + CheckedNarrowing<int64, int>(pooling_descriptor.window_width()), + CheckedNarrowing<int64, int>(pooling_descriptor.vertical_padding()), + CheckedNarrowing<int64, int>(pooling_descriptor.horizontal_padding()), + CheckedNarrowing<int64, int>(pooling_descriptor.vertical_stride()), + CheckedNarrowing<int64, int>(pooling_descriptor.horizontal_stride())); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn pooling descriptor: " + << ToString(status); + } + } + ~ScopedPoolingDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyPoolingDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn pooling descriptor: " + << ToString(status); + } + } + + cudnnPoolingDescriptor_t handle() const { return handle_; } + + private: + CUDAExecutor* parent_; // Parent executor. Not owned. + cudnnPoolingDescriptor_t handle_; // Owned. + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +bool CudnnSupport::DoConvolve( + Stream* stream, const BatchDescriptor& batch_descriptor, + const DeviceMemory<float>& input_data, + const FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& output_descriptor, + DeviceMemory<float>* output_data) { + ScopedTensorDescriptor input_4d{parent_, batch_descriptor, CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor output_4d{parent_, output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT}; + ScopedConvolutionDescriptor conv{parent_, convolution_descriptor}; + + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status); + } + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. + float beta = 0.0; + + // The NO_WORKSPACE versions are possibly slower for certain shapes, but + // not so for the shapes currently used by Brain. Also, it seems prudent to + // keep cuMemAlloc off the critical path. 
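+  // For reference, a workspace-using variant (sketch only; not what is done +  // here) would request CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, query the scratch +  // size via cudnnGetConvolutionForwardWorkspaceSize(handle, input_4d, filter, +  // conv, output_4d, algo, &workspace_bytes), allocate that many bytes on the +  // device, and pass the pointer/size to cudnnConvolutionForward instead of +  // nullptr/0.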
+ cudnnConvolutionFwdAlgo_t algo; + status = dynload::cudnnGetConvolutionForwardAlgorithm( + parent_, ToHandle(dnn_handle_), input_4d.handle(), filter.handle(), + conv.handle(), output_4d.handle(), CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0, + &algo); + + CHECK_EQ(status, CUDNN_STATUS_SUCCESS) + << "Unable to find a suitable algorithm for doing forward convolution"; + + status = dynload::cudnnConvolutionForward( + parent_, ToHandle(dnn_handle_), &alpha, input_4d.handle(), + input_data.opaque(), filter.handle(), filter_data.opaque(), conv.handle(), + algo, nullptr /* workspace ptr */, 0 /* workspace size */, &beta, + output_4d.handle(), output_data->opaque()); + + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to enqueue convolution on stream: " + << ToString(status); + return false; + } + + return true; +} + +bool CudnnSupport::DoConvolve( + Stream* stream, const BatchDescriptor& batch_descriptor, + const DeviceMemory<double>& input_data, + const FilterDescriptor& filter_descriptor, + const DeviceMemory<double>& filter_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& output_descriptor, + DeviceMemory<double>* output_data) { + LOG(ERROR) << "double-based DNN not yet implemented"; + return false; +} + +DeviceMemory<float> CudnnSupport::MaybeTransformLayout( + Stream* stream, BatchDescriptor* output_descriptor, + DeviceMemory<float> backward_output_data, + std::unique_ptr<TemporaryDeviceMemory<float>>* transform_scratch) { + if (output_descriptor->layout() == dnn::DataLayout::kBatchDepthYX) { + return backward_output_data; + } + CHECK(output_descriptor->layout() == dnn::DataLayout::kBatchYXDepth); + *transform_scratch = + stream->AllocateTemporaryArray<float>(backward_output_data.ElementCount()) + .ConsumeValueOrDie(); + BatchDescriptor transformed_output_descriptor; + transformed_output_descriptor.CloneFrom(*output_descriptor); + transformed_output_descriptor.set_layout(dnn::DataLayout::kBatchDepthYX); + ScopedTensorDescriptor orig_out_back_4d{parent_, *output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor transformed_out_back_4d{ + parent_, transformed_output_descriptor, CUDNN_DATA_FLOAT}; + + float alpha = 1.0f; + float beta = 0.0f; + auto status = dynload::cudnnTransformTensor( + parent_, ToHandle(dnn_handle_), &alpha, orig_out_back_4d.handle(), + backward_output_data.opaque(), &beta, transformed_out_back_4d.handle(), + (*transform_scratch)->mutable_device_memory()->opaque()); + + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "Failed to transform the data layout."; + } + output_descriptor->set_layout(dnn::DataLayout::kBatchDepthYX); + return (*transform_scratch)->device_memory(); +} + +bool CudnnSupport::DoConvolveBackwardData( + Stream* stream, const FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const BatchDescriptor& output_descriptor_in, + DeviceMemory<float> backward_output_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& input_descriptor, + DeviceMemory<float>* backward_input_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status); + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. 
+ float beta = 0.0; + + // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass. + BatchDescriptor output_descriptor; + output_descriptor.CloneFrom(output_descriptor_in); + std::unique_ptr<TemporaryDeviceMemory<float>> transform_scratch; + backward_output_data = MaybeTransformLayout( + stream, &output_descriptor, backward_output_data, &transform_scratch); + + ScopedTensorDescriptor out_back_4d{parent_, output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor in_back_4d{parent_, input_descriptor, + CUDNN_DATA_FLOAT}; + ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT}; + ScopedConvolutionDescriptor conv{parent_, convolution_descriptor}; + + status = dynload::cudnnConvolutionBackwardData( + parent_, ToHandle(dnn_handle_), &alpha, filter.handle(), + filter_data.opaque(), out_back_4d.handle(), backward_output_data.opaque(), + conv.handle(), &beta, in_back_4d.handle(), backward_input_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to enqueue convolution on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoConvolveBackwardFilter( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_descriptor_in, + DeviceMemory<float> backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory<float>* backward_filter_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status); + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. + float beta = 0.0; + + // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass. + BatchDescriptor output_descriptor; + output_descriptor.CloneFrom(output_descriptor_in); + std::unique_ptr<TemporaryDeviceMemory<float>> transform_scratch; + backward_output_data = MaybeTransformLayout( + stream, &output_descriptor, backward_output_data, &transform_scratch); + + ScopedTensorDescriptor out_back_4d{parent_, output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor input_4d{parent_, input_descriptor, CUDNN_DATA_FLOAT}; + ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT}; + ScopedConvolutionDescriptor conv{parent_, convolution_descriptor}; + + status = dynload::cudnnConvolutionBackwardFilter( + parent_, ToHandle(dnn_handle_), &alpha, input_4d.handle(), + input_data.opaque(), out_back_4d.handle(), backward_output_data.opaque(), + conv.handle(), &beta, filter.handle(), backward_filter_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to enqueue convolution on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoMatMul(Stream* stream, + const DeviceMemory<float>& input_data, + const DeviceMemory<float>& weights, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + if (input_dimensions.count() != output_dimensions.count()) { + LOG(ERROR) << "MatMul input and output dimensions are not compatible."; + return false; + } + + // We do not permute the input or output, instead we just + // reinterpret the layout. 
We are working with row-major matrices + // and the rows of the input and output correspond to batch, so + // batch has to be outermost in both the input and output. + // + // By adding transposes to the BLAS gemm call we could perhaps make + // the kYXDepthBatch layout work as well, but there has been no need + // for that so far. + if (input_dimensions.layout() != dnn::DataLayout::kBatchYXDepth && + input_dimensions.layout() != dnn::DataLayout::kBatchDepthYX) { + LOG(ERROR) << "Unsupported MatMul input layout."; + return false; + } + if (output_dimensions.layout() != dnn::DataLayout::kBatchYXDepth && + output_dimensions.layout() != dnn::DataLayout::kBatchDepthYX) { + LOG(ERROR) << "Unsupported MatMul output layout."; + return false; + } + + if (output_dimensions.width() == 1 && output_dimensions.height() == 1) { + // This is a fast path that also supports the kBatchYXDepth layout. + + // The matrices here are in row-major format while BLAS expects + // column-major, i.e. our matrices are transposed as far as BLAS + // is concerned. So we need to compute output^T = + // input^T*weights^T. There is no parameter for transposing the + // output in BLAS gemm, but instead we can transpose both sides of + // the equality to see that this is equivalent to + // output=weights*input. So we only need to swap the order of + // weights and input in the matrix product to correct for the + // row-major versus column-major difference. + const float alpha = 1.0f; // Take the matrix product without scaling it. + const float beta = 0.0f; // Ignore the original values in output_data. + const int64 m = output_dimensions.NodesAcrossFeatureMaps(); + const int64 n = input_dimensions.count(); + const int64 k = input_dimensions.NodesAcrossFeatureMaps(); + stream->ThenBlasGemm(blas::Transpose::kNoTranspose, + blas::Transpose::kNoTranspose, m, n, k, alpha, weights, + m, input_data, k, beta, output_data, m); + } else { + // This is a slower and more complex path that supports output + // width() * height() > 1, though it only supports the + // kBatchYXDepth layout. It does support kBatchDepthYX if output + // feature_map_count() == 1, as then there is no difference + // between the two layouts. + // + // The operation here is the same as above, except that we have to + // do the matrix multiplication for each (y,x) output coordinate + // separately. We then interpret weights as containing K = width() + // * height() different matrices, each of which we multiply onto the + // matrix from input_data, yielding K matrix products. We then + // combine these together into one matrix by concatenating all the + // first rows of these matrices, then all the second rows and so + // on. We can do this with a batched matrix multiplication, where + // the result is written to a different submatrix of the output + // for each matrix multiplication. + // + // The reason that we only support the kBatchYXDepth output layout + // is that we have to do something in the depth for each (y,x) + // coordinate. The kBatchYXDepth layout has the depth information + // for each point (y,x) in contiguous memory while the + // kBatchDepthYX layout does not. + // + // TODO(broune): Consider a special case for when output depth == + // 1, as then possibly this could all be done as one matrix + // multiplication instead of a batched one, which should be + // faster. Another possibility would be to add a weights layout + // parameter and then support kBatchDepthYX for a different + // weights layout.
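+    // Concretely (illustrative shapes): with input depth 8 (so k == 8), output +    // spatial size 2x2, and output feature_map_count() == 16, weights is viewed +    // as 4 matrices of size 16x8. Each of the 4 GEMMs multiplies one of them by +    // the same 8xN input (N = batch count) and writes its 16xN result at row +    // offset i*16 of the 64-row output matrix, which reproduces the +    // kBatchYXDepth ordering of the output.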
+ if (output_dimensions.layout() != dnn::DataLayout::kBatchYXDepth && + !(output_dimensions.layout() == dnn::DataLayout::kBatchDepthYX && + output_dimensions.feature_map_count() == 1)) { + LOG(ERROR) << "Unsupported MatMul output layout."; + return false; + } + + const float alpha = 1.0f; // Take the matrix product without scaling it. + const float beta = 0.0f; // Ignore the original values in output_data. + const uint64 m = output_dimensions.feature_map_count(); + const uint64 n = input_dimensions.count(); + const uint64 k = input_dimensions.NodesAcrossFeatureMaps(); + const int lda = m; + const int ldb = k; + const int ldc = output_dimensions.NodesAcrossFeatureMaps(); + const int batch_count = output_dimensions.NodesPerFeatureMap(); + + std::vector<DeviceMemory<float>> a(batch_count); + std::vector<DeviceMemory<float>> b(batch_count); + std::vector<DeviceMemory<float>> c(batch_count); + for (int i = 0; i < batch_count; ++i) { + const int weights_offset = i * input_dimensions.NodesAcrossFeatureMaps() * + output_dimensions.feature_map_count(); + a[i] = DeviceMemory<float>::MakeFromByteSize( + const_cast<float*>(reinterpret_cast<const float*>(weights.opaque())) + + weights_offset, + weights.ElementCount() - weights_offset); + + b[i] = input_data; + + const int output_offset = i * output_dimensions.feature_map_count(); + c[i] = DeviceMemory<float>::MakeFromByteSize( + const_cast<float*>( + reinterpret_cast<const float*>(output_data->opaque())) + + output_offset, + output_data->ElementCount() - output_offset); + } + const auto toPtrs = [](std::vector<DeviceMemory<float>>& v) { + std::vector<DeviceMemory<float>*> ptrs; + for (auto& mem : v) { + ptrs.push_back(&mem); + } + return ptrs; + }; + + stream->ThenBlasGemmBatched(blas::Transpose::kNoTranspose, + blas::Transpose::kNoTranspose, m, n, k, alpha, + toPtrs(a), lda, toPtrs(b), ldb, beta, toPtrs(c), + ldc, batch_count); + } + + return stream->ok(); +} + +bool CudnnSupport::DoBiasAdd(Stream* stream, + const DeviceMemory<float>& input_data, + const DeviceMemory<float>& biases, + const dnn::BatchDescriptor& dimensions, + DeviceMemory<float>* output_data) { + ScopedTensorDescriptor input_descriptor{parent_, dimensions, + CUDNN_DATA_FLOAT}; + + BatchDescriptor bias_dimensions; + bias_dimensions.set_count(1) + .set_feature_map_count(dimensions.feature_map_count()) + .set_height(1) + .set_width(1) + .set_layout(dnn::DataLayout::kBatchYXDepth); + ScopedTensorDescriptor bias_descriptor{parent_, bias_dimensions, + CUDNN_DATA_FLOAT}; + + // cudnnAddTensor is in-place, so we need to copy input_data to + // output_data before doing the addition, unless the input and + // output are at the same address. 
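+  // With the CUDNN_ADD_SAME_C mode used below, the bias tensor holds one value +  // per feature map (the 1x1xC descriptor above) and cudnnAddTensor broadcasts +  // it across every batch element and spatial position of the output; e.g. 64 +  // bias floats cover a 32x28x28x64 NHWC activation (illustrative sizes).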
+ if (input_data.opaque() != output_data->opaque()) { + stream->ThenMemcpy(output_data, input_data, + dimensions.ElementCount() * sizeof(float)); + if (!stream->ok()) { + LOG(ERROR) + << "stream " << stream + << " could not enqueue a tensor copy as part of bias addition."; + return false; + } + } + + mutex_lock lock{dnn_handle_mutex_}; + + const float alpha = 1.0f; + const float beta = 1.0f; + auto status = dynload::cudnnAddTensor( + parent_, ToHandle(dnn_handle_), CUDNN_ADD_SAME_C, &alpha, + bias_descriptor.handle(), biases.opaque(), &beta, + input_descriptor.handle(), output_data->opaque()); + + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "stream " << stream << " could not enqueue bias addition."; + return false; + } + + return true; +} + +bool CudnnSupport::DoActivate(Stream* stream, + dnn::ActivationMode activation_mode, + const dnn::BatchDescriptor& dimensions, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + cudnnActivationMode_t mode; + switch (activation_mode) { + case dnn::ActivationMode::kRelu6: + // TODO(leary) should probably do a post-pass to clip at 6? + LOG(WARNING) << "user requested Relu6, but providing Relu instead"; + mode = CUDNN_ACTIVATION_RELU; + break; + case dnn::ActivationMode::kReluX: + // TODO(broune) should probably do a post-pass to clip at X? + LOG(WARNING) << "user requested ReluX, but providing Relu instead"; + mode = CUDNN_ACTIVATION_RELU; + break; + case dnn::ActivationMode::kRelu: + mode = CUDNN_ACTIVATION_RELU; + break; + case dnn::ActivationMode::kSigmoid: + mode = CUDNN_ACTIVATION_SIGMOID; + break; + case dnn::ActivationMode::kTanh: + mode = CUDNN_ACTIVATION_TANH; + break; + default: + LOG(ERROR) << "unrecognized activation mode: " + << static_cast<int>(activation_mode); + return false; + } + + ScopedTensorDescriptor input_4d{parent_, dimensions, CUDNN_DATA_FLOAT}; + // Alpha is the input scaling factor. + float alpha = 1.0; + // Beta is the output scaling factor. + float beta = 0.0; + status = dynload::cudnnActivationForward( + parent_, ToHandle(dnn_handle_), mode, &alpha, input_4d.handle(), + input_data.opaque(), &beta, input_4d.handle(), output_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "stream " << stream + << " could not enqueue activation: " << ToString(status); + return false; + } + + return true; +} + +bool CudnnSupport::DoPoolForward( + Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. 
+ float beta = 0.0; + + ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor dest_desc{parent_, output_dimensions, + CUDNN_DATA_FLOAT}; + ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions}; + status = dynload::cudnnPoolingForward( + parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha, + src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(), + output_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to enqueue forward pooling on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoPoolBackward( + Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory<float>& output_data, + const DeviceMemory<float>& input_diff_data, + DeviceMemory<float>* output_diff_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. + float beta = 0.0; + + ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor dest_desc{parent_, output_dimensions, + CUDNN_DATA_FLOAT}; + ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions}; + status = dynload::cudnnPoolingBackward( + parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha, + dest_desc.handle(), output_data.opaque(), dest_desc.handle(), + input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta, + src_desc.handle(), output_diff_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to enqueue backward pooling on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoNormalize( + Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, + const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) { + LOG(FATAL) << "not yet implemented"; // TODO(leary) +} + +bool CudnnSupport::DoDepthConcatenate( + Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + DeviceMemory<float>* output_data) { + LOG(FATAL) << "not yet implemented"; // TODO(leary) +} + +bool CudnnSupport::DoElementwiseOperate( + Stream* stream, dnn::ElementwiseOperation operation, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + LOG(FATAL) << "not yet implemented"; // TODO(leary) +} + +bool CudnnSupport::DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& gpu_unquantized_src, + port::MutableArraySlice<uint8> host_dst) { + LOG(ERROR) << "quantized memcpy not supported by cuDNN"; + return false; +} + +bool CudnnSupport::DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<uint16> host_dst) { + LOG(ERROR) << "quantized memcpy not supported by cuDNN"; + return false; +} + +bool CudnnSupport::DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& 
device_unquantized_src, + port::MutableArraySlice<int32> host_dst) { + LOG(ERROR) << "quantized memcpy not supported by cuDNN"; + return false; +} + +bool CudnnSupport::DoMemcpyH2DQuantized( + Stream* stream, port::ArraySlice<uint8> host_src, + DeviceMemory<float>* gpu_unquantized_dst) { + LOG(ERROR) << "quantized memcpy not supported by cuDNN"; + return false; +} + +bool CudnnSupport::DeriveOutputBatchDescriptor( + const BatchDescriptor& batch_descriptor, + const FilterDescriptor& filter_descriptor, + const dnn::ConvolutionDescriptor& convolution_descriptor, + dnn::BatchDescriptor* output_batch_descriptor) { + ScopedTensorDescriptor input_4d{parent_, batch_descriptor, CUDNN_DATA_FLOAT}; + ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT}; + ScopedConvolutionDescriptor conv{parent_, convolution_descriptor}; + + int dims[4]; + auto status = dynload::cudnnGetConvolutionNdForwardOutputDim( + parent_, conv.handle(), input_4d.handle(), filter.handle(), 4, dims); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not get output tensor for convolution: " + << ToString(status); + return false; + } + + output_batch_descriptor->set_count(dims[0]) + .set_feature_map_count(dims[1]) + .set_height(dims[2]) + .set_width(dims[3]) + .set_layout(batch_descriptor.layout()); + return true; +} + +} // namespace cuda + +namespace gpu = ::perftools::gputools; + +void initialize_cudnn() { + gpu::port::Status status = + gpu::PluginRegistry::Instance() + ->RegisterFactory<gpu::PluginRegistry::DnnFactory>( + gpu::cuda::kCudaPlatformId, gpu::cuda::kCuDnnPlugin, "cuDNN", + [](gpu::internal::StreamExecutorInterface* + parent) -> gpu::dnn::DnnSupport* { + gpu::cuda::CUDAExecutor* cuda_executor = + dynamic_cast<gpu::cuda::CUDAExecutor*>(parent); + if (cuda_executor == nullptr) { + LOG(ERROR) + << "Attempting to initialize an instance of the cuDNN " + << "support library with a non-CUDA StreamExecutor"; + return nullptr; + } + + gpu::cuda::CudnnSupport* dnn = + new gpu::cuda::CudnnSupport(cuda_executor); + if (!dnn->Init().ok()) { + // Note: Init() will log a more specific error. + delete dnn; + return nullptr; + } + return dnn; + }); + + if (!status.ok()) { + LOG(ERROR) << "Unable to register cuDNN factory: " + << status.error_message(); + } + + // Prime the cuDNN DSO. The loader will log more information. + auto statusor = gpu::internal::CachedDsoLoader::GetCudnnDsoHandle(); + if (!statusor.ok()) { + LOG(INFO) << "Unable to load cuDNN DSO."; + } + + gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId, + gpu::PluginKind::kDnn, + gpu::cuda::kCuDnnPlugin); +} + +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER(register_cudnn, + { perftools::gputools::initialize_cudnn(); }); diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h new file mode 100644 index 0000000000..08e952cee0 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_dnn.h @@ -0,0 +1,206 @@ +// The CUDA-specific DNN library support, implementing the general DnnSupport +// interface.
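+// +// Typical call path (sketch, assuming the Stream::ThenConvolve entry point +// declared in stream.h): user code invokes +//   stream->ThenConvolve(input_desc, input, filter_desc, weights, conv_desc, +//                        output_desc, &output); +// which is routed to CudnnSupport::DoConvolve once the cuDNN plugin is the +// registered DNN implementation for the CUDA platform.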
+ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_ + +#include "tensorflow/stream_executor/dnn.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/temporary_device_memory.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +class CUDAExecutor; + +// Opaque and unique identifer for the cuDNN plugin. +extern const PluginId kCuDnnPlugin; + +// cudnn-library based DNN support. For details on overridden interface +// functions, see dnn.h. +class CudnnSupport : public dnn::DnnSupport { + public: + explicit CudnnSupport(CUDAExecutor* parent); + ~CudnnSupport() override; + + port::Status Init() override; + + bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory<float>& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<float>* output_data) override; + + bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor, + const DeviceMemory<double>& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory<double>& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<double>* output_data) override; + + bool DoSeparableConvolve( + Stream* stream, const dnn::BatchDescriptor& batch_descriptor, + const DeviceMemory<float>& input_data, + const dnn::FilterDescriptor& filter_descriptor, int depth_multiplier, + const DeviceMemory<float>& first_weights, + const DeviceMemory<float>& second_weights, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<float>* output_data) override { + LOG(ERROR) << "separable convolution not supported by CUDNN"; + return false; + } + + bool DoConvolveBackwardData( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<float> backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory<float>* backward_input_data) override; + + bool DoConvolveBackwardFilter( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<float> backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory<float>* backward_filter_data) override; + + bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<float>& weights, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override; + + bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<int8>& quantized_weights, + const DeviceMemory<float>& weight_scales, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override { + 
LOG(ERROR) << "DNN MatMulQuantized not supported by CUDNN"; + return false; + } + + bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<int16>& quantized_weights, + const DeviceMemory<float>& weight_scales, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override { + LOG(ERROR) << "DNN MatMulQuantized not supported by CUDNN"; + return false; + } + + bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<float>& biases, + const dnn::BatchDescriptor& dimensions, + DeviceMemory<float>* output_data) override; + + bool DoActivate(Stream* stream, dnn::ActivationMode activation_mode, + const dnn::BatchDescriptor& dimensions, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) override; + + bool DoPoolForward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override; + + bool DoPoolBackward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory<float>& output_data, + const DeviceMemory<float>& input_diff_data, + DeviceMemory<float>* output_diff_data) override; + + bool DoNormalize(Stream* stream, + const dnn::NormalizeDescriptor& normalize_descriptor, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) override; + + bool DoDepthConcatenate( + Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + DeviceMemory<float>* output_data) override; + + bool DoElementwiseOperate( + Stream* stream, dnn::ElementwiseOperation operation, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override; + + bool DoMemcpyD2HQuantized(Stream* stream, + const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<uint8> host_dst) override; + + bool DoMemcpyD2HQuantized(Stream* stream, + const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<uint16> host_dst) override; + + bool DoMemcpyD2HQuantized(Stream* stream, + const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<int32> host_dst) override; + + bool DoMemcpyH2DQuantized( + Stream* stream, port::ArraySlice<uint8> host_src, + DeviceMemory<float>* device_unquantized_dst) override; + + // Derives an output batch descriptor from an input batch and convolution + // descriptors. + bool DeriveOutputBatchDescriptor( + const dnn::BatchDescriptor& batch_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + const dnn::ConvolutionDescriptor& convolution_descriptor, + dnn::BatchDescriptor* output_batch_descriptor); + + private: + // Guards the enqueueing of DNN operations via the dnn_handle_ below. + mutex dnn_handle_mutex_; + + CUDAExecutor* parent_; // Parent executor object. Not owned. + + // cudnn library handle. cudnnHandle_t type is not present in this header to + // prevent third-party library header inclusions from leaking outside the + // single cuda_dnn translation unit. 
+ void* dnn_handle_ GUARDED_BY(dnn_handle_mutex_); + + // NOTE(keveman): Temporary data layout transformation until cuDNN supports + // kBatchYXDepth for backward pass. This function allocates temporary memory, + // lays out the source data into the temporary but in the kBatchDepthYX + // layout, and returns the temporary memory. The caller is responsible for + // deallocating the temporary. Since the allocation is done using Stream's + // AllocateTemporaryMemory, a later BlockHostUntilDone could be used for + // deallocation. + // + // transform_scratch is populated with a legitimate temporary allocation iff + // the original output data needs to be transformed. + DeviceMemory<float> MaybeTransformLayout( + Stream* stream, dnn::BatchDescriptor* output_descriptor, + DeviceMemory<float> backward_output_data, + std::unique_ptr<TemporaryDeviceMemory<float>>* transform_scratch) + EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_); + + SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc new file mode 100644 index 0000000000..8c4316b4c1 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -0,0 +1,1608 @@ +#include "tensorflow/stream_executor/cuda/cuda_driver.h" + +#include <dlfcn.h> +#include <stdint.h> +#include <stdlib.h> +#include <set> +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/lib/casts.h" +#include "tensorflow/stream_executor/lib/env.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/human_readable.h" +#include "tensorflow/stream_executor/lib/notification.h" +#include "tensorflow/stream_executor/lib/threadpool.h" +#include "tensorflow/stream_executor/lib/stacktrace.h" +#include "tensorflow/stream_executor/lib/static_threadlocal.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/lib/inlined_vector.h" + +bool FLAGS_gpuexec_cuda_driver_inject_init_error = false; +bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false; +bool FLAGS_gpuexec_cuda_device_0_only = false; + +namespace perftools { +namespace gputools { +namespace cuda { + +namespace dynload { + +#define PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(__name) \ + struct DynLoadShim__##__name { \ + static const char *kName; \ + using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \ + static void *GetDsoHandle() { \ + static auto status = internal::CachedDsoLoader::GetLibcudaDsoHandle(); \ + return status.ValueOrDie(); \ + } \ + static FuncPointerT DynLoad() { \ + static void *f = dlsym(GetDsoHandle(), kName); \ + CHECK(f != nullptr) << "could not find " << kName \ + << " in libcuda DSO; dlerror: " << dlerror(); \ + return reinterpret_cast<FuncPointerT>(f); \ + } \ + template <typename... Args> \ + CUresult operator()(Args...
args) { \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxCreate_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxDestroy); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxEnablePeerAccess); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetCurrent); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetDevice); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetSharedMemConfig); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxPopCurrent_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetCurrent); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetSharedMemConfig); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSynchronize); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceComputeCapability); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceCanAccessPeer); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGet); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetAttribute); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetCount); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetName); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetPCIBusId); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetProperties); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceTotalMem); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDriverGetVersion); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventCreate); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventDestroy_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventElapsedTime); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventQuery); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventRecord); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncGetAttribute); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncSetCacheConfig); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorName); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorString); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuInit); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuLaunchKernel); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemAlloc_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoD_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoH_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoD_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoDAsync_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoHAsync_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoDAsync_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetAddressRange_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFree_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFreeHost); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetInfo_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostAlloc); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostRegister_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostUnregister); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32Async); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetFunction); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetGlobal_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadDataEx); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadFatBinary); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleUnload); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuOccupancyMaxActiveBlocksPerMultiprocessor); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuPointerGetAttribute); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamAddCallback); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamCreate); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamDestroy_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamQuery); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamSynchronize); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamWaitEvent); + +} // namespace dynload + +namespace { + +// Manages the singleton set of contexts that we've created. 
This is used for +// checking that no CUDA-runtime-created contexts have been generated +// accidentally. CUDA-runtime-created contexts are avoided, if triple angle +// brace launches are required, by using the scoped activations in +// cuda_activation.h. +class CreatedContexts { + public: + // Returns whether context is a member of the live set. + static bool Has(CUcontext context) { + shared_lock lock{mu_}; + return Live()->find(context) != Live()->end(); + } + + // Adds context to the live set. + static void Add(CUcontext context) { + CHECK(context != nullptr); + mutex_lock lock{mu_}; + Live()->emplace(context); + } + + // Removes context from the live set. + static void Remove(CUcontext context) { + CHECK(context != nullptr); + mutex_lock lock{mu_}; + Live()->erase(context); + } + + private: + // Returns the live set singleton. + static std::set<CUcontext> *Live() { + static auto singleton = new std::set<CUcontext>; + return singleton; + } + + // Lock that guards access-to/mutation-of the live set. + static mutex mu_; +}; + +/* static */ mutex CreatedContexts::mu_{LINKER_INITIALIZED}; + +// Formats CUresult to output prettified values into a log stream. +// Error summaries taken from: +// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9 +// +// TODO(leary) switch to cuGetErrorName when updated cuda.h is available. +string ToString(CUresult result) { +#define OSTREAM_CUDA_ERROR(__name) \ + case CUDA_ERROR_##__name: \ + return "CUDA_ERROR_" #__name; + +/////////////// +// NOTE: here we specify return code values outside of the enum explicitly +// because our in-tree cuda.h is from the CUDA 5.5 SDK, but CUDA 6.0+ driver +// libraries are deployed in the fleet these error codes are backwards +// compatible, but if we see a "new" one, we want to be able to identify it in +// the logs. +// +// Once we get a cuda.h that has cuGetErrorName (TODO is above) we can +// eliminate this function and just rely on the driver to provide us these +// strings. +// +// NOTE: "Must reboot all context" below is shorthand for, "must +// destroy/recreate the offending context and any allocation which come from +// it if you are to continue using CUDA." +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + switch (result) { + OSTREAM_CUDA_ERROR(INVALID_VALUE) + OSTREAM_CUDA_ERROR(OUT_OF_MEMORY) + OSTREAM_CUDA_ERROR(NOT_INITIALIZED) + OSTREAM_CUDA_ERROR(DEINITIALIZED) + OSTREAM_CUDA_ERROR(NO_DEVICE) + OSTREAM_CUDA_ERROR(INVALID_DEVICE) + OSTREAM_CUDA_ERROR(INVALID_IMAGE) + OSTREAM_CUDA_ERROR(INVALID_CONTEXT) + OSTREAM_CUDA_ERROR(INVALID_HANDLE) + OSTREAM_CUDA_ERROR(NOT_FOUND) + OSTREAM_CUDA_ERROR(NOT_READY) + OSTREAM_CUDA_ERROR(NO_BINARY_FOR_GPU) + + // Encountered an uncorrectable ECC error during execution. + OSTREAM_CUDA_ERROR(ECC_UNCORRECTABLE) + + // Load/store on an invalid address. Must reboot all context. + case 700: + return "CUDA_ERROR_ILLEGAL_ADDRESS"; + // Passed too many / wrong arguments, too many threads for register count. + case 701: + return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; + // Kernel took too long to execute. + case 702: + return "CUDA_ERROR_LAUNCH_TIMEOUT"; + // Kernel launch uses an incompatible texturing mode. + case 703: + return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; + // Trying to re-enable peer access that already has it enabled. + case 704: + return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; + // Trying to disable peer access that has not yet been enabled. 
+ case 705: + return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; + // Primary context for the specified device has already been initialized. + case 708: + return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; + // Context current to calling thread has been destroyed or is a primary + // context that has not yet been initialized. + case 709: + return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; + // Device-side assert triggered during kernel execution. Must reboot all + // context. + case 710: + return "CUDA_ERROR_ASSERT"; + // Hardware resources to enable peer access have been exhausted. + case 711: + return "CUDA_ERROR_TOO_MANY_PEERS"; + // Memory range has already been registered. + case 712: + return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; + // Pointer does not correspond to any currently registered memory region. + case 713: + return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; + // Due to stack corruption or exceeding stack size limit. Must reboot all + // context. + case 714: + return "CUDA_ERROR_HARDWARE_STACK_ERROR"; + case 715: + return "CUDA_ERROR_ILLEGAL_INSTRUCTION"; + // Load/store on an unaligned memory address. Must reboot all context. + case 716: + return "CUDA_ERROR_MISALIGNED_ADDRESS"; + // Device instruction with specific address space given address not + // belonging to allowed address space. Must reboot all context. + case 717: + return "CUDA_ERROR_INVALID_ADDRESS_SPACE"; + // Device program counter wrapped its address space. Must reboot all + // context. + case 718: + return "CUDA_ERROR_INVALID_PC"; + // Exception on device while executing a kernel; e.g. deref invalid device + // pointer, accessing OOB shared memory. Must reboot all context. + case 719: + return "CUDA_ERROR_LAUNCH_FAILED"; + + OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE) + OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED) + OSTREAM_CUDA_ERROR(NOT_PERMITTED) + OSTREAM_CUDA_ERROR(NOT_SUPPORTED) + OSTREAM_CUDA_ERROR(UNKNOWN) // Unknown internal error to CUDA. + default: + return port::StrCat("CUresult(", static_cast<int>(result), ")"); + } +#pragma GCC diagnostic pop +} + +// Returns the current context and checks that it is in the set of CUDA contexts +// created by StreamExecutor (to ensure that the CUDA runtime didn't create a +// context behind our backs). +CUcontext CurrentContext() { + CUcontext current = nullptr; + CUresult result = dynload::cuCtxGetCurrent(¤t); + if (result != CUDA_SUCCESS) { + LOG(FATAL) << "failed to query current context: " << ToString(result); + } + if (current != nullptr && !CreatedContexts::Has(current)) { + LOG(FATAL) << "current context was not created by the StreamExecutor " + "cuda_driver API: " + << current + << "; a CUDA runtime call " + "was likely performed without using a StreamExecutor context"; + } + return current; +} + +// "Pops" the current context, checks that it matches expected, and checks the +// postcondition that the current context is nullptr. +// +// This is not done when we're nested within a MultiOpActivation, as we want to +// persist the active context until the MultiOpActivation is popped. 
+void PopContextAndCheckNowNull(CUcontext expected) { + CUcontext actual = CurrentContext(); + CHECK_EQ(expected, actual) << "would pop unexpected context"; + CUcontext popped; + CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxPopCurrent_v2(&popped)); + CHECK_EQ(expected, popped); + CHECK(nullptr == CurrentContext()); + VLOG(3) << "popped context " << expected + << " and current context is now null"; +} + +// CUDA driver routines may require a large amount of stack (particularly +// cuModuleLoadDataEx, in our experience). To avoid stack overflow when using +// stack-limited threads (such as those spawned by a default-argument +// thread::ThreadPool on some platforms), we run certain routines in this pool +// and wait for completion. +static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED); +static port::ThreadPool *InitializeDriverExecutor() { + return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(), + "cuda_driver", 1); +} + +port::ThreadPool *GetDriverExecutor() { + mutex_lock lock(driver_executor_threadpool_mu); + static port::ThreadPool *thread_pool = InitializeDriverExecutor(); + return thread_pool; +} + +} // namespace + + +// Thread-local storage that indicates whether a CUDA context activation is +// being nested within an outer, MultiOpActivation. In that case, we should not +// pop the context to nullptr when we are done with the current activation. +SE_STATIC_THREAD_LOCAL_POD(bool, tls_in_multi_op_activation); + +string MemorySpaceString(MemorySpace memory_space) { + switch (memory_space) { + case MemorySpace::kHost: + return "host"; + case MemorySpace::kDevice: + return "device"; + default: + LOG(FATAL) << "impossible memory space"; + } +} + +// Implementation note: the CUDA context is held, per-thread, in TLS. We avoid +// setting all the time because it's not clear what side effects might occur for +// a "set" operation, whereas a "get" operation we can reasonably assume is a +// TLS read. +// +// We cannot race here because CUcontext is associated with a particular thread +// and stored in TLS; and these interfaces should not be used from signal +// handlers. 
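+// +// Usage sketch (illustrative): driver work is bracketed as +//   { +//     ScopedActivateContext activation{context}; +//     // ... driver calls against `context` go here ... +//   } +// and the previous activation state is restored when the scope exits (see the +// destructor below for the MultiOpActivation case).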
+ScopedActivateContext::ScopedActivateContext(CUcontext context, + MultiOpActivation moa) + : context_(CHECK_NOTNULL(context)), + previously_in_multi_op_activation_(tls_in_multi_op_activation.get()) { + if (static_cast<bool>(moa)) { + tls_in_multi_op_activation.get() = true; + } + + CUcontext current = prior_context_ = CurrentContext(); + if (current != context) { + VLOG(3) << "ScopedActivateContext switching context from " << current + << " to " << context; + CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(context)); + if (FLAGS_gpuexec_cuda_sync_around_driver_calls) { + auto res = dynload::cuCtxSynchronize(); + if (res != CUDA_SUCCESS) { + LOG(FATAL) << "gpuexec_cuda_sync_around_driver_calls found " + << ToString(res) + << " immediately after establishing the device context " + << context << " :: " << port::CurrentStackTrace(); + } + } + } +} + +ScopedActivateContext::~ScopedActivateContext() { + if (tls_in_multi_op_activation.get()) { + CHECK_EQ(context_, CurrentContext()); + if (FLAGS_gpuexec_cuda_sync_around_driver_calls) { + auto res = dynload::cuCtxSynchronize(); + if (res != CUDA_SUCCESS) { + LOG(FATAL) << "gpuexec_cuda_sync_around_driver_calls found " + << ToString(res) + << " immediately after de-establishing the device context " + << context_ << " :: " << port::CurrentStackTrace(); + } + } + CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(prior_context_)); + } else { + PopContextAndCheckNowNull(context_); + } + tls_in_multi_op_activation.get() = previously_in_multi_op_activation_; +} + +namespace { + +// Returns a stringified device number associated with pointer, primarily for +// logging purposes. Returns "?" if the device could not be successfully +// queried. +string CUDAPointerToDeviceString(CUdeviceptr pointer) { + auto value = CUDADriver::GetPointerDevice(pointer); + if (value.ok()) { + return port::StrCat(value.ValueOrDie()); + } + LOG(ERROR) << "could not query device: " << value.status(); + return "?"; +} + +// Returns a stringified memory space associated with pointer, primarily for +// logging purposes. Returns "?" if the memory space could not be successfully +// queried. +string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) { + auto value = CUDADriver::GetPointerMemorySpace(pointer); + if (value.ok()) { + return MemorySpaceString(value.ValueOrDie()); + } + LOG(ERROR) << "could not query device: " << value.status(); + return "?"; +} + +// Returns a stringified representation of whether or not peer access is +// permitted between the "from" and "to" pointers' associated contexts, +// primarily for logging purposes. Returns "error" if an error is encountered +// in the process of querying. +string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) { + auto from_context = CUDADriver::GetPointerContext(from); + if (!from_context.ok()) { + LOG(ERROR) << "could not retrieve source pointer's context: " + << from_context.status(); + return "error"; + } + auto to_context = CUDADriver::GetPointerContext(to); + if (!to_context.ok()) { + LOG(ERROR) << "could not retrieve destination pointer's context: " + << to_context.status(); + return "error"; + } + return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(), + to_context.ValueOrDie()) + ? "true" + : "false"; +} + + +// Actually performs the work of CUDA initialization. Wrapped up in one-time +// execution guard. 
+static port::Status InternalInit() { + CUresult res = CUDA_ERROR_NO_DEVICE; + if (FLAGS_gpuexec_cuda_driver_inject_init_error) { + LOG(ERROR) << "injecting CUDA init error; initialization will fail"; + } else if (internal::CachedDsoLoader::GetLibcudaDsoHandle().ok()) { + // We only call cuInit if we can dynload libcuda. + + res = dynload::cuInit(0 /* = flags */); + } + + if (res == CUDA_SUCCESS) { + return port::Status::OK(); + } + + LOG(ERROR) << "failed call to cuInit: " << ToString(res); + Diagnostician::LogDiagnosticInformation(); + return port::Status{port::error::ABORTED, + port::StrCat("failed call to cuInit: ", ToString(res))}; +} + +} // namespace + +/* static */ port::Status CUDADriver::Init() { + // Cached return value from calling InternalInit(), as cuInit need only be + // called once, but CUDADriver::Init may be called many times. + static port::Status init_retval; + static bool set = false; + static mutex init_mu(LINKER_INITIALIZED); + + mutex_lock lock(init_mu); + if (!set) { + init_retval = InternalInit(); + set = true; + } + + return init_retval; +} + +/* static */ port::Status CUDADriver::GetDevice(int device_ordinal, + CUdevice *device) { + CUresult res = dynload::cuDeviceGet(device, device_ordinal); + if (res == CUDA_SUCCESS) { + return port::Status::OK(); + } + + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed call to cuDeviceGet: ", ToString(res))}; +} + +/* static */ bool CUDADriver::GetDeviceName(CUdevice device, + string *device_name) { + static const size_t kCharLimit = 64; + port::InlinedVector<char, 4> chars(kCharLimit); + CUresult res = + dynload::cuDeviceGetName(chars.begin(), kCharLimit - 1, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to get device name for " << device << ": " + << ToString(res); + return false; + } + chars[kCharLimit - 1] = '\0'; + *device_name = chars.begin(); + return true; +} + +bool DeviceOptionsToContextFlags(DeviceOptions device_options, int *flags) { + static_assert(DeviceOptions::kMask == 0xf, + "needs update for new device options"); + + if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) { + *flags |= CU_CTX_LMEM_RESIZE_TO_MAX; + } + + // If no flags are set the default is CU_CTX_SCHED_AUTO, which + // in Google environments is very likely to mean SPIN. 
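+  // For example, kScheduleBlockingSync below maps to CU_CTX_SCHED_BLOCKING_SYNC, +  // which (per the CUDA driver API documentation) lets a host thread sleep +  // inside cuCtxSynchronize()/cuStreamSynchronize() rather than spin-wait.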
+ if (device_options.flags() & DeviceOptions::kScheduleSpin) { + *flags |= CU_CTX_SCHED_SPIN; + } + if (device_options.flags() & DeviceOptions::kScheduleYield) { + *flags |= CU_CTX_SCHED_YIELD; + } + if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) { + *flags |= CU_CTX_SCHED_BLOCKING_SYNC; + } + + return true; +} + +/* static */ port::Status CUDADriver::CreateContext( + CUdevice device, DeviceOptions device_options, CUcontext *context) { + CUcontext former_context = CurrentContext(); + if (former_context != nullptr) { + LOG(WARNING) << "creating context when one is currently active; existing: " + << former_context; + } + + int flags = 0; + if (!DeviceOptionsToContextFlags(device_options, &flags)) { + LOG(WARNING) << "could not convert all device options into context flags"; + } + + CUresult res; + { + // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their + // context creation: see http://b/13248943 + + res = dynload::cuCtxCreate_v2(context, flags, device); + } + if (res == CUDA_SUCCESS) { + CreatedContexts::Add(*context); + PopContextAndCheckNowNull(*context); + CHECK(*context != nullptr) + << "success in this call must entail non-null result"; + VLOG(2) << "created context " << context << " for this thread"; + return port::Status::OK(); + } + + string message = "failed call to cuCtxCreate: " + ToString(res); + if (res == CUDA_ERROR_OUT_OF_MEMORY) { + uint64 total_memory; + if (GetDeviceTotalMemory(device, &total_memory)) { + port::StrAppend(&message, "; total memory reported: ", total_memory); + } else { + port::StrAppend(&message, "; could not query total memory"); + } + } + + return port::Status{port::error::INTERNAL, message}; +} + +/* static */ void CUDADriver::DestroyContext(CUcontext context) { + if (context == nullptr) { + return; + } + + CUresult res = dynload::cuCtxDestroy_v2(context); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to destroy CUDA context; leaking: " << ToString(res); + } + + CreatedContexts::Remove(context); +} + +/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute, + CUfunction func, + int *attribute_value) { + CUresult res = dynload::cuFuncGetAttribute(attribute_value, attribute, func); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query kernel attribute. kernel: " << func + << ", attribute: " << attribute; + return false; + } + return true; +} + +/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function, + CUfunc_cache cache_config) { + CUresult res = dynload::cuFuncSetCacheConfig(function, cache_config); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function + << ", config: " << cache_config << ", result: " << ToString(res); + return false; + } + + return true; +} + +/* static */ port::StatusOr<CUsharedconfig> +CUDADriver::ContextGetSharedMemConfig(CUcontext context) { + CUsharedconfig shared_mem_config; + ScopedActivateContext activation{context}; + CUresult result = dynload::cuCtxGetSharedMemConfig(&shared_mem_config); + if (result != CUDA_SUCCESS) { + CUdevice device; + dynload::cuCtxGetDevice(&device); + LOG(ERROR) << "failed to get CUDA device shared memory config. 
" + << "Context device ID: " << device + << ", result: " << ToString(result); + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to get shared memory config: ", ToString(result))}; + } + return shared_mem_config; +} + +/* static */ port::Status CUDADriver::ContextSetSharedMemConfig( + CUcontext context, CUsharedconfig shared_mem_config) { + ScopedActivateContext activation{context}; + CUresult result = dynload::cuCtxSetSharedMemConfig(shared_mem_config); + if (result != CUDA_SUCCESS) { + CUdevice device; + dynload::cuCtxGetDevice(&device); + LOG(ERROR) << "failed to set CUDA device shared memory config. " + << "Context device ID: " << device + << ", config: " << shared_mem_config + << ", result: " << ToString(result); + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to set shared memory config: ", ToString(result))}; + } + return port::Status::OK(); +} + +/* static */ bool CUDADriver::LaunchKernel( + CUcontext context, CUfunction function, unsigned int grid_dim_x, + unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x, + unsigned int block_dim_y, unsigned int block_dim_z, + unsigned int shared_mem_bytes, CUstream stream, void **kernel_params, + void **extra) { + ScopedActivateContext activation{context}; + VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x + << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z + << " bdx: " << block_dim_x << " bdy: " << block_dim_y + << " bdz: " << block_dim_z; + CUresult res = dynload::cuLaunchKernel( + function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y, + block_dim_z, shared_mem_bytes, stream, kernel_params, extra); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to launch CUDA kernel: " << function + << "; result: " << ToString(res); + return false; + } + VLOG(2) << "successfully launched kernel"; + return true; +} + +/* static */ port::Status CUDADriver::LoadCubin(CUcontext context, + const char *cubin_bytes, + CUmodule *module) { + ScopedActivateContext activation{context}; + CUresult result = dynload::cuModuleLoadFatBinary(module, cubin_bytes); + if (result != CUDA_SUCCESS) { + return port::Status{port::error::INTERNAL, + "failed to load in-memory CUBIN: " + ToString(result)}; + } + + return port::Status::OK(); +} + +/* static */ bool CUDADriver::LoadPtx(CUcontext context, + const char *ptx_contents, + CUmodule *module) { + port::Notification notification; + bool ret = true; + GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret, + ¬ification]() { + ScopedActivateContext activation{context}; + void *ptx_data = const_cast<char *>(ptx_contents); + static const unsigned int kLogBufferBytesLimit = 1024; + unsigned int error_log_buffer_bytes = kLogBufferBytesLimit; + unsigned int info_log_buffer_bytes = kLogBufferBytesLimit; + port::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes); + port::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes); + bool log_verbose = true; + CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE}; + // Note that the driver API wants the contents of this values to be stored + // in an array of void*s, so we coerce them accordingly. 
+    void *option_values[] = {
+        port::bit_cast<void *>(uintptr_t(error_log_buffer_bytes)),
+        port::bit_cast<void *>(error_log_buffer.data()),
+        port::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
+        port::bit_cast<void *>(info_log_buffer.data()),
+        port::bit_cast<void *>(uintptr_t(log_verbose))};
+    CHECK(ARRAYSIZE(options) == ARRAYSIZE(option_values));
+
+    CUresult res;
+    {
+      // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
+      // module loading: see http://b/13248943
+
+      res = dynload::cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options),
+                                        options, option_values);
+    }
+
+    // The PTX JIT mutates the values in the option values array to reflect the
+    // size of the logs it outputs; now that we've made the call, read the
+    // values back out.
+    error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
+    info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
+    CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
+    CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);
+
+    if (res != CUDA_SUCCESS) {
+      LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
+      // As a precaution for null termination of the API-provided value, ensure
+      // that at least the last byte is null.
+      error_log_buffer[error_log_buffer_bytes ?
+                       error_log_buffer_bytes - 1 : 0] = '\0';
+      LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
+                 << " bytes): " << error_log_buffer.data();
+      ret = false;
+      notification.Notify();
+      // Return early so the notification is not signaled a second time below.
+      return;
+    }
+
+    VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
+            << " bytes): " << info_log_buffer.data();
+    VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
+            << " bytes): " << error_log_buffer.data();
+    CHECK(module != nullptr);
+    notification.Notify();
+  });
+  notification.WaitForNotification();
+
+  return ret;
+}
+
+/* static */ bool CUDADriver::SynchronousMemsetUint8(CUcontext context,
+                                                     CUdeviceptr location,
+                                                     uint8 value, size_t size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemsetD8_v2(location, value, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemsetUint32(CUcontext context,
+                                                      CUdeviceptr location,
+                                                      uint32 value,
+                                                      size_t uint32_count) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemsetD32_v2(location, value, uint32_count);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool CUDADriver::AsynchronousMemsetUint32(CUcontext context,
+                                                       CUdeviceptr location,
+                                                       uint32 value,
+                                                       size_t uint32_count,
+                                                       CUstream stream) {
+  ScopedActivateContext activation{context};
+  CUresult res =
+      dynload::cuMemsetD32Async(location, value, uint32_count, stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memset operation";
+  return true;
+}
+
+/* static */ bool CUDADriver::AddStreamCallback(CUcontext context,
+                                                CUstream stream,
+                                                StreamCallback callback,
+                                                void *data) {
+  // Note: flags param is required to be zero according to CUDA 6.0.
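+  // Illustrative sketch (editorial addition, not part of the original
+  // change): the callback must match the StreamCallback signature and must
+  // not call back into the CUDA API, e.g.:
+  //
+  //   void HostCallback(CUstream stream, CUresult status, void *data) {
+  //     // Signal a hypothetical waiter; no CUDA calls are allowed in here.
+  //     static_cast<port::Notification *>(data)->Notify();
+  //   }
+  //
+  //   CUDADriver::AddStreamCallback(context, stream, HostCallback, &done);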
+ CUresult res = + dynload::cuStreamAddCallback(stream, callback, data, 0 /* = flags */); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "unable to add host callback: " << ToString(res); + return false; + } + return true; +} + +/* static */ bool CUDADriver::GetModuleFunction(CUcontext context, + CUmodule module, + const char *kernel_name, + CUfunction *function) { + ScopedActivateContext activated{context}; + CHECK(module != nullptr && kernel_name != nullptr); + CUresult res = dynload::cuModuleGetFunction(function, module, kernel_name); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name + << "\" from module: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::GetModuleSymbol(CUcontext context, + CUmodule module, + const char *symbol_name, + CUdeviceptr *dptr, + size_t *bytes) { + ScopedActivateContext activated{context}; + CHECK(module != nullptr && symbol_name != nullptr && + (dptr != nullptr || bytes != nullptr)); + CUresult res = + dynload::cuModuleGetGlobal_v2(dptr, bytes, module, symbol_name); + if (res != CUDA_SUCCESS) { + // symbol may not be found in the current module, but it may reside in + // another module. + VLOG(2) << "failed to get symbol \"" << symbol_name + << "\" from module: " << ToString(res); + return false; + } + + return true; +} + +/* static */ void CUDADriver::UnloadModule(CUcontext context, CUmodule module) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuModuleUnload(module); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to unload module " << module + << "; leaking: " << ToString(res); + } +} + +/* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext( + CUcontext context) { + ScopedActivateContext activated{context}; + CUdevice device = -1; + CUresult result = dynload::cuCtxGetDevice(&device); + if (result == CUDA_SUCCESS) { + return device; + } + + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to get device for context: ", ToString(result))}; +} + +/* static */ bool CUDADriver::CreateStream(CUcontext context, CUstream *out) { + // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess + // up synchronization with respect to memsets and any other things that have + // to occur on the default stream? 
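+  // Editorial aside (sketch only, not part of the original change): making
+  // the stream non-blocking would amount to passing CU_STREAM_NON_BLOCKING
+  // instead of 0 below, e.g.
+  //   dynload::cuStreamCreate(out, CU_STREAM_NON_BLOCKING);
+  // but see the TODO above about synchronization with the default stream.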
+ ScopedActivateContext activated{context}; + CUresult res = dynload::cuStreamCreate(out, 0); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not allocate CUDA stream for context " << context + << ": " << ToString(res); + return false; + } + + VLOG(2) << "successfully created stream " << *out << " for context " + << context << " on thread"; + return true; +} + +/* static */ void CUDADriver::DestroyStream(CUcontext context, + CUstream *stream) { + if (*stream == nullptr) { + return; + } + + ScopedActivateContext activated{context}; + CUresult res = dynload::cuStreamDestroy_v2(*stream); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to destroy CUDA stream for context " << context + << ": " << ToString(res); + } else { + VLOG(2) << "successfully destroyed stream " << *stream << " for context " + << context; + *stream = nullptr; + } +} + +/* static */ void *CUDADriver::DeviceAllocate(CUcontext context, uint64 bytes) { + ScopedActivateContext activated{context}; + CUdeviceptr result = 0; + CUresult res = dynload::cuMemAlloc_v2(&result, bytes); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to allocate " + << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes + << " bytes) from device: " << ToString(res); + return nullptr; + } + void *ptr = reinterpret_cast<void *>(result); + VLOG(2) << "allocated " << ptr << " for context " << context << " of " + << bytes << " bytes"; + return ptr; +} + +/* static */ void CUDADriver::DeviceDeallocate(CUcontext context, + void *location) { + ScopedActivateContext activation{context}; + CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location); + CUresult res = dynload::cuMemFree_v2(pointer); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to free device memory at " << location + << "; result: " << ToString(res); + } else { + VLOG(2) << "deallocated " << location << " for context " << context; + } +} + +/* static */ void *CUDADriver::HostAllocate(CUcontext context, uint64 bytes) { + ScopedActivateContext activation{context}; + void *host_mem = nullptr; + // "Portable" memory is visible to all CUDA contexts. Safe for our use model. + CUresult res = + dynload::cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to alloc " << bytes + << " bytes on host: " << ToString(res); + } + return host_mem; +} + +/* static */ void CUDADriver::HostDeallocate(CUcontext context, + void *location) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuMemFreeHost(location); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "error deallocating host memory at " << location << ": " + << ToString(res); + } +} + +/* static */ bool CUDADriver::HostRegister(CUcontext context, void *location, + uint64 bytes) { + ScopedActivateContext activation{context}; + // "Portable" memory is visible to all CUDA contexts. Safe for our use model. 
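+  // Illustrative pairing (editorial sketch, not part of the original change):
+  // a registration is normally matched with a later HostUnregister on the
+  // same pointer once any asynchronous copies using it have completed, e.g.
+  //
+  //   if (CUDADriver::HostRegister(context, buf, len)) {
+  //     // ... enqueue AsynchronousMemcpyH2D/D2H using buf ...
+  //     CUDADriver::HostUnregister(context, buf);
+  //   }
+  //
+  // (buf and len are hypothetical caller-owned values.)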
+ CUresult res = + dynload::cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "error registering host memory at " << location << ": " + << ToString(res); + return false; + } + return true; +} + +/* static */ bool CUDADriver::HostUnregister(CUcontext context, + void *location) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuMemHostUnregister(location); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "error unregistering host memory at " << location << ": " + << ToString(res); + return false; + } + return true; +} + +/* static */ port::Status CUDADriver::DestroyEvent(CUcontext context, + CUevent *event) { + if (*event == nullptr) { + return port::Status{port::error::INVALID_ARGUMENT, + "input event cannot be null"}; + } + + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventDestroy_v2(*event); + *event = nullptr; + + switch (res) { + case CUDA_SUCCESS: + return port::Status::OK(); + case CUDA_ERROR_DEINITIALIZED: + case CUDA_ERROR_NOT_INITIALIZED: + return port::Status{ + port::error::FAILED_PRECONDITION, + port::Printf("error destroying CUDA event in context %p: %s", context, + ToString(res).c_str())}; + default: + return port::Status{ + port::error::INTERNAL, + port::Printf("error destroying CUDA event in context %p: %s", context, + ToString(res).c_str())}; + } +} + +/* static */ port::Status CUDADriver::RecordEvent(CUcontext context, + CUevent event, + CUstream stream) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventRecord(event, stream); + switch (res) { + case CUDA_SUCCESS: + return port::Status::OK(); + case CUDA_ERROR_DEINITIALIZED: + case CUDA_ERROR_NOT_INITIALIZED: + return port::Status{ + port::error::FAILED_PRECONDITION, + port::Printf("error recording CUDA event on stream %p: %s", stream, + ToString(res).c_str())}; + default: + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("error recording CUDA event on stream %p: %s", stream, + ToString(res).c_str())}; + } +} + +/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(CUcontext context, + CUevent event) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventQuery(event); + if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) { + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to query event: %s", ToString(res).c_str())}; + } + + return res; +} + +/* static */ bool CUDADriver::GetEventElapsedTime(CUcontext context, + float *elapsed_milliseconds, + CUevent start, CUevent stop) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to get elapsed time between events: " + << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::WaitStreamOnEvent(CUcontext context, + CUstream stream, + CUevent event) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuStreamWaitEvent(stream, event, 0 /* = flags */); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not wait stream on event: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::SynchronizeContext(CUcontext context) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuCtxSynchronize(); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res) + << " :: " << port::CurrentStackTrace(); + 
return false;
+  }
+
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronizeStream(CUcontext context,
+                                                CUstream stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  CUresult res = dynload::cuStreamSynchronize(stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "could not synchronize on CUDA stream: " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
+    return false;
+  }
+  VLOG(2) << "successfully synchronized stream " << stream << " on context "
+          << context;
+  return true;
+}
+
+/* static */ bool CUDADriver::IsStreamIdle(CUcontext context, CUstream stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  CUresult res = dynload::cuStreamQuery(stream);
+  if (res == CUDA_SUCCESS) {
+    return true;
+  }
+
+  if (res != CUDA_ERROR_NOT_READY) {
+    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
+  }
+  return false;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyD2H(CUcontext context,
+                                                   void *host_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyDtoH_v2(host_dst, gpu_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to synchronously memcpy from device to host: %s; "
+        "host dst: %p; GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src), size, size);
+    return false;
+  }
+  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
+          << host_dst;
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyH2D(CUcontext context,
+                                                   CUdeviceptr gpu_dst,
+                                                   const void *host_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyHtoD_v2(gpu_dst, host_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to synchronously memcpy from host to device: %s; GPU dst: %p;"
+        " host src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size, size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyD2D(CUcontext context,
+                                                   CUdeviceptr gpu_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyDtoD_v2(gpu_dst, gpu_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to synchronously memcpy from device to device: %s; "
+        "GPU dst: %p; GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst),
+        port::bit_cast<void *>(gpu_src), size, size);
+    return false;
+  }
+  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ bool CUDADriver::AsynchronousMemcpyD2H(CUcontext context,
+                                                    void *host_dst,
+                                                    CUdeviceptr gpu_src,
+                                                    uint64 size,
+                                                    CUstream stream) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyDtoHAsync_v2(host_dst, gpu_src, size, stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
+        "GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src), size, size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
+          << " bytes from " << port::bit_cast<void *>(gpu_src) << " to " << host_dst
+          << " on stream " << stream;
+  return true;
+}
+
+/* static */ bool
CUDADriver::AsynchronousMemcpyH2D(CUcontext context, + CUdeviceptr gpu_dst, + const void *host_src, + uint64 size, + CUstream stream) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuMemcpyHtoDAsync_v2(gpu_dst, host_src, size, stream); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << port::Printf( + "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; " + "host src: %p; size: %llu=0x%llx", + ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size, size); + return false; + } + VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes" + << " on stream " << stream; + return true; +} + +/* static */ bool CUDADriver::AsynchronousMemcpyD2D(CUcontext context, + CUdeviceptr gpu_dst, + CUdeviceptr gpu_src, + uint64 size, + CUstream stream) { + ScopedActivateContext activation{context}; + CUresult result = + dynload::cuMemcpyDtoDAsync_v2(gpu_dst, gpu_src, size, stream); + if (result != CUDA_SUCCESS) { + LOG(ERROR) << port::Printf( + "failed to enqueue async memcpy from device to device: %s" + "; GPU dst: %p on %s %s" + "; GPU src: %p on %s %s" + "; can access? %s; size: %llu=0x%llx", + ToString(result).c_str(), port::bit_cast<void *>(gpu_dst), + CUDAPointerToMemorySpaceString(gpu_dst).c_str(), + CUDAPointerToDeviceString(gpu_dst).c_str(), port::bit_cast<void *>(gpu_src), + CUDAPointerToMemorySpaceString(gpu_src).c_str(), + CUDAPointerToDeviceString(gpu_src).c_str(), + CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size); + + return false; + } + VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes"; + return true; +} + +/* static */ port::Status CUDADriver::CreateEvent(CUcontext context, + CUevent *result, + EventFlags flags) { + int cuflags; + switch (flags) { + case EventFlags::kDefault: + cuflags = CU_EVENT_DEFAULT; + break; + case EventFlags::kDisableTiming: + cuflags = CU_EVENT_DISABLE_TIMING; + break; + default: + LOG(FATAL) << "impossible event flags: " << int(flags); + } + + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventCreate(result, cuflags); + + if (res == CUDA_SUCCESS) { + return port::Status::OK(); + } else if (res == CUDA_ERROR_OUT_OF_MEMORY) { + return port::Status{port::error::RESOURCE_EXHAUSTED, + "could not create CUDA event: out of device memory"}; + } else { + return port::Status{ + port::error::FAILED_PRECONDITION, + port::StrCat("could not create CUDA event: ", ToString(res))}; + } +} + +/* static */ int CUDADriver::GetDeviceCount() { + int device_count = 0; + CUresult res = dynload::cuDeviceGetCount(&device_count); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res); + return 0; + } + + if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) { + device_count = 1; + } + return device_count; +} + +/* static */ port::StatusOr<CUcontext> CUDADriver::GetPointerContext( + CUdeviceptr pointer) { + CUcontext context = nullptr; + CUresult result = dynload::cuPointerGetAttribute( + &context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer); + if (result == CUDA_SUCCESS) { + CHECK(context != nullptr) << "success should entail non-null context"; + return context; + } + + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to query device pointer for context: ", + ToString(result))}; +} + +/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace( + CUdeviceptr pointer) { + unsigned int value; + CUresult result = dynload::cuPointerGetAttribute( + &value, 
CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer); + if (result == CUDA_SUCCESS) { + switch (value) { + case CU_MEMORYTYPE_DEVICE: + return MemorySpace::kDevice; + case CU_MEMORYTYPE_HOST: + return MemorySpace::kHost; + default: + return port::Status{ + port::error::INTERNAL, + port::StrCat("unknown memory space provided by CUDA API: ", value)}; + } + } + + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to query device pointer for memory space: ", + ToString(result))}; +} + +/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr, + CUdeviceptr *base, + size_t *size) { + CUresult result = dynload::cuMemGetAddressRange(base, size, dptr); + if (result == CUDA_SUCCESS) { + return port::Status::OK(); + } else if (result == CUDA_ERROR_NOT_FOUND) { + // We differentiate between "this pointer is unknown" (return here) and + // "there was an internal error while performing this operation" (return + // below). + return port::Status{ + port::error::NOT_FOUND, + port::Printf("not a device pointer %p; %s", + reinterpret_cast<void *>(dptr), ToString(result).c_str())}; + } + + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to get pointer into for device pointer %p; %s", + reinterpret_cast<void *>(dptr), ToString(result).c_str())}; +} + +/* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice( + CUdeviceptr pointer) { + auto result = GetPointerContext(pointer); + if (!result.ok()) { + return result.status(); + } + + return DeviceFromContext(result.ValueOrDie()); +} + +/* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major, + int *cc_minor, + CUdevice device) { + *cc_major = 0; + *cc_minor = 0; + CUresult result = + dynload::cuDeviceComputeCapability(cc_major, cc_minor, device); + if (result == CUDA_SUCCESS) { + return port::Status::OK(); + } + + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to get compute capability for device: %s; %d", + ToString(result).c_str(), device)}; +} + +// Helper function that turns the integer output of cuDeviceGetAttribute to type +// T and wraps it in a StatusOr. 
+template <typename T> +static port::StatusOr<T> GetSimpleAttribute(CUdevice device, + CUdevice_attribute attribute) { + int value = -1; + CUresult result = dynload::cuDeviceGetAttribute(&value, attribute, device); + if (result != CUDA_SUCCESS) { + return port::Status{ + port::error::NOT_FOUND, + port::StrCat("could not retrieve CUDA device attribute (", attribute, + "): ", ToString(result))}; + } + T converted = value; + return converted; +} + +/* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount( + CUdevice device) { + return GetSimpleAttribute<int>(device, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore( + CUdevice device) { + return GetSimpleAttribute<int64>( + device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock( + CUdevice device) { + return GetSimpleAttribute<int64>( + device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor( + CUdevice device) { + return GetSimpleAttribute<int64>( + device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock( + CUdevice device) { + return GetSimpleAttribute<int64>(device, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock( + CUdevice device) { + return GetSimpleAttribute<int64>(device, + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp( + CUdevice device) { + return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE); +} + +/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z, + CUdevice device) { + int value; + CUresult res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query max grid dim x: " << ToString(res); + return false; + } + *x = value; + + res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query max grid dim y: " << ToString(res); + return false; + } + *y = value; + + res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query max grid dim z: " << ToString(res); + return false; + } + *z = value; + return true; +} + +/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) { + CUresult res = dynload::cuDriverGetVersion(driver_version); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query driver version: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::GetDeviceProperties(CUdevprop *device_properties, + int device_ordinal) { + CUresult res = + dynload::cuDeviceGetProperties(device_properties, device_ordinal); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query device properties: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) { + int value = -1; + CUresult res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query ECC status: " << ToString(res); + return false; + } + + *result = value; + return true; +} + 
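+// Illustrative call pattern for the StatusOr-returning attribute getters above
+// (editorial sketch, not part of the original change):
+//
+//   port::StatusOr<int> sm_count = CUDADriver::GetMultiprocessorCount(device);
+//   if (sm_count.ok()) {
+//     VLOG(1) << "multiprocessor count: " << sm_count.ValueOrDie();
+//   } else {
+//     LOG(WARNING) << sm_count.status();
+//   }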
+/* static */ bool CUDADriver::GetDeviceMemoryInfo(CUcontext context, + int64 *free_out, + int64 *total_out) { + ScopedActivateContext activation{context}; + size_t free = 0; + size_t total = 0; + CUresult res = dynload::cuMemGetInfo_v2(&free, &total); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query device memory info: " << ToString(res); + return false; + } + + *free_out = free; + *total_out = total; + return true; +} + +/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device, + uint64 *result) { + size_t value = -1; + CUresult res = dynload::cuDeviceTotalMem_v2(&value, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query total available memory: " << ToString(res); + return false; + } + + *result = value; + return true; +} + +/* static */ string CUDADriver::GetPCIBusID(CUdevice device) { + string pci_bus_id; + static const int kBufferSize = 64; + port::InlinedVector<char, 4> chars(kBufferSize); + chars[kBufferSize - 1] = '\0'; + CUresult res = + dynload::cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res); + return pci_bus_id; + } + pci_bus_id = chars.begin(); + return pci_bus_id; +} + +/* static */ bool CUDADriver::CanEnablePeerAccess(CUcontext from, + CUcontext to) { + if (from == to) { + return true; // A context can always access its own memory. + } + + int can_access_peer = -1; + auto from_device = DeviceFromContext(from); + if (!from_device.ok()) { + LOG(ERROR) << "failed to resolve 'from' peer access context to a device: " + << from_device.status(); + return false; + } + auto to_device = DeviceFromContext(to); + if (!to_device.ok()) { + LOG(ERROR) << "failed to resolve 'to' peer access context to a device: " + << to_device.status(); + return false; + } + CUresult res = dynload::cuDeviceCanAccessPeer( + &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie()); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to detect peer access capability: " << ToString(res); + return false; + } + + return can_access_peer; +} + +/* static */ port::Status CUDADriver::EnablePeerAccess(CUcontext from, + CUcontext to) { + if (from == to) { + return port::Status::OK(); // A context can always access its own memory. 
+ } + + ScopedActivateContext activated{from}; + CUresult result = dynload::cuCtxEnablePeerAccess(to, 0 /* = flags */); + if (result != CUDA_SUCCESS && + result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) { + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to enable peer access from %p to %p: %s", from, to, + ToString(result).c_str())}; + } + + return port::Status::OK(); +} + +/* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore( + CUcontext context, CUfunction kernel, int threads_per_block, + size_t dynamic_shared_memory_bytes) { + ScopedActivateContext activation{context}; + + int max_blocks; + CUresult result = dynload::cuOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes); + if (result != CUDA_SUCCESS) { + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to calculate occupancy of kernel %p: %s", kernel, + ToString(result).c_str())}; + } + + return max_blocks; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h new file mode 100644 index 0000000000..007db222d9 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_driver.h @@ -0,0 +1,460 @@ +// CUDA userspace driver library wrapper functionality. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_ + +#include <stddef.h> +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/cuda/multi_op_activation.h" +#include "tensorflow/stream_executor/device_options.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "third_party/gpus/cuda/include/cuda.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// Identifies the memory space where an allocation resides. See +// CUDADriver::GetPointerMemorySpace(). +enum class MemorySpace { kHost, kDevice }; + +// Returns a casual string, such as "host" for the provided memory space. +string MemorySpaceString(MemorySpace memory_space); + +// CUDADriver contains wrappers for calls to the userspace library driver. It's +// useful to isolate these calls and put basic wrappers around them to separate +// userspace library driver behaviors from the rest of the program. +// +// At the moment it's simply used as a namespace. +// +// The calls log any specific errors internally and return whether the operation +// was successful to the caller. +// +// The order of parameters is generally kept symmetric with the underlying CUDA +// driver API. +// +// Links on functions are to specific documentation under +// http://docs.nvidia.com/cuda/cuda-driver-api/ +// +// Thread safety: these functions should not be used from signal handlers. +class CUDADriver { + public: + // Wraps a call to cuInit with logging to help indicate what has gone wrong in + // the case of failure. Safe to call multiple times; will be fast on all calls + // after the first. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3 + static port::Status Init(); + + // Returns the device associated with the given context. + // device is an outparam owned by the caller, must not be null. 
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e + static port::StatusOr<CUdevice> DeviceFromContext(CUcontext context); + + // Creates a new CUDA stream associated with the given context via + // cuStreamCreate. + // stream is an outparam owned by the caller, must not be null. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4 + static bool CreateStream(CUcontext context, CUstream *stream); + + // Destroys a CUDA stream associated with the given context. + // stream is owned by the caller, must not be null, and *stream is set to null + // if the stream is successfuly destroyed. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758 + static void DestroyStream(CUcontext context, CUstream *stream); + + // CUDA events can explicitly disable event TSC retrieval for some presumed + // performance improvement if timing is unnecessary. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db + enum class EventFlags { kDefault, kDisableTiming }; + + // Creates a new event associated with the given context. + // result is an outparam owned by the caller and must not be null. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db + static port::Status CreateEvent(CUcontext context, CUevent *result, + EventFlags flags); + + // Destroys *event and turns it into a nullptr. event may not be null, but + // *event may be, via cuEventDestroy + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef + static port::Status DestroyEvent(CUcontext context, CUevent *event); + + // Allocates a GPU memory space of size bytes associated with the given + // context via cuMemAlloc. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467 + static void *DeviceAllocate(CUcontext context, uint64 bytes); + + // Deallocates a GPU memory space of size bytes associated with the given + // context via cuMemFree. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a + static void DeviceDeallocate(CUcontext context, void *location); + + // Allocates page-locked and CUDA-registered memory on the host via + // cuMemAllocHost. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0 + static void *HostAllocate(CUcontext context, uint64 bytes); + + // Deallocates a location created by HostAllocate, via cuMemFreeHost. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c + static void HostDeallocate(CUcontext context, void *location); + + // Registers a memory region at location of size bytes via cuMemHostRegister. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223 + static bool HostRegister(CUcontext context, void *location, uint64 bytes); + + // Unregisters a memory region that was previously registered at location via + // cuMemHostUnregister. 
+ // + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14 + // + // TODO(leary) verify an error will be returned if the location wasn't + // previously registered. + static bool HostUnregister(CUcontext context, void *location); + + // Given a device ordinal, returns a device handle into the device outparam, + // which must not be null. + // + // N.B. these device handles do not have a corresponding destroy function in + // the CUDA driver API. + static port::Status GetDevice(int device_ordinal, CUdevice *device); + + // Given a device handle, returns the name reported by the driver for the + // device. + static bool GetDeviceName(CUdevice device, string *name_out); + + // Given a device to create a context for, returns a context handle into the + // context outparam, which must not be null. + // + // N.B. CUDA contexts are weird. They are implicitly associated with the + // calling thread. Current documentation on contexts and their influence on + // userspace processes is given here: + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf + static port::Status CreateContext(CUdevice device, + DeviceOptions device_options, + CUcontext *context); + + // Destroys the provided context via cuCtxDestroy. + // Don't do this while clients could still be using the context, per the docs + // bad things will happen. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e + static void DestroyContext(CUcontext context); + + // Queries the runtime for the specified attribute of the specified function. + // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates + // in terms of integer-sized values, so there's no potential for overrun (as + // of CUDA 5.5). + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b + static bool FuncGetAttribute(CUfunction_attribute attribute, + CUfunction function, int *attribute_value); + + // Sets the preferred cache configuration for the specified function. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681 + static bool FuncSetCacheConfig(CUfunction function, + CUfunc_cache cache_config); + + // Gets the preferred shared memory bank configuration for the specified + // CONTEXT (not function!), either default or four- or eight-byte bank size. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74 + static port::StatusOr<CUsharedconfig> ContextGetSharedMemConfig( + CUcontext context); + + // Sets the preferred shared memory bank configuration for the specified + // CONTEXT (not function!), either default or four- or eight-byte bank size. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692 + static port::Status ContextSetSharedMemConfig( + CUcontext context, CUsharedconfig shared_mem_config); + + // Launches a CUDA kernel via cuLaunchKernel. + // TODO(leary) describe the structure of kernel_params and extra in a readable + // way. 
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15 + static bool LaunchKernel(CUcontext context, CUfunction function, + unsigned int grid_dim_x, unsigned int grid_dim_y, + unsigned int grid_dim_z, unsigned int block_dim_x, + unsigned int block_dim_y, unsigned int block_dim_z, + unsigned int shared_mem_bytes, CUstream stream, + void **kernel_params, void **extra); + + // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting + // handle in "module". Any error logs that are produced are logged internally. + static bool LoadPtx(CUcontext context, const char *ptx_contents, + CUmodule *module); + + // Loads cubin_bytes with the CUDA driver's blob loading interface and stores + // the resulting handle in "module". + static port::Status LoadCubin(CUcontext context, const char *cubin_bytes, + CUmodule *module); + + // Retrieves a named kernel from a loaded module, and places the resulting + // handle into function (outparam) on success. Neither kernel_name nor + // function may be null. No ownership is taken of kernel_name. + static bool GetModuleFunction(CUcontext context, CUmodule module, + const char *kernel_name, CUfunction *function); + + // Retrieves a named global/constant symbol from a loaded module, and returns + // a device pointer and size of the symbol on success. symbol_name may not be + // null. At least one of dptr or bytes should not be null. No ownership is + // taken of symbol_name. + static bool GetModuleSymbol(CUcontext context, CUmodule module, + const char *symbol_name, CUdeviceptr *dptr, + size_t *bytes); + + // Unloads module from the current context via cuModuleUnload. + // TODO(leary) the documentation doesn't say what kind of disasters happen + // if you try to unload a module while its CUfunctions are in use. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b + static void UnloadModule(CUcontext context, CUmodule module); + + // Performs a synchronous memset of the device memory segment via cuMemsetD8. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b + static bool SynchronousMemsetUint8(CUcontext context, CUdeviceptr location, + uint8 value, size_t size); + + // Performs a synchronous memset of the device memory segment via cuMemsetD32. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132 + static bool SynchronousMemsetUint32(CUcontext context, CUdeviceptr location, + uint32 value, size_t uint32_count); + + // Performs an asynchronous memset of the device memory segment via + // cuMemsetD32Async. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5 + static bool AsynchronousMemsetUint32(CUcontext context, CUdeviceptr location, + uint32 value, size_t uint32_count, + CUstream stream); + + // -- Synchronous memcopies. 
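+  // Illustrative round trip using the memcpy calls declared below together
+  // with DeviceAllocate/DeviceDeallocate above (editorial sketch, not part of
+  // the original header):
+  //
+  //   CUdeviceptr dst = reinterpret_cast<CUdeviceptr>(
+  //       CUDADriver::DeviceAllocate(context, size));
+  //   CUDADriver::SynchronousMemcpyH2D(context, dst, host_src, size);
+  //   CUDADriver::SynchronousMemcpyD2H(context, host_dst, dst, size);
+  //   CUDADriver::DeviceDeallocate(context, reinterpret_cast<void *>(dst));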
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169 + + static bool SynchronousMemcpyD2H(CUcontext context, void *host_dst, + CUdeviceptr gpu_src, uint64 size); + static bool SynchronousMemcpyH2D(CUcontext context, CUdeviceptr gpu_dst, + const void *host_src, uint64 size); + static bool SynchronousMemcpyD2D(CUcontext context, CUdeviceptr gpu_dst, + CUdeviceptr gpu_src, uint64 size); + + // -- Asynchronous memcopies. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362 + + static bool AsynchronousMemcpyD2H(CUcontext context, void *host_dst, + CUdeviceptr gpu_src, uint64 size, + CUstream stream); + static bool AsynchronousMemcpyH2D(CUcontext context, CUdeviceptr gpu_dst, + const void *host_src, uint64 size, + CUstream stream); + static bool AsynchronousMemcpyD2D(CUcontext context, CUdeviceptr gpu_dst, + CUdeviceptr gpu_src, uint64 size, + CUstream stream); + + // The CUDA stream callback type signature. + // The data passed to AddStreamCallback is subsequently passed to this + // callback when it fires. + // + // Some notable things: + // * Callbacks must not make any CUDA API calls. + // * Callbacks from independent streams execute in an undefined order and may + // be serialized. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483 + typedef void (*StreamCallback)(CUstream stream, CUresult status, void *data); + + // Enqueues a callback operation into stream. + // See StreamCallback above and the NVIDIA documentation for additional + // details. + static bool AddStreamCallback(CUcontext context, CUstream stream, + StreamCallback callback, void *data); + + // Causes stream to wait for event to trigger before proceeding via + // cuStreamWaitEvent. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM + static bool WaitStreamOnEvent(CUcontext context, CUstream stream, + CUevent event); + + // Blocks the calling thread until the operations enqueued onto stream have + // been completed, via cuStreamSynchronize. + // + // TODO(leary) if a pathological thread enqueues operations onto the stream + // while another thread blocks like this, can you wind up waiting an unbounded + // amount of time? + // + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad + static bool SynchronizeStream(CUcontext context, CUstream stream); + + // Blocks the calling thread until the operations associated with the context + // have been completed, via cuCtxSynchronize. + // + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616 + static bool SynchronizeContext(CUcontext context); + + // Returns true if all stream tasks have completed at time of the call. Note + // the potential for races around this call (if another thread adds work to + // the stream immediately after this returns). + static bool IsStreamIdle(CUcontext context, CUstream stream); + + // Returns whether code in the from context can access memory in the to + // context via cuDeviceCanAccessPeer. 
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e + static bool CanEnablePeerAccess(CUcontext from, CUcontext to); + + // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a + static port::Status EnablePeerAccess(CUcontext from, CUcontext to); + + // Returns the elapsed milliseconds between start and stop via + // cuEventElapsedTime. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97 + static bool GetEventElapsedTime(CUcontext context, + float *elapsed_milliseconds, CUevent start, + CUevent stop); + + // Records that an event occurred when execution reaches the current point in + // thestream via cuEventRecord. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1 + static port::Status RecordEvent(CUcontext context, CUevent event, + CUstream stream); + + // Polls (without blocking) to determine the status of an event - pending or + // complete (or an error status). + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef + static port::StatusOr<CUresult> QueryEvent(CUcontext context, CUevent event); + + // -- Pointer-specific calls. + + // Returns the context in which pointer was allocated or registered. + static port::StatusOr<CUcontext> GetPointerContext(CUdeviceptr pointer); + + // Returns the device associated with the context from GetPointerContext(). + static port::StatusOr<CUdevice> GetPointerDevice(CUdeviceptr pointer); + + // Returns the memory space addressed by pointer. + static port::StatusOr<MemorySpace> GetPointerMemorySpace(CUdeviceptr pointer); + + // Returns the base address and size of the device pointer dptr. + static port::Status GetPointerAddressRange(CUdeviceptr dptr, + CUdeviceptr *base, size_t *size); + + // -- Device-specific calls. + + // Returns the compute capability for the device; i.e (3, 5). + // This is currently done via the deprecated device API. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea + static port::Status GetComputeCapability(int *cc_major, int *cc_minor, + CUdevice device); + + // Returns the number of multiprocessors on the device (note that the device + // may be multi-GPU-per-board). + static port::StatusOr<int> GetMultiprocessorCount(CUdevice device); + + // Returns the limit on number of threads that can be resident in a single + // multiprocessor. + static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(CUdevice device); + + // Returns the limit on number of threads which may be resident for a single + // block (cooperative thread array). + static port::StatusOr<int64> GetMaxThreadsPerBlock(CUdevice device); + + // Returns the amount of shared memory available on a single GPU core (i.e. + // SM on NVIDIA devices). + static port::StatusOr<int64> GetMaxSharedMemoryPerCore(CUdevice device); + + // Returns the amount of shared memory available for a single block + // (cooperative thread array). + static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(CUdevice device); + + // Returns the maximum supported number of registers per block. 
+  static port::StatusOr<int64> GetMaxRegistersPerBlock(CUdevice device);
+
+  // Returns the number of threads per warp.
+  static port::StatusOr<int64> GetThreadsPerWarp(CUdevice device);
+
+  // Queries the grid limits for device with cuDeviceGetAttribute calls.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool GetGridLimits(int *x, int *y, int *z, CUdevice device);
+
+  // Returns a grab-bag of device properties in a caller-owned device_properties
+  // structure for device_ordinal via cuDeviceGetProperties.
+  // This call is deprecated in the NVIDIA driver API.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
+  static bool GetDeviceProperties(CUdevprop *device_properties,
+                                  int device_ordinal);
+
+  // Returns whether ECC is enabled for the given CUdevice via
+  // cuDeviceGetAttribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool IsEccEnabled(CUdevice device, bool *result);
+
+  // Returns the total amount of memory available for allocation by the CUDA
+  // context, in bytes, via cuDeviceTotalMem.
+  static bool GetDeviceTotalMemory(CUdevice device, uint64 *result);
+
+  // Returns the free amount of memory and total amount of memory, as reported
+  // by cuMemGetInfo.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
+  static bool GetDeviceMemoryInfo(CUcontext context, int64 *free, int64 *total);
+
+  // Returns a PCI bus id string for the device.
+  // [domain]:[bus]:[device].[function]
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
+  static string GetPCIBusID(CUdevice device);
+
+  // -- Context- and device-independent calls.
+
+  // Returns the number of visible CUDA devices via cuDeviceGetCount.
+  // This should correspond to the set of device ordinals available.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
+  static int GetDeviceCount();
+
+  // Returns the driver version number via cuDriverGetVersion.
+  // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
+  // instead, the CUDA toolkit release number that this driver is compatible
+  // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
+  // compatible driver).
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
+  static bool GetDriverVersion(int *driver_version);
+
+  // -- Other calls
+
+  // Returns the maximum number of blocks (per multiprocessor) occupied by the
+  // specified kernel/CUfunction when launched with the specified parameters.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
+  static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
+      CUcontext context, CUfunction kernel, int threads_per_block,
+      size_t dynamic_shared_memory_bytes);
+
+  // Seam for injecting an error at CUDA initialization time for testing
+  // purposes.
+  static bool driver_inject_init_error_;
+};
+
+// Ensures a context is activated within a scope.
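+// Illustrative usage (editorial sketch, not part of the original header):
+//
+//   void RunOnContext(CUcontext context) {
+//     ScopedActivateContext activated{context};
+//     // Driver calls made in this scope see `context` as the current
+//     // context; the prior context is restored (or checked) when
+//     // `activated` goes out of scope.
+//   }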
+class ScopedActivateContext { + public: + // Activates the context via cuCtxSetCurrent, if it is not the currently + // active context (a la cuCtxGetCurrent). Note the alternative push/pop + // mechanism is said by NVIDIA to be relatively slow and deprecated. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7 + explicit ScopedActivateContext( + CUcontext context, MultiOpActivation moa = MultiOpActivation::kNo); + + // Checks that the context has remained activated for the duration of the + // scope. + ~ScopedActivateContext(); + + private: + CUcontext context_; // context being activated. + + CUcontext prior_context_; // context that was active when we were activated. + + // Stores whether this was instantiated during a MultiOpActivation, in which + // case we will not pop the context when we're destroyed (we will leave it to + // the parent MultiOpActivation that we were nested within). + bool previously_in_multi_op_activation_; +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_event.cc b/tensorflow/stream_executor/cuda/cuda_event.cc new file mode 100644 index 0000000000..a87c868c6b --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_event.cc @@ -0,0 +1,56 @@ +#include "tensorflow/stream_executor/cuda/cuda_event.h" + +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +CUDAEvent::CUDAEvent(CUDAExecutor* parent) + : parent_(parent), cuda_event_(nullptr) {} + +CUDAEvent::~CUDAEvent() {} + +port::Status CUDAEvent::Init() { + return CUDADriver::CreateEvent(parent_->cuda_context(), &cuda_event_, + CUDADriver::EventFlags::kDisableTiming); +} + +port::Status CUDAEvent::Destroy() { + return CUDADriver::DestroyEvent(parent_->cuda_context(), &cuda_event_); +} + +port::Status CUDAEvent::Record(CUDAStream* stream) { + return CUDADriver::RecordEvent(parent_->cuda_context(), cuda_event_, + stream->cuda_stream()); +} + +Event::Status CUDAEvent::PollForStatus() { + port::StatusOr<CUresult> status = + CUDADriver::QueryEvent(parent_->cuda_context(), cuda_event_); + if (!status.ok()) { + LOG(ERROR) << "Error polling for event status: " + << status.status().error_message(); + return Event::Status::kError; + } + + switch (status.ValueOrDie()) { + case CUDA_SUCCESS: + return Event::Status::kComplete; + case CUDA_ERROR_NOT_READY: + return Event::Status::kPending; + default: + LOG(INFO) << "Error condition returned for event status: " + << status.ValueOrDie(); + return Event::Status::kError; + } +} + +const CUevent& CUDAEvent::cuda_event() { + return cuda_event_; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_event.h b/tensorflow/stream_executor/cuda/cuda_event.h new file mode 100644 index 0000000000..c5b65662db --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_event.h @@ -0,0 +1,49 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_ + +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/lib/status.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// CUDAEvent 
wraps a CUevent in the platform-independent EventInterface +// interface. +class CUDAEvent : public internal::EventInterface { + public: + explicit CUDAEvent(CUDAExecutor* parent); + + ~CUDAEvent() override; + + // Populates the CUDA-platform-specific elements of this object. + port::Status Init(); + + // Deallocates any platform-specific elements of this object. This is broken + // out (not part of the destructor) to allow for error reporting. + port::Status Destroy(); + + // Inserts the event at the current position into the specified stream. + port::Status Record(CUDAStream* stream); + + // Polls the CUDA platform for the event's current status. + Event::Status PollForStatus(); + + // The underyling CUDA event element. + const CUevent& cuda_event(); + + private: + // The Executor used to which this object and CUevent are bound. + CUDAExecutor* parent_; + + // The underlying CUDA event element. + CUevent cuda_event_; +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc new file mode 100644 index 0000000000..59c3159895 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_fft.cc @@ -0,0 +1,327 @@ +#include "tensorflow/stream_executor/cuda/cuda_fft.h" + +#include <dlfcn.h> + +#include <complex> + +#include "tensorflow/stream_executor/cuda/cuda_activation.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/cuda/cuda_helpers.h" +#include "tensorflow/stream_executor/cuda/cuda_platform.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin); + +namespace dynload { + +// This macro wraps a global identifier, given by __name, in a callable +// structure that loads the DLL symbol out of the DSO handle in a thread-safe +// manner on first use. This dynamic loading technique is used to avoid DSO +// dependencies on vendor libraries which may or may not be available in the +// deployed binary environment. +#define PERFTOOLS_GPUTOOLS_CUFFT_WRAP(__name) \ + struct DynLoadShim__##__name { \ + static const char *kName; \ + using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \ + static void *GetDsoHandle() { \ + static auto status = internal::CachedDsoLoader::GetCufftDsoHandle(); \ + return status.ValueOrDie(); \ + } \ + static FuncPointerT DynLoad() { \ + static void *f = dlsym(GetDsoHandle(), kName); \ + CHECK(f != nullptr) << "could not find " << kName \ + << " in cuFFT DSO; dlerror: " << dlerror(); \ + return reinterpret_cast<FuncPointerT>(f); \ + } \ + template <typename... Args> \ + cufftResult operator()(CUDAExecutor * parent, Args... 
args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +#define CUFFT_ROUTINE_EACH(__macro) \ + __macro(cufftDestroy) __macro(cufftSetStream) __macro(cufftPlan1d) \ + __macro(cufftPlan2d) __macro(cufftPlan3d) __macro(cufftPlanMany) \ + __macro(cufftExecD2Z) __macro(cufftExecZ2D) __macro(cufftExecC2C) \ + __macro(cufftExecC2R) __macro(cufftExecZ2Z) \ + __macro(cufftExecR2C) + +CUFFT_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUFFT_WRAP) + +} // namespace dynload + +namespace { + +// A helper function transforming gpu_fft arguments into cuFFT arguments. +cufftType CUDAFftType(fft::Type type) { + switch (type) { + case fft::Type::kC2CForward: + case fft::Type::kC2CInverse: + return CUFFT_C2C; + case fft::Type::kC2R: + return CUFFT_C2R; + case fft::Type::kR2C: + return CUFFT_R2C; + case fft::Type::kZ2ZForward: + case fft::Type::kZ2ZInverse: + return CUFFT_Z2Z; + case fft::Type::kZ2D: + return CUFFT_Z2D; + case fft::Type::kD2Z: + return CUFFT_D2Z; + default: + LOG(FATAL) << "Invalid value of fft::Type."; + } +} + +// Associates the given stream with the given cuFFT plan. +bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) { + auto ret = dynload::cufftSetStream(parent, plan, AsCUDAStreamValue(stream)); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to run cuFFT routine cufftSetStream: " << ret; + return false; + } + return true; +} + +} // namespace + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, fft::Type type) + : parent_(parent), fft_type_(type) { + auto ret = dynload::cufftPlan1d(parent, &plan_, num_x, CUDAFftType(type), + 1 /* = batch */); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT 1d plan:" << ret; + } +} + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, + fft::Type type) + : parent_(parent), fft_type_(type) { + auto ret = + dynload::cufftPlan2d(parent, &plan_, num_x, num_y, CUDAFftType(type)); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT 2d plan:" << ret; + } +} + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, + uint64 num_z, fft::Type type) + : parent_(parent), fft_type_(type) { + auto ret = dynload::cufftPlan3d(parent, &plan_, num_x, num_y, num_z, + CUDAFftType(type)); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT 3d plan:" << ret; + } +} + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, int rank, uint64 *elem_count, + uint64 *input_embed, uint64 input_stride, + uint64 input_distance, uint64 *output_embed, + uint64 output_stride, uint64 output_distance, + fft::Type type, int batch_count) + : parent_(parent), fft_type_(type) { + int elem_count_[3], input_embed_[3], output_embed_[3]; + for (int i = 0; i < rank; ++i) { + elem_count_[i] = elem_count[i]; + if (input_embed) { + input_embed_[i] = input_embed[i]; + } + if (output_embed) { + output_embed_[i] = output_embed[i]; + } + } + auto ret = dynload::cufftPlanMany( + parent, &plan_, rank, elem_count_, input_embed ? input_embed_ : nullptr, + input_stride, input_distance, output_embed ? 
output_embed_ : nullptr, + output_stride, output_distance, CUDAFftType(type), batch_count); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT batched plan:" << ret; + } +} + +CUDAFftPlan::~CUDAFftPlan() { dynload::cufftDestroy(parent_, plan_); } + +int CUDAFftPlan::GetFftDirection() const { + switch (fft_type_) { + case fft::Type::kC2CForward: + case fft::Type::kZ2ZForward: + case fft::Type::kR2C: + case fft::Type::kD2Z: + return CUFFT_FORWARD; + case fft::Type::kC2CInverse: + case fft::Type::kZ2ZInverse: + case fft::Type::kC2R: + case fft::Type::kZ2D: + return CUFFT_INVERSE; + default: + LOG(FATAL) << "Invalid value of fft::Type."; + } +} + +std::unique_ptr<fft::Plan> CUDAFft::Create1dPlan(Stream *stream, uint64 num_x, + fft::Type type, + bool in_place_fft) { + std::unique_ptr<fft::Plan> plan{new CUDAFftPlan(parent_, num_x, type)}; + return plan; +} + +std::unique_ptr<fft::Plan> CUDAFft::Create2dPlan(Stream *stream, uint64 num_x, + uint64 num_y, fft::Type type, + bool in_place_fft) { + std::unique_ptr<fft::Plan> plan{new CUDAFftPlan(parent_, num_x, num_y, type)}; + return plan; +} + +std::unique_ptr<fft::Plan> CUDAFft::Create3dPlan(Stream *stream, uint64 num_x, + uint64 num_y, uint64 num_z, + fft::Type type, + bool in_place_fft) { + std::unique_ptr<fft::Plan> plan{ + new CUDAFftPlan(parent_, num_x, num_y, num_z, type)}; + return plan; +} + +std::unique_ptr<fft::Plan> CUDAFft::CreateBatchedPlan( + Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed, + uint64 input_stride, uint64 input_distance, uint64 *output_embed, + uint64 output_stride, uint64 output_distance, fft::Type type, + bool in_place_fft, int batch_count) { + std::unique_ptr<fft::Plan> plan{new CUDAFftPlan( + parent_, rank, elem_count, input_embed, input_stride, input_distance, + output_embed, output_stride, output_distance, type, batch_count)}; + return plan; +} + +template <typename FuncT, typename InputT, typename OutputT> +bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output) { + CUDAFftPlan *cuda_fft_plan = dynamic_cast<CUDAFftPlan *>(plan); + if (cuda_fft_plan == nullptr) { + LOG(ERROR) << "the passed-in plan is not a CUDAFftPlan object."; + return false; + } + + if (!SetStream(parent_, cuda_fft_plan->GetPlan(), stream)) { + return false; + } + + auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(), + CUDAComplex(const_cast<InputT *>(CUDAMemory(input))), + CUDAComplex(CUDAMemoryMutable(output))); + + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to run cuFFT routine: " << ret; + return false; + } + + return true; +} + +template <typename FuncT, typename InputT, typename OutputT> +bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan, + FuncT cufftExec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output) { + CUDAFftPlan *cuda_fft_plan = dynamic_cast<CUDAFftPlan *>(plan); + if (cuda_fft_plan == nullptr) { + LOG(ERROR) << "the passed-in plan is not a CUDAFftPlan object."; + return false; + } + + if (!SetStream(parent_, cuda_fft_plan->GetPlan(), stream)) { + return false; + } + + auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(), + CUDAComplex(const_cast<InputT *>(CUDAMemory(input))), + CUDAComplex(CUDAMemoryMutable(output)), + cuda_fft_plan->GetFftDirection()); + + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to run cuFFT routine: " << ret; + return false; + } + + return true; +} + +#define PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(__type, 
__fft_type1, __fft_type2, \ + __fft_type3) \ + bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<std::complex<__type>> &input, \ + DeviceMemory<std::complex<__type>> *output) { \ + return DoFftWithDirectionInternal( \ + stream, plan, dynload::cufftExec##__fft_type1, input, output); \ + } \ + bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<__type> &input, \ + DeviceMemory<std::complex<__type>> *output) { \ + return DoFftInternal(stream, plan, dynload::cufftExec##__fft_type2, input, \ + output); \ + } \ + bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<std::complex<__type>> &input, \ + DeviceMemory<__type> *output) { \ + return DoFftInternal(stream, plan, dynload::cufftExec##__fft_type3, input, \ + output); \ + } + +PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(float, C2C, R2C, C2R) +PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D) + +#undef PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +namespace gpu = ::perftools::gputools; + +REGISTER_MODULE_INITIALIZER(register_cufft, { + gpu::port::Status status = + gpu::PluginRegistry::Instance() + ->RegisterFactory<gpu::PluginRegistry::FftFactory>( + gpu::cuda::kCudaPlatformId, gpu::cuda::kCuFftPlugin, "cuFFT", + [](gpu::internal::StreamExecutorInterface + *parent) -> gpu::fft::FftSupport * { + gpu::cuda::CUDAExecutor *cuda_executor = + dynamic_cast<gpu::cuda::CUDAExecutor *>(parent); + if (cuda_executor == nullptr) { + LOG(ERROR) + << "Attempting to initialize an instance of the cuFFT " + << "support library with a non-CUDA StreamExecutor"; + return nullptr; + } + + return new gpu::cuda::CUDAFft(cuda_executor); + }); + if (!status.ok()) { + LOG(ERROR) << "Unable to register cuFFT factory: " + << status.error_message(); + } + + // Prime the cuFFT DSO. The loader will log more information. + auto statusor = gpu::internal::CachedDsoLoader::GetCufftDsoHandle(); + if (!statusor.ok()) { + LOG(INFO) << "Unable to load cuFFT DSO."; + } + + gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId, + gpu::PluginKind::kFft, + gpu::cuda::kCuFftPlugin); +}); diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h new file mode 100644 index 0000000000..2577c2952e --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_fft.h @@ -0,0 +1,95 @@ +// CUDA-specific support for FFT functionality -- this wraps the cuFFT library +// capabilities, and is only included into CUDA implementation code -- it will +// not introduce cuda headers into other code. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_ + +#include "tensorflow/stream_executor/fft.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "third_party/gpus/cuda/include/cufft.h" + +namespace perftools { +namespace gputools { + +class Stream; + +namespace cuda { + +class CUDAExecutor; + +// Opaque and unique indentifier for the cuFFT plugin. +extern const PluginId kCuFftPlugin; + +class CUDAFftPlan : public fft::Plan { + public: + // Constructor creating 1d FFT plan. + CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, fft::Type type); + // Constructor creating 2d FFT plan. + CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, fft::Type type); + // Constructor creating 3d FFT plan. 
+ CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, uint64 num_z, + fft::Type type); + // Constructor creating batched FFT plan. + CUDAFftPlan(CUDAExecutor *parent, int rank, uint64 *elem_count, + uint64 *input_embed, uint64 input_stride, uint64 input_distance, + uint64 *output_embed, uint64 output_stride, + uint64 output_distance, fft::Type type, int batch_count); + ~CUDAFftPlan() override; + + // Get FFT direction in cuFFT based on FFT type. + int GetFftDirection() const; + cufftHandle GetPlan() const { return plan_; } + + private: + CUDAExecutor *parent_; + cufftHandle plan_; + fft::Type fft_type_; +}; + +// FFT support for CUDA platform via cuFFT library. +// +// This satisfies the platform-agnostic FftSupport interface. +// +// Note that the cuFFT handle that this encapsulates is implicitly tied to the +// context (and, as a result, the device) that the parent CUDAExecutor is tied +// to. This simply happens as an artifact of creating the cuFFT handle when a +// CUDA context is active. +// +// Thread-safe. The CUDA context associated with all operations is the CUDA +// context of parent_, so all context is explicit. +class CUDAFft : public fft::FftSupport { + public: + explicit CUDAFft(CUDAExecutor *parent) : parent_(parent) {} + ~CUDAFft() override {} + + TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES + + private: + CUDAExecutor *parent_; + + // Two helper functions that execute dynload::cufftExec?2?. + + // This is for complex to complex FFT, when the direction is required. + template <typename FuncT, typename InputT, typename OutputT> + bool DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan, + FuncT cufft_exec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output); + + // This is for complex to real or real to complex FFT, when the direction + // is implied. 
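  // For reference, the overload-to-cuFFT-entry-point mapping produced by the
  // PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT expansion in cuda_fft.cc is:
  //
  //   complex<float>  -> complex<float>  : cufftExecC2C  (direction passed)
  //   float           -> complex<float>  : cufftExecR2C
  //   complex<float>  -> float           : cufftExecC2R
  //   complex<double> -> complex<double> : cufftExecZ2Z  (direction passed)
  //   double          -> complex<double> : cufftExecD2Z
  //   complex<double> -> double          : cufftExecZ2D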
+ template <typename FuncT, typename InputT, typename OutputT> + bool DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufft_exec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output); + + SE_DISALLOW_COPY_AND_ASSIGN(CUDAFft); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc new file mode 100644 index 0000000000..77f16e2a6e --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc @@ -0,0 +1,1082 @@ +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" + +#include <unistd.h> + +#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_event.h" +#include "tensorflow/stream_executor/cuda/cuda_platform.h" +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/cuda/cuda_timer.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/kernel_cache_config.h" +#include "tensorflow/stream_executor/lib/casts.h" +#include "tensorflow/stream_executor/lib/env.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/mathutil.h" +#include "tensorflow/stream_executor/lib/path.h" +#include "tensorflow/stream_executor/lib/process_state.h" +#include "tensorflow/stream_executor/lib/ptr_util.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/lib/str_util.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/timer.h" +#include "tensorflow/stream_executor/lib/numbers.h" + +#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_ +#error \ + "No driver calls in this file, wrap driver functionality in cuda_driver.cc." +#endif + +#ifdef __CUDA_RUNTIME_H__ +#error \ + "CUDA runtime being included into CUDA GPU executor; should be driver only." +#endif + +extern bool FLAGS_check_gpu_leaks; +tensorflow::int32 FLAGS_register_occupancy_warning_threshold; +bool FLAGS_prefer_cubin_to_ptx = true; + +namespace perftools { +namespace gputools { +namespace rng { +class RngSupport; +} // namespace rng +} // namespace gputools +} // namespace perftools + +namespace perftools { +namespace gputools { +namespace cuda { + +// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver. +// It has been observed that loading both PTX and cubins into the driver library +// can cause it to crash, but loading only CUBINs avoids those crashes; +// therefore, it's useful to have this hook to hack in uniform CUBIN-ation of +// PTX code. +// +// As this is an implementation-detail workaround, the usage is to declare this +// variable with extern linkage and populate it from another translation unit. 
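// For example, another translation unit might install a hook like this
// (sketch only; CompilePtxToCubin is a hypothetical helper):
//
//   namespace perftools { namespace gputools { namespace cuda {
//   extern std::function<string(const string &)> g_cubinate;
//   }  // namespace cuda
//   }  // namespace gputools
//   }  // namespace perftools
//
//   static bool InstallCubinateHook() {
//     perftools::gputools::cuda::g_cubinate = [](const string &ptx) {
//       return CompilePtxToCubin(ptx);  // hypothetical PTX -> CUBIN compiler
//     };
//     return true;
//   }
//   static bool hook_installed = InstallCubinateHook();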
+std::function<string(const string &)> g_cubinate; + +static CUDAEvent *AsCUDAEvent(Event *event) { + DCHECK(event != nullptr); + return static_cast<CUDAEvent *>(event->implementation()); +} + +// Given a platform-independent stream datatype, returns the internal CUDA +// platform implementation pointer. +static CUDAStream *AsCUDAStream(Stream *stream) { + DCHECK(stream != nullptr); + return static_cast<CUDAStream *>(stream->implementation()); +} + +// Given a platform-independent stream datatype, returns the platform +// implementation's internal value, suitable for passing directly to libcuda +// APIs. +CUstream AsCUDAStreamValue(Stream *stream) { + DCHECK(stream != nullptr); + return AsCUDAStream(stream)->cuda_stream(); +} + +// Given a platform-independent timer datatype, returns the internal CUDA +// platform implementation pointer. +static CUDATimer *AsCUDATimer(Timer *timer) { + DCHECK(timer != nullptr); + return static_cast<CUDATimer *>(timer->implementation()); +} + +// Given const GPU memory, returns a libcuda device pointer datatype, suitable +// for passing directly to libcuda APIs. +// +// N.B. we must lose constness in order to pass a suitable type to the existing +// libcuda APIs, so the caller should take care to only pass the result of const +// GPU memory conversions to libcuda functions which will honor constness. +static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase &gpu_mem) { + return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque()); +} + +// See description on const version above. +static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) { + return AsCudaDevicePtr(*gpu_mem); +} + +static CUcontext GetCudaContext(Stream *stream) { + return static_cast<CUDAExecutor *>(stream->parent()->implementation()) + ->cuda_context(); +} + +CUcontext ExtractCudaContext(CUDAExecutor *cuda_exec) { + CHECK(cuda_exec != nullptr); + return cuda_exec->cuda_context(); +} + +CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec) { + return static_cast<CUDAExecutor *>(stream_exec->implementation()); +} + +CUDAExecutor::~CUDAExecutor() { + for (auto &it : disk_modules_) { + CUDADriver::UnloadModule(context_, it.second); + } + for (auto &it : in_memory_modules_) { + CUDADriver::UnloadModule(context_, it.second); + } + if (context_ != nullptr) { + CUDADriver::DestroyContext(context_); + } +} + +port::Status CUDAExecutor::Init(int device_ordinal, + DeviceOptions device_options) { + device_ordinal_ = device_ordinal; + + auto status = CUDADriver::Init(); + if (!status.ok()) { + return status; + } + + status = CUDADriver::GetDevice(device_ordinal_, &device_); + if (!status.ok()) { + return status; + } + + status = CUDADriver::CreateContext(device_, device_options, &context_); + if (!status.ok()) { + return status; + } + + return CUDADriver::GetComputeCapability(&cc_major_, &cc_minor_, device_); +} + +bool CUDAExecutor::FindOnDiskForComputeCapability( + port::StringPiece filename, port::StringPiece canonical_suffix, + string *found_filename) const { + if (cc_major_ == 0 && cc_minor_ == 0) { + return false; + } + + // TODO(22689637): Eliminate unnecessary ToString()s when all dependencies + // have been migrated. 
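  // The candidate name is the base filename with ".cc<major><minor>" and the
  // canonical suffix appended; e.g. filename "foo.ptx" with canonical_suffix
  // ".ptx" on a compute capability 3.5 device yields "foo.ptx.cc35.ptx".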
+ string cc_specific = port::StrCat(filename.ToString(), ".cc", cc_major_, + cc_minor_, canonical_suffix.ToString()); + if (port::FileExists(cc_specific)) { + VLOG(2) << "found compute-capability-specific file, using that: " + << cc_specific; + *found_filename = cc_specific; + return true; + } + + VLOG(2) << "could not find compute-capability specific file at: " + << cc_specific; + if (port::FileExists(filename.ToString())) { + *found_filename = filename.ToString(); + return true; + } + + return false; +} + +// Returns the path to the running executable. +// N.B. Derived from //knowledge/smalltalk/background_kb.cc +// Arg: strip_exe: if true, remove the name of the executable itself from the +// returned string. Example: calling this from /usr/bin/foo +// would return /usr/bin. +static string GetBinaryDir(bool strip_exe) { + char exe_path[PATH_MAX] = {0}; + CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1)); + // Make sure it's null-terminated: + exe_path[sizeof(exe_path) - 1] = 0; + + if (strip_exe) { + // The exe is the last component of the path, so remove one component. + string ret = exe_path; + std::vector<string> components = port::Split(exe_path, '/'); + components.pop_back(); + return port::Join(components, "/"); + } + return exe_path; +} + +// Returns the location of the runfiles directory. +// This is the directory which "bazel run" sets as the current working directory +// before the program starts. +// N.B. This doesn't have to be running under "bazel run" in order to get the +// appropriate runfiles directory. +static string GetRunfilesDir() { + return port::StrCat(GetBinaryDir(false), ".runfiles"); +} + +bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) { + CUDAKernel *cuda_kernel = AsCUDAKernel(kernel); + CUmodule module = nullptr; + const string *kernelname; + + const OnDiskKernelLoaderSpec *on_disk_spec = nullptr; + bool has_ptx = spec.has_cuda_ptx_on_disk(); + bool has_cubin = spec.has_cuda_cubin_on_disk(); + if (has_cubin && (!has_ptx || FLAGS_prefer_cubin_to_ptx)) { + on_disk_spec = &spec.cuda_cubin_on_disk(); + } else if (has_ptx) { + on_disk_spec = &spec.cuda_ptx_on_disk(); + } + + if (on_disk_spec != nullptr) { + } else if (spec.has_cuda_ptx_in_memory()) { + kernelname = &spec.cuda_ptx_in_memory().kernelname(); + + if (cc_major_ == 0 && cc_minor_ == 0) { + return false; + } + + // Note that the orignal ptx may be compressed, and the ptx we get below is + // the decompressed result. To cache the module we should use the original + // ptx (compressed one) as the key. This is because for the same compressed + // ptx, we may get different decompressed ptx wrt the pointer value. 
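  // (in_memory_modules_ is keyed by const char * and compares pointer
  // values, so the key must be a pointer that stays stable across lookups;
  // the original text provides that stability.)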
+ const char *ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_); + const char *orig_ptx = + spec.cuda_ptx_in_memory().original_text(cc_major_, cc_minor_); + if (ptx == nullptr || orig_ptx == nullptr) { + ptx = spec.cuda_ptx_in_memory().default_text(); + orig_ptx = spec.cuda_ptx_in_memory().original_default_text(); + } + if (ptx == nullptr || orig_ptx == nullptr) { + LOG(FATAL) << "could not load ptx for kernel " << kernelname; + return false; + } + + mutex_lock lock{in_memory_modules_mu_}; + module = in_memory_modules_[orig_ptx]; + + if (module == nullptr) { + if (g_cubinate == nullptr) { + if (!CUDADriver::LoadPtx(context_, ptx, &module)) { + return false; + } + } else { + string cubin = g_cubinate(ptx); + auto load_status = + CUDADriver::LoadCubin(context_, cubin.c_str(), &module); + if (!load_status.ok()) { + LOG(ERROR) << "failed to load cubin via hook: " << load_status; + return false; + } + } + in_memory_modules_[orig_ptx] = module; + } + } else if (spec.has_cuda_cubin_in_memory()) { + kernelname = &spec.cuda_cubin_in_memory().kernelname(); + const char *cubin = spec.cuda_cubin_in_memory().bytes(); + mutex_lock lock{in_memory_modules_mu_}; + module = in_memory_modules_[cubin]; + + if (module == nullptr) { + auto load_status = CUDADriver::LoadCubin(context_, cubin, &module); + if (!load_status.ok()) { + LOG(ERROR) << "failed to load CUBIN: " << load_status; + return false; + } + + in_memory_modules_[cubin] = module; + } + } else { + LOG(WARNING) << "no method of loading CUDA kernel provided"; + return false; + } + + VLOG(2) << "getting function " << kernelname << " from module " << module; + if (!CUDADriver::GetModuleFunction(context_, module, kernelname->c_str(), + cuda_kernel->cuda_function_ptr())) { + return false; + } + + // We have to trust the kernel loader spec arity because there doesn't appear + // to be a way to reflect on the number of expected arguments w/the CUDA API. 
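  // A mismatch only surfaces later: Launch() CHECK-fails if the argument
  // count passed at launch time differs from the arity recorded here.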
+ cuda_kernel->set_arity(spec.arity()); + + KernelMetadata kernel_metadata; + if (!GetKernelMetadata(cuda_kernel, &kernel_metadata)) { + LOG(WARNING) << "Unable to get metadata for kernel " << kernelname; + } + kernel->set_metadata(kernel_metadata); + kernel->set_name(*kernelname); + return true; +} + +bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel, + KernelMetadata *kernel_metadata) { + int value; + if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS, + *cuda_kernel->cuda_function_ptr(), + &value)) { + return false; + } + kernel_metadata->set_registers_per_thread(value); + + if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + *cuda_kernel->cuda_function_ptr(), + &value)) { + return false; + } + kernel_metadata->set_shared_memory_bytes(value); + + return true; +} + +bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &kernel, + const std::vector<KernelArg> &args) { + CHECK_EQ(kernel.Arity(), args.size()); + CUstream custream = AsCUDAStreamValue(stream); + const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel); + CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue(); + + std::vector<void *> addrs; + addrs.reserve(args.size()); + int shmem_bytes = 0; + for (size_t i = 0; i < args.size(); i++) { + switch (args[i].type) { + case KernelArg::kNormal: + addrs.push_back(const_cast<void *>( + static_cast<const void *>(args[i].data.begin()))); + break; + case KernelArg::kSharedMemory: + shmem_bytes += args[i].bytes; + break; + default: + LOG(ERROR) << "Invalid kernel arg type passed (" << args[i].type + << ") for arg " << i; + return false; + } + } + + // Only perform/print the occupancy check 1x. + launched_kernels_mu_.lock(); + if (launched_kernels_.find(cufunc) == launched_kernels_.end()) { + OccupancyCheck(kernel, thread_dims, block_dims); + // TODO(rspringer): Remove elements from launched_kernels_...if we ever + // expose a kernel/module deallocation method. + launched_kernels_.insert(cufunc); + } + launched_kernels_mu_.unlock(); + + if (cuda_kernel->GetPreferredCacheConfig() != + KernelCacheConfig::kNoPreference) { + CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig()); + } + + if (!CUDADriver::LaunchKernel( + GetCudaContext(stream), cufunc, block_dims.x, block_dims.y, + block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z, + shmem_bytes, custream, addrs.data(), nullptr /* = extra */)) { + LOG(ERROR) << "failed to launch CUDA kernel with args: " << args.size() + << "; thread dim: " << thread_dims.ToString() + << "; block dim: " << block_dims.ToString(); + return false; + } + + return true; +} + +// This is a non-essential operation; if there's a failure, proceed without +// logging an error. It's nearly certain that in case of failures, we'd never +// get here in the first place; these are very low-impact routines. 
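// As a rough worked example (ignoring warp and register allocation
// granularity): with a 64K-register-per-SM budget, a kernel using 64
// registers per thread in 256-thread blocks fits at most
// 65536 / (64 * 256) = 4 resident blocks per SM; trimming it to 48 registers
// per thread would raise that ceiling to 5.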
+void CUDAExecutor::OccupancyCheck(const KernelBase &kernel, + const ThreadDim &thread_dims, + const BlockDim &block_dims) { + VLOG(2) << "Computing kernel occupancy for kernel " + << kernel.demangled_name(); + VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y + << ", " << thread_dims.z << ")"; + + int regs_per_thread; + if (!kernel.metadata().registers_per_thread(®s_per_thread)) { + return; + } + + int smem_per_block; + if (!kernel.metadata().shared_memory_bytes(&smem_per_block)) { + return; + } + + const DeviceDescription &device_description = + kernel.parent()->GetDeviceDescription(); + + uint64 blocks_per_sm = CalculateOccupancy( + device_description, regs_per_thread, smem_per_block, thread_dims); + VLOG(2) << "Resident blocks per SM is " << blocks_per_sm; + + // To increase occupancy, there must be a sufficient number of blocks + // available to spread across the sm's at this new improved occupancy level. + int multiprocessor_count = device_description.core_count(); + int block_count = block_dims.x * block_dims.y * block_dims.z; + int available_blocks_per_sm = + port::MathUtil::CeilOfRatio(block_count, multiprocessor_count); + if (available_blocks_per_sm <= static_cast<int64>(blocks_per_sm)) { + VLOG(2) << "Occupancy is limited by number of blocks available per sm."; + return; + } + + uint64 improved_regs_per_thread = CalculateRegisterLimitForTargetOccupancy( + device_description, smem_per_block, thread_dims, blocks_per_sm + 1); + if (improved_regs_per_thread != 0) { + VLOG(2) << "Reducing register usage from " << regs_per_thread + << " to " << improved_regs_per_thread + << " could increase resident blocks per SM by one."; + + uint64 reg_reduction = regs_per_thread - improved_regs_per_thread; + if (reg_reduction <= + static_cast<uint64>(FLAGS_register_occupancy_warning_threshold)) { + LOG(INFO) << "Notice: occupancy would increase if register usage was" + << " reduced from " << regs_per_thread + << " to " << improved_regs_per_thread + << " registers per thread for kernel: " + << kernel.demangled_name(); + } + } else { + VLOG(2) << "Resident blocks per SM cannot be increased by reducing " + "register usage."; + } +} + +void *CUDAExecutor::Allocate(uint64 size) { + return CUDADriver::DeviceAllocate(context_, size); +} + +void *CUDAExecutor::AllocateSubBuffer(DeviceMemoryBase *mem, + uint64 offset_bytes, uint64 size_bytes) { + // offset and size are in bytes, so char* works as the pointer type. + return reinterpret_cast<char *>(mem->opaque()) + offset_bytes; +} + +void CUDAExecutor::Deallocate(DeviceMemoryBase *mem) { + // CUDA "sub-buffers" are just pointer + offset, so no dealloc is necessary. 
+ if (!mem->is_sub_buffer()) { + CUDADriver::DeviceDeallocate(context_, mem->opaque()); + } +} + +bool CUDAExecutor::HostMemoryRegister(void *location, uint64 size) { + if (location == nullptr || size == 0) { + LOG(WARNING) << "attempting to register null or zero-sized memory: " + << location << "; size " << size; + } + VLOG(2) << "registering " << location << " size " << size; + return CUDADriver::HostRegister(context_, location, size); +} + +bool CUDAExecutor::HostMemoryUnregister(void *location) { + VLOG(2) << "unregistering " << location; + return CUDADriver::HostUnregister(context_, location); +} + +bool CUDAExecutor::SynchronizeAllActivity() { + return CUDADriver::SynchronizeContext(context_); +} + +bool CUDAExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) { + if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 && + size % 4 == 0) { + return CUDADriver::SynchronousMemsetUint32( + context_, AsCudaDevicePtr(location), 0x0, size / 4); + } + return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location), + 0x0, size); +} + +bool CUDAExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) { + if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 && + size % 4 == 0) { + // cudaMemset reinterprets "value" as a uint8. + uint8 byte_value = static_cast<uint8>(value); + uint32 pattern = (byte_value << 24) | (byte_value << 16) | + (byte_value << 8) | byte_value; + return CUDADriver::SynchronousMemsetUint32( + context_, AsCudaDevicePtr(location), pattern, size / 4); + } + return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location), + value, size); +} + +bool CUDAExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + return CUDADriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst), + host_src, size); +} + +bool CUDAExecutor::SynchronousMemcpy(void *host_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) { + return CUDADriver::SynchronousMemcpyD2H(context_, host_dst, + AsCudaDevicePtr(gpu_src), size); +} + +bool CUDAExecutor::SynchronousMemcpyDeviceToDevice( + DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) { + return CUDADriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst), + AsCudaDevicePtr(gpu_src), size); +} + +bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) { + return Memset32(stream, location, 0x0, size); +} + +bool CUDAExecutor::Memset32(Stream *stream, DeviceMemoryBase *location, + uint32 pattern, uint64 size) { + VLOG(2) << "enqueueing memset32 operation onto stream " << stream + << " at location " << location << " with size " << size + << " and pattern " << std::hex << pattern; + CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 && + size % 4 == 0); + return CUDADriver::AsynchronousMemsetUint32( + context_, AsCudaDevicePtr(location), pattern, size / 4, + AsCUDAStreamValue(stream)); +} + +bool CUDAExecutor::Memcpy(Stream *stream, void *host_dst, + const DeviceMemoryBase &gpu_src, uint64 size) { + return CUDADriver::AsynchronousMemcpyD2H(context_, host_dst, + AsCudaDevicePtr(gpu_src), size, + AsCUDAStreamValue(stream)); +} + +bool CUDAExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + return CUDADriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst), + host_src, size, + AsCUDAStreamValue(stream)); +} + +bool CUDAExecutor::MemcpyDeviceToDevice(Stream *stream, + DeviceMemoryBase *gpu_dst, + const 
DeviceMemoryBase &gpu_src, + uint64 size) { + return CUDADriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst), + AsCudaDevicePtr(gpu_src), size, + AsCUDAStreamValue(stream)); +} + +bool CUDAExecutor::HostCallback(Stream *stream, + std::function<void()> callback) { + auto callback_ptr = new std::function<void()>(callback); + return CUDADriver::AddStreamCallback(context_, AsCUDAStreamValue(stream), + InternalHostCallback, callback_ptr); +} + +/* static */ void CUDAExecutor::InternalHostCallback(CUstream stream, + CUresult status, + void *data) { + std::function<void()> *callback = + reinterpret_cast<std::function<void()> *>(data); + (*callback)(); + delete callback; +} + +port::Status CUDAExecutor::AllocateEvent(Event *event) { + return AsCUDAEvent(event)->Init(); +} + +port::Status CUDAExecutor::DeallocateEvent(Event *event) { + return AsCUDAEvent(event)->Destroy(); +} + +port::Status CUDAExecutor::RecordEvent(Stream *stream, Event *event) { + return AsCUDAEvent(event)->Record(AsCUDAStream(stream)); +} + +port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) { + if (CUDADriver::WaitStreamOnEvent(context_, + AsCUDAStream(stream)->cuda_stream(), + AsCUDAEvent(event)->cuda_event())) { + return port::Status::OK(); + } else { + return port::Status{ + port::error::INTERNAL, + port::Printf("error recording waiting for CUDA event on stream %p", + stream)}; + } +} + +Event::Status CUDAExecutor::PollForEventStatus(Event *event) { + return AsCUDAEvent(event)->PollForStatus(); +} + +bool CUDAExecutor::AllocateStream(Stream *stream) { + return AsCUDAStream(stream)->Init(); +} + +void CUDAExecutor::DeallocateStream(Stream *stream) { + CUDAStream *cuda_stream = AsCUDAStream(stream); + if (!cuda_stream->IsIdle()) { + LOG(ERROR) << "Deallocating stream with pending work"; + } + cuda_stream->Destroy(); +} + +bool CUDAExecutor::AllocateTimer(Timer *timer) { + return AsCUDATimer(timer)->Init(); +} + +void CUDAExecutor::DeallocateTimer(Timer *timer) { + AsCUDATimer(timer)->Destroy(); +} + +bool CUDAExecutor::CreateStreamDependency(Stream *dependent, Stream *other) { + CUevent other_completed_event; + bool ok = + AsCUDAStream(other)->GetOrCreateCompletedEvent(&other_completed_event); + if (!ok) { + LOG(ERROR) << "failed to get completion event from other; " + "therefore, failed to create inter-stream dependency"; + return false; + } + + ok = CUDADriver::RecordEvent(context_, other_completed_event, + AsCUDAStreamValue(other)) + .ok(); + if (!ok) { + LOG(ERROR) << "failed to record completion event; " + "therefore, failed to create inter-stream dependency"; + return false; + } + + return CUDADriver::WaitStreamOnEvent(context_, AsCUDAStreamValue(dependent), + other_completed_event); +} + +bool CUDAExecutor::StartTimer(Stream *stream, Timer *timer) { + return AsCUDATimer(timer)->Start(AsCUDAStream(stream)); +} + +bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) { + return AsCUDATimer(timer)->Stop(AsCUDAStream(stream)); +} + +bool CUDAExecutor::BlockHostUntilDone(Stream *stream) { + return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream)); +} + +blas::BlasSupport *CUDAExecutor::CreateBlas() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::BlasFactory> status = + registry->GetFactory<PluginRegistry::BlasFactory>(kCudaPlatformId, + plugin_config_.blas()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve BLAS factory: " + << status.status().error_message(); + return nullptr; + } + + return 
status.ValueOrDie()(this); +} + +dnn::DnnSupport *CUDAExecutor::CreateDnn() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::DnnFactory> status = + registry->GetFactory<PluginRegistry::DnnFactory>(kCudaPlatformId, + plugin_config_.dnn()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve DNN factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +fft::FftSupport *CUDAExecutor::CreateFft() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::FftFactory> status = + registry->GetFactory<PluginRegistry::FftFactory>(kCudaPlatformId, + plugin_config_.fft()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve FFT factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +rng::RngSupport *CUDAExecutor::CreateRng() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::RngFactory> status = + registry->GetFactory<PluginRegistry::RngFactory>(kCudaPlatformId, + plugin_config_.rng()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve RNG factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +// TODO(rspringer): Remove in b/18544742. +bool CUDAExecutor::SupportsDnn() const { + return true; +} + +bool CUDAExecutor::CanEnablePeerAccessTo(StreamExecutorInterface *other) { + CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other); + return CUDADriver::CanEnablePeerAccess(context_, cuda_other->context_); +} + +port::Status CUDAExecutor::EnablePeerAccessTo(StreamExecutorInterface *other) { + CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other); + return CUDADriver::EnablePeerAccess(context_, cuda_other->context_); +} + +SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() { + port::StatusOr<CUsharedconfig> cuda_config = + CUDADriver::ContextGetSharedMemConfig(context_); + if (!cuda_config.ok()) { + // Don't log; the failed call will log necessary output. 
+ return SharedMemoryConfig::kDefault; + } + + switch (cuda_config.ValueOrDie()) { + case CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: + return SharedMemoryConfig::kDefault; + case CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: + return SharedMemoryConfig::kFourByte; + case CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: + return SharedMemoryConfig::kEightByte; + default: + LOG(FATAL) << "Invalid shared memory configuration returned: " + << cuda_config.ValueOrDie(); + } +} + +port::Status CUDAExecutor::SetDeviceSharedMemoryConfig( + SharedMemoryConfig config) { + CUsharedconfig cuda_config; + switch (config) { + case SharedMemoryConfig::kDefault: + cuda_config = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE; + break; + case SharedMemoryConfig::kFourByte: + cuda_config = CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE; + break; + case SharedMemoryConfig::kEightByte: + cuda_config = CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE; + break; + default: + LOG(FATAL) << "Invalid shared memory configuration specified: " + << static_cast<int>(config); + } + return CUDADriver::ContextSetSharedMemConfig(context_, cuda_config); +} + +bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const { + return CUDADriver::GetDeviceMemoryInfo(context_, free, total); +} + +bool CUDAExecutor::GetSymbol(const string& symbol_name, void **mem, + size_t *bytes) { + { // give limited scope to mutex_lock + mutex_lock lock{disk_modules_mu_}; + for (auto &it : disk_modules_) { + if (CUDADriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(), + reinterpret_cast<CUdeviceptr *>(mem), + bytes)) { + return true; + } + } + } + + { // give limited scope to mutex_lock + mutex_lock lock{in_memory_modules_mu_}; + for (auto &it : in_memory_modules_) { + if (CUDADriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(), + reinterpret_cast<CUdeviceptr *>(mem), + bytes)) { + return true; + } + } + } + + LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name; + return false; +} + +bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const { + // The BlockDim name is a mismatch against these GRID_DIM_* queries because + // we use BlockDims to express the dimensions of blocks within a grid + // (as opposed to ThreadDim which expresses the dimensions of threads + // within a block). + int x, y, z; + if (!CUDADriver::GetGridLimits(&x, &y, &z, device_)) { + return false; + } + + block_dim_limit->x = x; + block_dim_limit->y = y; + block_dim_limit->z = z; + return true; +} + +KernelArg CUDAExecutor::DeviceMemoryToKernelArg( + const DeviceMemoryBase &gpu_mem) const { + const void* arg = gpu_mem.opaque(); + const uint8 *arg_ptr = reinterpret_cast<const uint8 *>(&arg); + + KernelArg kernel_arg; + kernel_arg.type = KernelArg::kNormal; + kernel_arg.data = port::InlinedVector<uint8, 4>(arg_ptr, arg_ptr + sizeof(arg)); + kernel_arg.bytes = sizeof(arg); + return kernel_arg; +} + +bool CUDAExecutor::SupportsBlas() const { return true; } + +bool CUDAExecutor::SupportsFft() const { return true; } + +bool CUDAExecutor::SupportsRng() const { return true; } + +void *CUDAExecutor::CudaContextHack() { return context_; } + +CUcontext CUDAExecutor::cuda_context() { return context_; } + +// Attemps to read the NUMA node corresponding to the GPU device's PCI bus out +// of SysFS. Returns -1 if it cannot. +// +// For anything more complicated/prod-focused than this, you'll likely want to +// turn to gsys' topology modeling. 
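// For example, for a (hypothetical) lowercased PCI bus ID "0000:84:00.0",
// this reads "/sys/bus/pci/devices/0000:84:00.0/numa_node"; a file containing
// "1" yields NUMA node 1, and a negative value in the file is coerced to node
// zero (see the comment on the negative-value path below).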
+static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) { + VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal; + static const int kUnknownNumaNode = -1; + + if (pci_bus_id.empty()) { + LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal; + return kUnknownNumaNode; + } + + string filename = + port::Printf("/sys/bus/pci/devices/%s/numa_node", pci_bus_id.c_str()); + + // We have to use fopen/fread here so that the device properties can be + // populated before InitGoogle procedure has been completed (at which point we + // could use the file::* utilities). + FILE *file = fopen(filename.c_str(), "r"); + if (file == nullptr) { + LOG(ERROR) << "could not open file to read NUMA node: " << filename; + return kUnknownNumaNode; + } + + string content; + char buf[32]; + size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file); + buf[did_read] = '\0'; + content = buf; + + int32 value; + if (port::safe_strto32(content, &value)) { + if (value < 0) { // See http://b/18228951 for details on this path. + LOG(INFO) << "successful NUMA node read from SysFS had negative value (" + << value << "), but there must be at least one NUMA node" + ", so returning NUMA node zero"; + return 0; + } + return value; + } + + LOG(WARNING) + << "could not convert SysFS file contents to integral NUMA node value: " + << content; + + return kUnknownNumaNode; +} + +// Set of compute capability specific device parameters that cannot be +// queried from the driver API. These values instead are baked into a +// lookup table indexed by compute capability version. +struct UnqueryableDeviceParams { + int cc_major; + int cc_minor; + uint64 blocks_per_core_limit; + uint64 registers_per_core_limit; + uint64 registers_per_thread_limit; + uint64 warp_alloc_granularity; + uint64 register_alloc_granularity; + uint64 shared_memory_alloc_granularity; +}; + +static const UnqueryableDeviceParams kAllUnqueryableDeviceParams[] = { + { + 3, 5, // compute capability (3.5) + 16, // blocks_per_core_limit + 64 * 1024, // registers_per_core_limit + 255, // registers_per_thread_limit + 4, // warp_alloc_granularity + 256, // register_alloc_granularity + 256 // shared_memory_alloc_granularity + } +}; + +DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const { + internal::DeviceDescriptionBuilder builder; + + { + int driver_version = 0; + (void)CUDADriver::GetDriverVersion(&driver_version); + string augmented_driver_version = port::Printf( + "%d (%s)", driver_version, + DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str()); + builder.set_driver_version(augmented_driver_version); + } + + { + string pci_bus_id = CUDADriver::GetPCIBusID(device_); + + // Lower the hex characters to match sysfs. + pci_bus_id = port::Lowercase(pci_bus_id); + builder.set_pci_bus_id(pci_bus_id); + + // Read the NUMA node corresponding to the PCI bus ID out of sysfs. 
+ int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_); + builder.set_numa_node(numa_node); + } + + CUdevprop prop; + if (CUDADriver::GetDeviceProperties(&prop, device_ordinal_)) { + builder.set_threads_per_block_limit(prop.maxThreadsPerBlock); + + ThreadDim thread_dim_limit; + thread_dim_limit.x = prop.maxThreadsDim[0]; + thread_dim_limit.y = prop.maxThreadsDim[1]; + thread_dim_limit.z = prop.maxThreadsDim[2]; + builder.set_thread_dim_limit(thread_dim_limit); + + float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6; + builder.set_clock_rate_ghz(clock_rate_ghz); + } + + { + bool ecc_enabled = false; + (void)CUDADriver::IsEccEnabled(device_, &ecc_enabled); + builder.set_ecc_enabled(ecc_enabled); + } + + { + uint64 device_memory_size = -1; + (void)CUDADriver::GetDeviceTotalMemory(device_, &device_memory_size); + builder.set_device_memory_size(device_memory_size); + } + + { + BlockDim block_dim_limit; + FillBlockDimLimit(&block_dim_limit); + builder.set_block_dim_limit(block_dim_limit); + } + + { + string device_name; + (void)CUDADriver::GetDeviceName(device_, &device_name); + builder.set_name(device_name); + } + + for (size_t i = 0; i < ARRAYSIZE(kAllUnqueryableDeviceParams); i++) { + const auto ¶ms = kAllUnqueryableDeviceParams[i]; + if (params.cc_major == cc_major_ && params.cc_minor == cc_minor_) { + builder.set_blocks_per_core_limit(params.blocks_per_core_limit); + builder.set_registers_per_core_limit(params.registers_per_core_limit); + builder.set_registers_per_thread_limit(params.registers_per_thread_limit); + builder.set_warp_alloc_granularity(params.warp_alloc_granularity); + builder.set_register_alloc_granularity(params.register_alloc_granularity); + builder.set_shared_memory_alloc_granularity( + params.shared_memory_alloc_granularity); + } + } + + builder.set_platform_version( + port::StrCat("Compute Capability ", cc_major_, ".", cc_minor_)); + + // TODO(leary) should be a way to query this from the driver, but this is + // unlikely to change for us any time soon. + builder.set_device_address_bits(64); + + builder.set_device_vendor("NVIDIA Corporation"); + builder.set_cuda_compute_capability(cc_major_, cc_minor_); + builder.set_shared_memory_per_core( + CUDADriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie()); + builder.set_shared_memory_per_block( + CUDADriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie()); + builder.set_core_count( + CUDADriver::GetMultiprocessorCount(device_).ValueOrDie()); + builder.set_threads_per_core_limit( + CUDADriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie()); + builder.set_registers_per_block_limit( + CUDADriver::GetMaxRegistersPerBlock(device_).ValueOrDie()); + builder.set_threads_per_warp( + CUDADriver::GetThreadsPerWarp(device_).ValueOrDie()); + + auto built = builder.Build(); + return built.release(); +} + +} // namespace cuda + +namespace gpu = ::perftools::gputools; + +void initialize_cuda_gpu_executor() { + port::StatusOr<void *> status = + gpu::internal::CachedDsoLoader::GetLibcudaDsoHandle(); + if (!status.ok()) { + gpu::cuda::Diagnostician::LogDriverVersionInformation(); + LOG(INFO) << "LD_LIBRARY_PATH: " << getenv("LD_LIBRARY_PATH"); + LOG(INFO) << "failed to find libcuda.so on this system: " + << status.status(); + } + + // TODO(b/22689637): Temporary until users are migrated off of PlatformKind. 
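  // The mapping below lets code that still identifies the platform by
  // PlatformKind::kCuda resolve the same platform id (kCudaPlatformId) that
  // the plugin factories are registered under.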
+ gpu::PluginRegistry::Instance()->MapPlatformKindToId( + gpu::PlatformKind::kCuda, gpu::cuda::kCudaPlatformId); + + *gpu::internal::MakeCUDAExecutorImplementation() = []( + const gpu::PluginConfig &config) { + return new gpu::cuda::CUDAExecutor{config}; + }; + + *gpu::internal::MakeCUDAKernelImplementation() = []() { + return new gpu::cuda::CUDAKernel; + }; + + *gpu::internal::MakeCUDAEventImplementation() = []( + gpu::StreamExecutor *parent) { + gpu::cuda::CUDAExecutor *cuda_executor = + static_cast<gpu::cuda::CUDAExecutor *>(parent->implementation()); + return new gpu::cuda::CUDAEvent{cuda_executor}; + }; + + *gpu::internal::MakeCUDAStreamImplementation() = []( + gpu::StreamExecutor *parent) { + gpu::cuda::CUDAExecutor *cuda_executor = + static_cast<gpu::cuda::CUDAExecutor *>(parent->implementation()); + return new gpu::cuda::CUDAStream{cuda_executor}; + }; + *gpu::internal::MakeCUDATimerImplementation() = []( + gpu::StreamExecutor *parent) { + gpu::cuda::CUDAExecutor *cuda_executor = + static_cast<gpu::cuda::CUDAExecutor *>(parent->implementation()); + return new gpu::cuda::CUDATimer{cuda_executor}; + }; +} + +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER( + cuda_gpu_executor, {perftools::gputools::initialize_cuda_gpu_executor();}); diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h new file mode 100644 index 0000000000..fda89b9738 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h @@ -0,0 +1,270 @@ +// The CUDA implementation of the StreamExecutorInterface functionality. +// CUDA inclusions are ideally confined to this implementation file. +// +// The notions from the StreamExecutor basically correspond to the CUDA streams +// programming model provided by the libcuda.so driver APIs, so we don't have +// to do much more than wrap the calls to the libraries appropriately. +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ + +#include <map> +#include <set> + +#include "tensorflow/stream_executor/cuda/cuda_kernel.h" +#include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace blas { +class BlasSupport; +} +namespace internal { +class RngSupport; +} // namespace internal +} // namespace gputools +} // namespace perftools + +namespace perftools { +namespace gputools { +namespace cuda { + +// CUDA-platform implementation of the platform-agnostic +// StreamExecutorInferface. +class CUDAExecutor : public internal::StreamExecutorInterface { + public: + // sub_platform indicates the subplatform used in this executor; it must + // be a CUDA type. + explicit CUDAExecutor(const PluginConfig &plugin_config) + : device_(0), + context_(nullptr), + device_ordinal_(0), + cc_major_(0), + cc_minor_(0), + plugin_config_(plugin_config) {} + + // See the corresponding StreamExecutor methods for method comments on the + // following overrides. 
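  // Note: a newly constructed CUDAExecutor is not usable until Init() has
  // returned OK; the constructor above only records the plugin configuration
  // and zero-initializes the device and context handles.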
+ + ~CUDAExecutor() override; + + port::Status Init(int device_ordinal, DeviceOptions device_options) override; + + bool GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) override; + + bool Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &k, + const std::vector<KernelArg> &args) override; + + void *Allocate(uint64 size) override; + + void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes, + uint64 size_bytes) override; + + void Deallocate(DeviceMemoryBase *mem) override; + + // CUDA allocation/registration functions are necessary because the driver + // internally sets up buffers for DMA operations (and page locks them). + // There's no external interface for us to otherwise control these DMA + // settings. + void *HostMemoryAllocate(uint64 size) override { + return CUDADriver::HostAllocate(context_, size); + } + + void HostMemoryDeallocate(void *location) override { + return CUDADriver::HostDeallocate(context_, location); + } + + bool HostMemoryRegister(void *location, uint64 size) override; + + bool HostMemoryUnregister(void *location) override; + + bool SynchronizeAllActivity() override; + + bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override; + + bool SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) override; + + bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + + bool SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) override; + bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern, + uint64 size) override; + + bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + + bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool HostCallback(Stream *stream, std::function<void()> callback) override; + + bool AllocateStream(Stream *stream) override; + + void DeallocateStream(Stream *stream) override; + + bool CreateStreamDependency(Stream *dependent, Stream *other) override; + + bool AllocateTimer(Timer *timer) override; + + void DeallocateTimer(Timer *timer) override; + + bool StartTimer(Stream *stream, Timer *timer) override; + + bool StopTimer(Stream *stream, Timer *timer) override; + + port::Status AllocateEvent(Event *event) override; + + port::Status DeallocateEvent(Event *event) override; + + port::Status RecordEvent(Stream *stream, Event *event) override; + + port::Status WaitForEvent(Stream *stream, Event *event) override; + + Event::Status PollForEventStatus(Event *event) override; + + bool BlockHostUntilDone(Stream *stream) override; + + int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); } + + port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override; + + bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override; + + SharedMemoryConfig GetDeviceSharedMemoryConfig() override; + + port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override; + + bool DeviceMemoryUsage(int64 *free, int64 *total) const override; + + // Search for the symbol and returns a 
device pointer and size. + // Returns false if symbol does not exist. + bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override; + + DeviceDescription *PopulateDeviceDescription() const override; + + // Populates the block_dim_limit by querying the device driver API. If an + // error occurs at any point while asking the driver for block dim limits, it + // will be only partially populated as a result, and an error will be logged. + bool FillBlockDimLimit(BlockDim *block_dim_limit) const; + + KernelArg DeviceMemoryToKernelArg( + const DeviceMemoryBase &gpu_mem) const override; + + bool SupportsBlas() const override; + + blas::BlasSupport *CreateBlas() override; + + bool SupportsFft() const override; + + fft::FftSupport *CreateFft() override; + + bool SupportsRng() const override; + + rng::RngSupport *CreateRng() override; + + bool SupportsDnn() const override; + + dnn::DnnSupport *CreateDnn() override; + + void *CudaContextHack() override; + + CUcontext cuda_context(); + + private: + // Attempts to find a more specific version of the file indicated by + // filename by looking for compute-capability-specific suffixed versions; i.e. + // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if + // we're on a compute capability 3.0 machine. + bool FindOnDiskForComputeCapability(port::StringPiece filename, + port::StringPiece canonical_suffix, + string *found_filename) const; + + // Host callback landing routine invoked by CUDA. + // data: User-provided callback provided to HostCallback() above, captured + // as a std::function<void()>. Allocated/initialized inside + // HostCallback() and owned and deleted by this call. + static void InternalHostCallback(CUstream stream, CUresult status, + void *data); + + // Collects metadata for the specified kernel. + bool GetKernelMetadata(CUDAKernel *cuda_kernel, + KernelMetadata *kernel_metadata); + + // Determines if the given kernel's occupancy could be improved by only + // slightly reducing its register usage. If so, a message is emitted to the + // INFO log. The warning threshold is controlled by the flag + // register_occupancy_warning_threshold. + void OccupancyCheck(const KernelBase &kernel, const ThreadDim &thread_dims, + const BlockDim &block_dims); + + // Guards the on-disk-module mapping. + mutex disk_modules_mu_; + + // Mapping from filename to CUmodule, if it was already retrieved. + // Multiple CUfunctions are usually obtained from a single CUmodule so we + // attempt to hit in this mapping first, before retrieving it. + std::map<string, CUmodule> disk_modules_ GUARDED_BY(disk_modules_mu_); + + // Guards the in-memory-module mapping. + mutex in_memory_modules_mu_; + + std::map<const char *, CUmodule> in_memory_modules_ + GUARDED_BY(in_memory_modules_mu_); + + // Guards the launched kernel set. + mutex launched_kernels_mu_; + + // Keeps track of the set of launched kernels. Currently used to suppress the + // occupancy check on subsequent launches. + std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_); + + // Handle for the CUDA device being operated on. Immutable + // post-initialization. + CUdevice device_; + + // Handle for session with the library/driver. Immutable post-initialization. + CUcontext context_; + + // The device ordinal value that this executor was initialized with; recorded + // for use in getting device metadata. Immutable post-initialization. + int device_ordinal_; + + // The major verion of the compute capability for device_. 
+  int cc_major_;
+
+  // The minor version of the compute capability for device_.
+  int cc_minor_;
+
+  // The plugin configuration associated with this instance.
+  PluginConfig plugin_config_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor);
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_helpers.h b/tensorflow/stream_executor/cuda/cuda_helpers.h
new file mode 100644
index 0000000000..2c5311cb3b
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_helpers.h
@@ -0,0 +1,95 @@
+// Common helper functions used for dealing with CUDA API datatypes.
+//
+// These are typically placed here for use by multiple source components (for
+// example, BLAS and executor components).
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
+
+#include <stddef.h>
+#include <complex>
+
+#include "third_party/gpus/cuda/include/cuComplex.h"
+#include "third_party/gpus/cuda/include/cuda.h"
+
+namespace perftools {
+namespace gputools {
+
+class Stream;
+template <typename ElemT>
+class DeviceMemory;
+
+namespace cuda {
+
+// Converts a const DeviceMemory reference to its underlying typed pointer in
+// CUDA device memory.
+template <typename T>
+const T *CUDAMemory(const DeviceMemory<T> &mem) {
+  return static_cast<const T *>(mem.opaque());
+}
+
+// Converts a (non-const) DeviceMemory pointer reference to its underlying
+// typed pointer in CUDA device memory.
+template <typename T>
+T *CUDAMemoryMutable(DeviceMemory<T> *mem) {
+  return static_cast<T *>(mem->opaque());
+}
+
+CUstream AsCUDAStreamValue(Stream *stream);
+
+static_assert(sizeof(std::complex<float>) == sizeof(cuComplex),
+              "std::complex<float> and cuComplex should have the same size");
+static_assert(offsetof(cuComplex, x) == 0,
+              "The real part of cuComplex should appear first.");
+static_assert(sizeof(std::complex<double>) == sizeof(cuDoubleComplex),
+              "std::complex<double> and cuDoubleComplex should have the same "
+              "size");
+static_assert(offsetof(cuDoubleComplex, x) == 0,
+              "The real part of cuDoubleComplex should appear first.");
+
+// Type traits to get CUDA complex types from std::complex<>.
+
+template <typename T>
+struct CUDAComplexT {
+  typedef T type;
+};
+
+template <>
+struct CUDAComplexT<std::complex<float>> {
+  typedef cuComplex type;
+};
+
+template <>
+struct CUDAComplexT<std::complex<double>> {
+  typedef cuDoubleComplex type;
+};
+
+// Converts pointers of std::complex<> to pointers of
+// cuComplex/cuDoubleComplex. No type conversion for non-complex types.
+
+template <typename T>
+inline const typename CUDAComplexT<T>::type *CUDAComplex(const T *p) {
+  return reinterpret_cast<const typename CUDAComplexT<T>::type *>(p);
+}
+
+template <typename T>
+inline typename CUDAComplexT<T>::type *CUDAComplex(T *p) {
+  return reinterpret_cast<typename CUDAComplexT<T>::type *>(p);
+}
+
+// Converts values of std::complex<float/double> to values of
+// cuComplex/cuDoubleComplex.
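+// Editorial example (not in the original source): given
+//   std::complex<float> z{1.0f, -2.0f};
+// CUDAComplexValue(z) produces a cuComplex whose x field is 1.0f and whose y
+// field is -2.0f, relying on the layout static_asserts above.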
+inline cuComplex CUDAComplexValue(std::complex<float> val) { + return {val.real(), val.imag()}; +} + +inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) { + return {val.real(), val.imag()}; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h new file mode 100644 index 0000000000..e8ad3955e9 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_kernel.h @@ -0,0 +1,115 @@ +// The CUDA implementation of the StreamExecutorInterface functionality. +// CUDA inclusions are ideally confined to this implementation file. +// +// The notions from the StreamExecutor basically correspond to the CUDA streams +// programming model provided by the libcuda.so driver APIs, so we don't have +// to do much more than wrap the calls to the libraries appropriately. +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_ + +#include "tensorflow/stream_executor/kernel_cache_config.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/lib/casts.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "third_party/gpus/cuda/include/cuda.h" + +#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_ +#error \ + "No driver calls in this file, wrap driver functionality in cuda_driver.cc." +#endif + +#ifdef __CUDA_RUNTIME_H__ +#error \ + "CUDA runtime being included into CUDA GPU executor; should be driver only." +#endif + +namespace perftools { +namespace gputools { +namespace cuda { + +// Wraps a CUfunction to implement the platform-independent KernelInterface. +class CUDAKernel : public internal::KernelInterface { + public: + CUDAKernel() : cuda_function_(nullptr), arity_(0), + preferred_cache_config_(KernelCacheConfig::kNoPreference) {} + + // Note that the function is unloaded when the module is unloaded, and the + // module that the function is contained in is owned by the CUDAExecutor. + ~CUDAKernel() override {} + + // As arity cannot be reflected upon using the CUDA API, the arity is + // explicitly set during the CUDAExecutor::GetKernel initialization process. + void set_arity(unsigned arity) { arity_ = arity; } + unsigned Arity() const override { return arity_; } + + // Returns the CUfunction value for passing to the CUDA API. + CUfunction AsCUDAFunctionValue() const { + DCHECK(cuda_function_ != nullptr); + return const_cast<CUfunction>(cuda_function_); + } + + // Returns the slot that the CUfunction is stored within for this object, + // for the CUDA API which wants to load into a CUfunction*. + CUfunction *cuda_function_ptr() { return &cuda_function_; } + + // CUDA supports setting the preferred cache configuration of a CUfunction + // (more-or-less equivalent to a CUDAKernel). We support this via the below + // functions; users can set a preference, and that is applied when the kernel + // is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to + // load the kernel & set the preference when the user calls the setter below; + // either approach is valid. + // Sets the current kernel cache configuration preference. 
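+  // Editorial illustration (not in the original header): a hypothetical
+  // caller that knows its kernel is shared-memory heavy might write
+  //   cuda_kernel->SetPreferredCacheConfig(KernelCacheConfig::kPreferShared);
+  // and the preference is then applied when CUDAExecutor::Launch loads the
+  // kernel, as described above.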
+  void SetPreferredCacheConfig(KernelCacheConfig config) override {
+    preferred_cache_config_ = config;
+  }
+
+  // Returns the current kernel cache configuration preference.
+  KernelCacheConfig GetPreferredCacheConfig() const override {
+    return preferred_cache_config_;
+  }
+
+  // Returns the current kernel cache configuration preference as a
+  // CUfunc_cache.
+  CUfunc_cache GetCUDACacheConfig() const {
+    switch (preferred_cache_config_) {
+      case KernelCacheConfig::kNoPreference:
+        return CU_FUNC_CACHE_PREFER_NONE;
+      case KernelCacheConfig::kPreferShared:
+        return CU_FUNC_CACHE_PREFER_SHARED;
+      case KernelCacheConfig::kPreferL1:
+        return CU_FUNC_CACHE_PREFER_L1;
+      case KernelCacheConfig::kPreferEqual:
+        return CU_FUNC_CACHE_PREFER_EQUAL;
+      default:
+        LOG(FATAL) << "Unknown KernelCacheConfig "
+                   << static_cast<int32>(preferred_cache_config_);
+    }
+  }
+
+ private:
+  CUfunction cuda_function_;  // Wrapped CUDA kernel handle.
+  unsigned arity_;            // Number of formal parameters the kernel takes.
+
+  // Preferred (but not required) cache configuration for this kernel.
+  KernelCacheConfig preferred_cache_config_;
+};
+
+// Given a platform-independent kernel datatype, returns the (const) internal
+// CUDA platform implementation pointer.
+inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
+  return static_cast<const CUDAKernel *>(kernel->implementation());
+}
+
+// Given a platform-independent kernel datatype, returns the (non-const)
+// internal CUDA platform implementation pointer.
+inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
+  return static_cast<CUDAKernel *>(kernel->implementation());
+}
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
new file mode 100644
index 0000000000..ef88b89eda
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -0,0 +1,172 @@
+#include "tensorflow/stream_executor/cuda/cuda_platform.h"
+
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+PLATFORM_DEFINE_ID(kCudaPlatformId);
+
+CudaPlatform::CudaPlatform()
+    : name_("CUDA"), min_numa_node_(0), limit_numa_node_(0) {}
+
+CudaPlatform::~CudaPlatform() {}
+
+// Due to legacy issues in user code, we can't currently call InspectNumaNodes
+// at module initialization time, because non-GPU programs still include this
+// plugin via various methods, so instead, it has to be init-on-reference.
+void CudaPlatform::InspectNumaNodes() {
+  // To get NUMA node information, we need to create all executors, so we can
+  // examine their device descriptions to see their bus assignments.
+  static bool initialized = false;
+  static mutex numa_mutex(LINKER_INITIALIZED);
+  mutex_lock lock(numa_mutex);
+  if (initialized) {
+    return;
+  }
+
+  StreamExecutorConfig config;
+  for (int i = 0; i < VisibleDeviceCount(); i++) {
+    config.ordinal = i;
+    StreamExecutor* exec = GetExecutor(config).ValueOrDie();
+    if (i == 0) {
+      // NUMA nodes may not start at 0, so set the minimum node based on the
+      // first executor we see.
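+      // Editorial worked example (not in the original source): with three
+      // visible devices on NUMA nodes {1, 1, 3}, min_numa_node_ ends up 1 and
+      // limit_numa_node_ ends up 4, so BusCount() reports 3 and DeviceToBus()
+      // maps the devices to bus ordinals {0, 0, 2}.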
+ min_numa_node_ = exec->GetDeviceDescription().numa_node(); + limit_numa_node_ = min_numa_node_ + 1; + } else { + min_numa_node_ = + std::min(min_numa_node_, exec->GetDeviceDescription().numa_node()); + limit_numa_node_ = std::max(limit_numa_node_, + exec->GetDeviceDescription().numa_node() + 1); + } + } + initialized = true; +} + +int CudaPlatform::BusCount() { + InspectNumaNodes(); + return limit_numa_node_ - min_numa_node_; +} + +int CudaPlatform::DeviceToBus(int device_ordinal) { + StreamExecutorConfig config; + config.ordinal = device_ordinal; + StreamExecutor* exec = GetExecutor(config).ValueOrDie(); + return exec->GetDeviceDescription().numa_node() - min_numa_node_; +} + +port::StatusOr<StreamExecutor*> CudaPlatform::FirstExecutorForBus( + int bus_ordinal) { + InspectNumaNodes(); + CHECK_LT(bus_ordinal, BusCount()) << "bus ordinal out of available range"; + for (int i = 0; i < VisibleDeviceCount(); i++) { + if (DeviceToBus(i) == bus_ordinal) { + StreamExecutorConfig config; + config.ordinal = i; + return GetExecutor(config).ValueOrDie(); + } + } + + return port::Status{ + port::error::NOT_FOUND, + port::Printf("Executor for bus %d not found.", bus_ordinal)}; +} + +Platform::Id CudaPlatform::id() const { return kCudaPlatformId; } + +int CudaPlatform::VisibleDeviceCount() const { + // Throw away the result - it logs internally, and this [containing] function + // isn't in the path of user control. It's safe to call this > 1x. + if (!cuda::CUDADriver::Init().ok()) { + return -1; + } + + return CUDADriver::GetDeviceCount(); +} + +const string& CudaPlatform::Name() const { return name_; } + +port::StatusOr<StreamExecutor*> CudaPlatform::ExecutorForDevice(int ordinal) { + StreamExecutorConfig config; + config.ordinal = ordinal; + config.plugin_config = PluginConfig(); + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr<StreamExecutor*> CudaPlatform::ExecutorForDeviceWithPluginConfig( + int device_ordinal, const PluginConfig& plugin_config) { + StreamExecutorConfig config; + config.ordinal = device_ordinal; + config.plugin_config = plugin_config; + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr<StreamExecutor*> CudaPlatform::GetExecutor( + const StreamExecutorConfig& config) { + mutex_lock lock(mu_); + + port::StatusOr<StreamExecutor*> status = executor_cache_.Get(config); + if (status.ok()) { + return status.ValueOrDie(); + } + + port::StatusOr<std::unique_ptr<StreamExecutor>> executor = + GetUncachedExecutor(config); + if (!executor.ok()) { + return executor.status(); + } + + StreamExecutor* naked_executor = executor.ValueOrDie().get(); + executor_cache_.Insert(config, executor.ConsumeValueOrDie()); + return naked_executor; +} + +port::StatusOr<std::unique_ptr<StreamExecutor>> +CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) { + auto executor = port::MakeUnique<StreamExecutor>(PlatformKind::kCuda, + config.plugin_config); + auto init_status = executor->Init(config.ordinal, config.device_options); + if (!init_status.ok()) { + return port::Status{ + port::error::INTERNAL, + port::Printf( + "failed initializing StreamExecutor for CUDA device ordinal %d: %s", + config.ordinal, init_status.ToString().c_str())}; + } + + return std::move(executor); +} + +void CudaPlatform::RegisterTraceListener( + std::unique_ptr<TraceListener> listener) { + LOG(FATAL) << "not yet implemented: register CUDA trace listener"; +} + +void 
CudaPlatform::UnregisterTraceListener(TraceListener* listener) { + LOG(FATAL) << "not yet implemented: unregister CUDA trace listener"; +} + +} // namespace cuda + +static void InitializeCudaPlatform() { + // Disabling leak checking, MultiPlatformManager does not destroy its + // registered platforms. + + std::unique_ptr<cuda::CudaPlatform> platform(new cuda::CudaPlatform); + SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform))); +} + +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER(cuda_platform, + perftools::gputools::InitializeCudaPlatform()); diff --git a/tensorflow/stream_executor/cuda/cuda_platform.h b/tensorflow/stream_executor/cuda/cuda_platform.h new file mode 100644 index 0000000000..966d7343f7 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_platform.h @@ -0,0 +1,98 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_ + +#include <memory> +#include "tensorflow/stream_executor/platform/port.h" +#include <vector> + +#include "tensorflow/stream_executor/executor_cache.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/trace_listener.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// Opaque and unique identifier for the CUDA platform plugin. +// This is needed so that plugins can refer to/identify this platform without +// instantiating a CudaPlatform object. +extern const Platform::Id kCudaPlatformId; + +// Cuda-specific platform plugin, registered as a singleton value via module +// initializer. +class CudaPlatform : public Platform { + public: + CudaPlatform(); + ~CudaPlatform() override; + + // CudaPlatform-specific functionality + // Returns the number of distinct buses / NUMA nodes on the machine. + int BusCount(); + + // Returns the bus/NUMA node for the specified device ordinal. + int DeviceToBus(int device_ordinal); + + // Returns the lowest-ordinal-number StreamExecutor on the specified bus. + port::StatusOr<StreamExecutor*> FirstExecutorForBus(int bus_ordinal); + + // Platform interface implementation: + // Returns the same value as kCudaPlatform above. + Platform::Id id() const override; + + // Returns -1 as a sentinel on internal failure (and logs the error). + int VisibleDeviceCount() const override; + + const string& Name() const override; + + port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override; + + port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig( + int ordinal, const PluginConfig& config) override; + + port::StatusOr<StreamExecutor*> GetExecutor( + const StreamExecutorConfig& config) override; + + port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor( + const StreamExecutorConfig& config) override; + + void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override; + + void UnregisterTraceListener(TraceListener* listener) override; + + private: + // Determines the number of NUMA nodes and the assignment of executor to each. + void InspectNumaNodes(); + + // This platform's name. 
+ string name_; + + // mutex that guards internal state. + mutable mutex mu_; + + // Cache of created executors. + ExecutorCache executor_cache_; + + // The smallest NUMA node value for any device managed by this machine + // manager. Used, along with limit_numa_node_, to convert NUMA nodes into bus + // ordinals. The NUMA node space occupied by GPUs is assumed to be dense./ + int min_numa_node_; + + // Larger than the NUMA node value for any device managed by this machine + // manager. + int limit_numa_node_; + + SE_DISALLOW_COPY_AND_ASSIGN(CudaPlatform); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc new file mode 100644 index 0000000000..ad48c8b59a --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_rng.cc @@ -0,0 +1,317 @@ +#include "tensorflow/stream_executor/cuda/cuda_rng.h" + +#include <dlfcn.h> + +#include "tensorflow/stream_executor/cuda/cuda_activation.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/cuda/cuda_helpers.h" +#include "tensorflow/stream_executor/cuda/cuda_platform.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/rng.h" +#include "third_party/gpus/cuda/include/curand.h" + +// Formats curandStatus_t to output prettified values into a log stream. +std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) { +#define OSTREAM_CURAND_STATUS(__name) \ + case CURAND_STATUS_##__name: \ + in << "CURAND_STATUS_" #__name; \ + return in; + + switch (status) { + OSTREAM_CURAND_STATUS(SUCCESS) + OSTREAM_CURAND_STATUS(VERSION_MISMATCH) + OSTREAM_CURAND_STATUS(NOT_INITIALIZED) + OSTREAM_CURAND_STATUS(ALLOCATION_FAILED) + OSTREAM_CURAND_STATUS(TYPE_ERROR) + OSTREAM_CURAND_STATUS(OUT_OF_RANGE) + OSTREAM_CURAND_STATUS(LENGTH_NOT_MULTIPLE) + OSTREAM_CURAND_STATUS(LAUNCH_FAILURE) + OSTREAM_CURAND_STATUS(PREEXISTING_FAILURE) + OSTREAM_CURAND_STATUS(INITIALIZATION_FAILED) + OSTREAM_CURAND_STATUS(ARCH_MISMATCH) + OSTREAM_CURAND_STATUS(INTERNAL_ERROR) + default: + in << "curandStatus_t(" << static_cast<int>(status) << ")"; + return in; + } +} + +namespace perftools { +namespace gputools { +namespace cuda { + +PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin); + +namespace dynload { + +#define PERFTOOLS_GPUTOOLS_CURAND_WRAP(__name) \ + struct DynLoadShim__##__name { \ + static const char *kName; \ + using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \ + static void *GetDsoHandle() { \ + static auto status = internal::CachedDsoLoader::GetCurandDsoHandle(); \ + return status.ValueOrDie(); \ + } \ + static FuncPointerT DynLoad() { \ + static void *f = dlsym(GetDsoHandle(), kName); \ + CHECK(f != nullptr) << "could not find " << kName \ + << " in curand DSO; dlerror: " << dlerror(); \ + return reinterpret_cast<FuncPointerT>(f); \ + } \ + template <typename... Args> \ + curandStatus_t operator()(CUDAExecutor * parent, Args... 
args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandCreateGenerator); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandDestroyGenerator); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetStream); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniform); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniformDouble); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetPseudoRandomGeneratorSeed); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetGeneratorOffset); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormal); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormalDouble); + +} // namespace dynload + +template <typename T> +string TypeString(); + +template <> +string TypeString<float>() { + return "float"; +} + +template <> +string TypeString<double>() { + return "double"; +} + +template <> +string TypeString<std::complex<float>>() { + return "std::complex<float>"; +} + +template <> +string TypeString<std::complex<double>>() { + return "std::complex<double>"; +} + +CUDARng::CUDARng(CUDAExecutor *parent) : parent_(parent), rng_(nullptr) {} + +CUDARng::~CUDARng() { + if (rng_ != nullptr) { + dynload::curandDestroyGenerator(parent_, rng_); + } +} + +bool CUDARng::Init() { + mutex_lock lock{mu_}; + CHECK(rng_ == nullptr); + + curandStatus_t ret = + dynload::curandCreateGenerator(parent_, &rng_, CURAND_RNG_PSEUDO_DEFAULT); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to create random number generator: " << ret; + return false; + } + + CHECK(rng_ != nullptr); + return true; +} + +bool CUDARng::SetStream(Stream *stream) { + curandStatus_t ret = + dynload::curandSetStream(parent_, rng_, AsCUDAStreamValue(stream)); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for random generation: " << ret; + return false; + } + + return true; +} + +// Returns true if std::complex stores its contents as two consecutive +// elements. Tests int, float and double, as the last two are independent +// specializations. +constexpr bool ComplexIsConsecutiveFloats() { + return sizeof(std::complex<int>) == 8 && sizeof(std::complex<float>) == 8 && + sizeof(std::complex<double>) == 16; +} + +template <typename T> +bool CUDARng::DoPopulateRandUniformInternal(Stream *stream, + DeviceMemory<T> *v) { + mutex_lock lock{mu_}; + static_assert(ComplexIsConsecutiveFloats(), + "std::complex values are not stored as consecutive values"); + + if (!SetStream(stream)) { + return false; + } + + // std::complex<T> is currently implemented as two consecutive T variables. 
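+  // Editorial note (not in the original source): because of that layout, a
+  // DeviceMemory<std::complex<float>> holding N logical elements can be
+  // filled by generating 2 * N floats (e.g. 128 complex values require 256
+  // generated floats), which is what the doubling below accomplishes.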
+ uint64 element_count = v->ElementCount(); + if (std::is_same<T, std::complex<float>>::value || + std::is_same<T, std::complex<double>>::value) { + element_count *= 2; + } + + curandStatus_t ret; + if (std::is_same<T, float>::value || + std::is_same<T, std::complex<float>>::value) { + ret = dynload::curandGenerateUniform( + parent_, rng_, reinterpret_cast<float *>(CUDAMemoryMutable(v)), + element_count); + } else { + ret = dynload::curandGenerateUniformDouble( + parent_, rng_, reinterpret_cast<double *>(CUDAMemoryMutable(v)), + element_count); + } + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to do uniform generation of " << v->ElementCount() + << " " << TypeString<T>() << "s at " << v->opaque() << ": " + << ret; + return false; + } + + return true; +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<float>> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<double>> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +template <typename ElemT, typename FuncT> +bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, + ElemT stddev, + DeviceMemory<ElemT> *v, + FuncT func) { + mutex_lock lock{mu_}; + + if (!SetStream(stream)) { + return false; + } + + uint64 element_count = v->ElementCount(); + curandStatus_t ret = + func(parent_, rng_, CUDAMemoryMutable(v), element_count, mean, stddev); + + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount() + << " floats at " << v->opaque() << ": " << ret; + return false; + } + + return true; +} + +bool CUDARng::DoPopulateRandGaussian(Stream *stream, float mean, float stddev, + DeviceMemory<float> *v) { + return DoPopulateRandGaussianInternal(stream, mean, stddev, v, + dynload::curandGenerateNormal); +} + +bool CUDARng::DoPopulateRandGaussian(Stream *stream, double mean, double stddev, + DeviceMemory<double> *v) { + return DoPopulateRandGaussianInternal(stream, mean, stddev, v, + dynload::curandGenerateNormalDouble); +} + +bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) { + mutex_lock lock{mu_}; + CHECK(rng_ != nullptr); + + if (!CheckSeed(seed, seed_bytes)) { + return false; + } + + if (!SetStream(stream)) { + return false; + } + + // Requires 8 bytes of seed data; checked in RngSupport::CheckSeed (above) + // (which itself requires 16 for API consistency with host RNG fallbacks). 
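+  // Editorial clarification (not in the original source): only the first
+  // sizeof(uint64) bytes of `seed` are consumed; they are reinterpreted in
+  // host byte order as the single 64-bit seed that cuRAND expects, after
+  // which the generator offset is reset to 0 below.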
+ curandStatus_t ret = dynload::curandSetPseudoRandomGeneratorSeed( + parent_, rng_, *(reinterpret_cast<const uint64 *>(seed))); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set rng seed: " << ret; + return false; + } + + ret = dynload::curandSetGeneratorOffset(parent_, rng_, 0); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to reset rng position: " << ret; + return false; + } + return true; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +namespace gpu = ::perftools::gputools; + +REGISTER_MODULE_INITIALIZER(register_curand, { + gpu::port::Status status = + gpu::PluginRegistry::Instance() + ->RegisterFactory<gpu::PluginRegistry::RngFactory>( + gpu::cuda::kCudaPlatformId, gpu::cuda::kCuRandPlugin, "cuRAND", + [](gpu::internal::StreamExecutorInterface + *parent) -> gpu::rng::RngSupport * { + gpu::cuda::CUDAExecutor *cuda_executor = + dynamic_cast<gpu::cuda::CUDAExecutor *>(parent); + if (cuda_executor == nullptr) { + LOG(ERROR) + << "Attempting to initialize an instance of the cuRAND " + << "support library with a non-CUDA StreamExecutor"; + return nullptr; + } + + gpu::cuda::CUDARng *rng = new gpu::cuda::CUDARng(cuda_executor); + if (!rng->Init()) { + // Note: Init() will log a more specific error. + delete rng; + return nullptr; + } + return rng; + }); + + if (!status.ok()) { + LOG(ERROR) << "Unable to register cuRAND factory: " + << status.error_message(); + } + + // Prime the cuRAND DSO. The loader will log more information. + auto statusor = gpu::internal::CachedDsoLoader::GetCurandDsoHandle(); + if (!statusor.ok()) { + LOG(INFO) << "Unable to load cuRAND DSO."; + } + + gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId, + gpu::PluginKind::kRng, + gpu::cuda::kCuRandPlugin); +}); diff --git a/tensorflow/stream_executor/cuda/cuda_rng.h b/tensorflow/stream_executor/cuda/cuda_rng.h new file mode 100644 index 0000000000..4e1b82969b --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_rng.h @@ -0,0 +1,89 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_ + +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/rng.h" + +typedef struct curandGenerator_st *curandGenerator_t; + +namespace perftools { +namespace gputools { + +class Stream; +template <typename ElemT> +class DeviceMemory; + +namespace cuda { + +// Opaque and unique identifier for the cuRAND plugin. +extern const PluginId kCuRandPlugin; + +class CUDAExecutor; + +// CUDA-platform implementation of the random number generation support +// interface. +// +// Thread-safe post-initialization. +class CUDARng : public rng::RngSupport { + public: + explicit CUDARng(CUDAExecutor *parent); + + // Retrieves a curand library generator handle. This is necessary for + // enqueuing random number generation work onto the device. + // TODO(leary) provide a way for users to select the RNG algorithm. + bool Init(); + + // Releases a curand library generator handle, if one was acquired. + ~CUDARng() override; + + // See rng::RngSupport for details on the following overrides. 
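+  //
+  // Editorial usage sketch (hypothetical call site, not in the original
+  // header), assuming a live Stream and a DeviceMemory<float> buffer:
+  //
+  //   CUDARng rng{cuda_executor};
+  //   if (rng.Init() && rng.DoPopulateRandUniform(&stream, &device_floats)) {
+  //     // Generation has been enqueued on `stream`; the values are uniform
+  //     // single-precision floats produced by cuRAND's default generator.
+  //   }
+  //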
+ bool DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) override; + bool DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) override; + bool DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<float>> *v) override; + bool DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<double>> *v) override; + bool DoPopulateRandGaussian(Stream *stream, float mean, float stddev, + DeviceMemory<float> *v) override; + bool DoPopulateRandGaussian(Stream *stream, double mean, double stddev, + DeviceMemory<double> *v) override; + + bool SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) override; + + private: + // Actually performs the work of generating random numbers - the public + // methods are thin wrappers to this interface. + template <typename T> + bool DoPopulateRandUniformInternal(Stream *stream, DeviceMemory<T> *v); + template <typename ElemT, typename FuncT> + bool DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, ElemT stddev, + DeviceMemory<ElemT> *v, FuncT func); + + // Sets the stream for the internal curand generator. + // + // This is a stateful operation, as the handle can only have one stream set at + // a given time, so it is usually performed right before enqueuing work to do + // with random number generation. + bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // mutex that guards the cuRAND handle for this device. + mutex mu_; + + // CUDAExecutor which instantiated this CUDARng. + // Immutable post-initialization. + CUDAExecutor *parent_; + + // cuRANDalibrary handle on the device. + curandGenerator_t rng_ GUARDED_BY(mu_); + + SE_DISALLOW_COPY_AND_ASSIGN(CUDARng); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_stream.cc b/tensorflow/stream_executor/cuda/cuda_stream.cc new file mode 100644 index 0000000000..e70579b55c --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_stream.cc @@ -0,0 +1,51 @@ +#include "tensorflow/stream_executor/cuda/cuda_stream.h" + +#include "tensorflow/stream_executor/lib/status.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +bool CUDAStream::Init() { + return CUDADriver::CreateStream(parent_->cuda_context(), &cuda_stream_); +} + +void CUDAStream::Destroy() { + { + mutex_lock lock{mu_}; + if (completed_event_ != nullptr) { + port::Status status = + CUDADriver::DestroyEvent(parent_->cuda_context(), &completed_event_); + if (!status.ok()) { + LOG(ERROR) << status.error_message(); + } + } + } + + CUDADriver::DestroyStream(parent_->cuda_context(), &cuda_stream_); +} + +bool CUDAStream::IsIdle() const { + return CUDADriver::IsStreamIdle(parent_->cuda_context(), cuda_stream_); +} + +bool CUDAStream::GetOrCreateCompletedEvent(CUevent *completed_event) { + mutex_lock lock{mu_}; + if (completed_event_ != nullptr) { + *completed_event = completed_event_; + return true; + } + + if (!CUDADriver::CreateEvent(parent_->cuda_context(), &completed_event_, + CUDADriver::EventFlags::kDisableTiming) + .ok()) { + return false; + } + + *completed_event = completed_event_; + return true; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_stream.h b/tensorflow/stream_executor/cuda/cuda_stream.h new file mode 100644 index 0000000000..f6db64a1bf --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_stream.h @@ -0,0 +1,74 @@ +// Defines the CUDAStream type - 
the CUDA-specific implementation of the generic +// StreamExecutor Stream interface. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_ + +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +class CUDAExecutor; + +// Wraps a CUstream in order to satisfy the platform-independent +// StreamInterface. +// +// Thread-safe post-initialization. +class CUDAStream : public internal::StreamInterface { + public: + explicit CUDAStream(CUDAExecutor *parent) + : parent_(parent), cuda_stream_(nullptr), completed_event_(nullptr) {} + + // Note: teardown is handled by a parent's call to DeallocateStream. + ~CUDAStream() override {} + + void *CudaStreamHack() override { return cuda_stream_; } + void **CudaStreamMemberHack() override { + return reinterpret_cast<void **>(&cuda_stream_); + } + + // Explicitly initialize the CUDA resources associated with this stream, used + // by StreamExecutor::AllocateStream(). + bool Init(); + + // Explicitly destroy the CUDA resources associated with this stream, used by + // StreamExecutor::DeallocateStream(). + void Destroy(); + + // Returns true if no work is pending or executing on the stream. + bool IsIdle() const; + + // Retrieves an event which indicates that all work enqueued into the stream + // has completed. Ownership of the event is not transferred to the caller, the + // event is owned by this stream. + bool GetOrCreateCompletedEvent(CUevent *completed_event); + + // Returns the CUstream value for passing to the CUDA API. + // + // Precond: this CUDAStream has been allocated (otherwise passing a nullptr + // into the NVIDIA library causes difficult-to-understand faults). + CUstream cuda_stream() const { + DCHECK(cuda_stream_ != nullptr); + return const_cast<CUstream>(cuda_stream_); + } + + CUDAExecutor *parent() const { return parent_; } + + private: + mutex mu_; // mutex that guards the completion event. + CUDAExecutor *parent_; // Executor that spawned this stream. + CUstream cuda_stream_; // Wrapped CUDA stream handle. + + // Event that indicates this stream has completed. 
+ CUevent completed_event_ GUARDED_BY(mu_); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/cuda/cuda_timer.cc new file mode 100644 index 0000000000..ad5e13ab6b --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_timer.cc @@ -0,0 +1,73 @@ +#include "tensorflow/stream_executor/cuda/cuda_timer.h" + +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/lib/status.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +bool CUDATimer::Init() { + CHECK(start_event_ == nullptr && stop_event_ == nullptr); + CUcontext context = parent_->cuda_context(); + if (!CUDADriver::CreateEvent(context, &start_event_, + CUDADriver::EventFlags::kDefault) + .ok()) { + return false; + } + + if (!CUDADriver::CreateEvent(context, &stop_event_, + CUDADriver::EventFlags::kDefault) + .ok()) { + port::Status status = CUDADriver::DestroyEvent(context, &start_event_); + if (!status.ok()) { + LOG(ERROR) << status; + } + return false; + } + + CHECK(start_event_ != nullptr && stop_event_ != nullptr); + return true; +} + +void CUDATimer::Destroy() { + CUcontext context = parent_->cuda_context(); + port::Status status = CUDADriver::DestroyEvent(context, &start_event_); + if (!status.ok()) { + LOG(ERROR) << status; + } + + status = CUDADriver::DestroyEvent(context, &stop_event_); + if (!status.ok()) { + LOG(ERROR) << status; + } +} + +float CUDATimer::GetElapsedMilliseconds() const { + CHECK(start_event_ != nullptr && stop_event_ != nullptr); + // TODO(leary) provide a way to query timer resolution? + // CUDA docs say a resolution of about 0.5us + float elapsed_milliseconds = NAN; + (void)CUDADriver::GetEventElapsedTime(parent_->cuda_context(), + &elapsed_milliseconds, start_event_, + stop_event_); + return elapsed_milliseconds; +} + +bool CUDATimer::Start(CUDAStream *stream) { + return CUDADriver::RecordEvent(parent_->cuda_context(), start_event_, + stream->cuda_stream()) + .ok(); +} + +bool CUDATimer::Stop(CUDAStream *stream) { + return CUDADriver::RecordEvent(parent_->cuda_context(), stop_event_, + stream->cuda_stream()) + .ok(); +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h new file mode 100644 index 0000000000..e49e212403 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_timer.h @@ -0,0 +1,69 @@ +// Defines the CUDATimer type - the CUDA-specific implementation of the generic +// StreamExecutor Timer interface. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_ + +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +class CUDAExecutor; +class CUDAStream; + +// Wraps a pair of CUevents in order to satisfy the platform-independent +// TimerInferface -- both a start and a stop event are present which may be +// recorded in a stream. 
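+//
+// Editorial usage sketch (not in the original header); error handling and the
+// StreamExecutor-level teardown described below are omitted:
+//
+//   CUDATimer timer{cuda_executor};
+//   timer.Init();
+//   timer.Start(cuda_stream);
+//   ... enqueue work on the stream ...
+//   timer.Stop(cuda_stream);
+//   uint64 usecs = timer.Microseconds();  // valid once both events complete
+//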
+class CUDATimer : public internal::TimerInterface {
+ public:
+  explicit CUDATimer(CUDAExecutor *parent)
+      : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
+
+  // Note: teardown is explicitly handled in this API by a call to
+  // StreamExecutor::DeallocateTimer(), which invokes Destroy().
+  ~CUDATimer() override {}
+
+  // Allocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::AllocateTimer().
+  bool Init();
+
+  // Deallocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::DeallocateTimer().
+  void Destroy();
+
+  // Records the "timer start" event at the current point in the stream.
+  bool Start(CUDAStream *stream);
+
+  // Records the "timer stop" event at the current point in the stream.
+  bool Stop(CUDAStream *stream);
+
+  // Returns the elapsed time, in milliseconds, between the start and stop
+  // events.
+  float GetElapsedMilliseconds() const;
+
+  // See perftools::gputools::Timer::Microseconds().
+  // TODO(leary) make this into an error code interface...
+  uint64 Microseconds() const override {
+    return GetElapsedMilliseconds() * 1e3;
+  }
+
+  // See perftools::gputools::Timer::Nanoseconds().
+  uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
+
+ private:
+  CUDAExecutor *parent_;
+  CUevent start_event_;  // Event recorded to indicate the "start" timestamp
+                         // executing in a stream.
+  CUevent stop_event_;   // Event recorded to indicate the "stop" timestamp
+                         // executing in a stream.
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
diff --git a/tensorflow/stream_executor/cuda/multi_op_activation.h b/tensorflow/stream_executor/cuda/multi_op_activation.h
new file mode 100644
index 0000000000..ba2bcd3a91
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/multi_op_activation.h
@@ -0,0 +1,16 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_MULTI_OP_ACTIVATION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_MULTI_OP_ACTIVATION_H_
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+// Type-safe boolean wrapper: denotes whether a ScopedActivateExecutorContext
+// may have other ScopedActivateExecutorContexts nested within it.
+enum class MultiOpActivation { kNo = false, kYes = true };
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_MULTI_OP_ACTIVATION_H_