Diffstat (limited to 'tensorflow/stream_executor/cuda/cuda_driver.cc')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_driver.cc | 1608 |
1 file changed, 1608 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
new file mode 100644
index 0000000000..8c4316b4c1
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -0,0 +1,1608 @@
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <set>
+
+#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/casts.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/human_readable.h"
+#include "tensorflow/stream_executor/lib/inlined_vector.h"
+#include "tensorflow/stream_executor/lib/notification.h"
+#include "tensorflow/stream_executor/lib/stacktrace.h"
+#include "tensorflow/stream_executor/lib/static_threadlocal.h"
+#include "tensorflow/stream_executor/lib/strcat.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/lib/threadpool.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
+bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
+bool FLAGS_gpuexec_cuda_device_0_only = false;
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+namespace dynload {
+
+#define PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(__name)                              \
+  struct DynLoadShim__##__name {                                             \
+    static const char *kName;                                                \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;         \
+    static void *GetDsoHandle() {                                            \
+      static auto status = internal::CachedDsoLoader::GetLibcudaDsoHandle(); \
+      return status.ValueOrDie();                                            \
+    }                                                                        \
+    static FuncPointerT DynLoad() {                                          \
+      static void *f = dlsym(GetDsoHandle(), kName);                         \
+      CHECK(f != nullptr) << "could not find " << kName                      \
+                          << " in libcuda DSO; dlerror: " << dlerror();      \
+      return reinterpret_cast<FuncPointerT>(f);                              \
+    }                                                                        \
+    template <typename... Args>                                              \
+    CUresult operator()(Args... args) {                                      \
+      return DynLoad()(args...);                                             \
+    }                                                                        \
+  } __name;                                                                  \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxCreate_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxDestroy);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxEnablePeerAccess);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetCurrent);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetDevice);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetSharedMemConfig);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxPopCurrent_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetCurrent);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetSharedMemConfig);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSynchronize);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceComputeCapability);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceCanAccessPeer);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGet);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetAttribute);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetCount);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetName);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetPCIBusId);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetProperties);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceTotalMem);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDriverGetVersion);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventCreate);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventDestroy_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventElapsedTime);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventQuery);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventRecord);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncGetAttribute);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncSetCacheConfig);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorName);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorString);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuInit);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuLaunchKernel);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemAlloc_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoD_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoH_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoD_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoDAsync_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoHAsync_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoDAsync_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetAddressRange_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFree_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFreeHost);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetInfo_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostAlloc);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostRegister_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostUnregister);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32Async);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetFunction);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetGlobal_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadDataEx);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadFatBinary);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleUnload);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuOccupancyMaxActiveBlocksPerMultiprocessor);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuPointerGetAttribute);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamAddCallback);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamCreate);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamDestroy_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamQuery);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamSynchronize);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamWaitEvent);
+
+}  // namespace dynload
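The macro above generates one lazily-resolved dlsym stub per driver entry point. A minimal standalone sketch of the same pattern (illustrative, not part of this patch; assumes a POSIX system with libcuda.so.1 installed and -ldl at link time):

#include <dlfcn.h>
#include <cstdio>

// CUresult is an integer-valued enum in cuda.h; plain ints keep this sketch
// header-free. 0 corresponds to CUDA_SUCCESS.
using CuInitFn = int (*)(unsigned int);
using CuDeviceGetCountFn = int (*)(int *);

static void *LibcudaHandle() {
  static void *handle = dlopen("libcuda.so.1", RTLD_LAZY);
  return handle;
}

template <typename FnT>
FnT LoadSym(const char *name) {
  void *handle = LibcudaHandle();
  // dlsym resolution happens once per symbol; callers cache the result.
  return handle ? reinterpret_cast<FnT>(dlsym(handle, name)) : nullptr;
}

int main() {
  auto cu_init = LoadSym<CuInitFn>("cuInit");
  auto cu_device_get_count = LoadSym<CuDeviceGetCountFn>("cuDeviceGetCount");
  if (cu_init == nullptr || cu_device_get_count == nullptr) {
    std::fprintf(stderr, "libcuda.so.1 not available\n");
    return 1;
  }
  int count = 0;
  if (cu_init(0) == 0 && cu_device_get_count(&count) == 0) {
    std::printf("%d CUDA device(s)\n", count);
  }
  return 0;
}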
+
+namespace {
+
+// Manages the singleton set of contexts that we've created. This is used for
+// checking that no CUDA-runtime-created contexts have been generated
+// accidentally. CUDA-runtime-created contexts are avoided, if triple angle
+// bracket launches are required, by using the scoped activations in
+// cuda_activation.h.
+class CreatedContexts {
+ public:
+  // Returns whether context is a member of the live set.
+  static bool Has(CUcontext context) {
+    shared_lock lock{mu_};
+    return Live()->find(context) != Live()->end();
+  }
+
+  // Adds context to the live set.
+  static void Add(CUcontext context) {
+    CHECK(context != nullptr);
+    mutex_lock lock{mu_};
+    Live()->emplace(context);
+  }
+
+  // Removes context from the live set.
+  static void Remove(CUcontext context) {
+    CHECK(context != nullptr);
+    mutex_lock lock{mu_};
+    Live()->erase(context);
+  }
+
+ private:
+  // Returns the live set singleton.
+  static std::set<CUcontext> *Live() {
+    static auto singleton = new std::set<CUcontext>;
+    return singleton;
+  }
+
+  // Lock that guards access-to/mutation-of the live set.
+  static mutex mu_;
+};
+
+/* static */ mutex CreatedContexts::mu_{LINKER_INITIALIZED};
+
+// Formats CUresult to output prettified values into a log stream.
+// Error summaries taken from:
+// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9
+//
+// TODO(leary) switch to cuGetErrorName when an updated cuda.h is available.
+string ToString(CUresult result) {
+#define OSTREAM_CUDA_ERROR(__name) \
+  case CUDA_ERROR_##__name:        \
+    return "CUDA_ERROR_" #__name;
+
+///////////////
+// NOTE: here we specify return code values outside of the enum explicitly
+// because our in-tree cuda.h is from the CUDA 5.5 SDK, but CUDA 6.0+ driver
+// libraries are deployed in the fleet. These error codes are backwards
+// compatible, but if we see a "new" one, we want to be able to identify it in
+// the logs.
+//
+// Once we get a cuda.h that has cuGetErrorName (TODO is above) we can
+// eliminate this function and just rely on the driver to provide us these
+// strings.
+//
+// NOTE: "Must reboot all context" below is shorthand for "must
+// destroy/recreate the offending context and any allocations which come from
+// it if you are to continue using CUDA."
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wswitch"
+  switch (result) {
+    OSTREAM_CUDA_ERROR(INVALID_VALUE)
+    OSTREAM_CUDA_ERROR(OUT_OF_MEMORY)
+    OSTREAM_CUDA_ERROR(NOT_INITIALIZED)
+    OSTREAM_CUDA_ERROR(DEINITIALIZED)
+    OSTREAM_CUDA_ERROR(NO_DEVICE)
+    OSTREAM_CUDA_ERROR(INVALID_DEVICE)
+    OSTREAM_CUDA_ERROR(INVALID_IMAGE)
+    OSTREAM_CUDA_ERROR(INVALID_CONTEXT)
+    OSTREAM_CUDA_ERROR(INVALID_HANDLE)
+    OSTREAM_CUDA_ERROR(NOT_FOUND)
+    OSTREAM_CUDA_ERROR(NOT_READY)
+    OSTREAM_CUDA_ERROR(NO_BINARY_FOR_GPU)
+
+    // Encountered an uncorrectable ECC error during execution.
+    OSTREAM_CUDA_ERROR(ECC_UNCORRECTABLE)
+
+    // Load/store on an invalid address. Must reboot all context.
+    case 700:
+      return "CUDA_ERROR_ILLEGAL_ADDRESS";
+    // Passed too many / wrong arguments, too many threads for register count.
+    case 701:
+      return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
+    // Kernel took too long to execute.
+    case 702:
+      return "CUDA_ERROR_LAUNCH_TIMEOUT";
+    // Kernel launch uses an incompatible texturing mode.
+    case 703:
+      return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
+    // Trying to re-enable peer access that already has it enabled.
+    case 704:
+      return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
+    // Trying to disable peer access that has not yet been enabled.
+    case 705:
+      return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
+    // Primary context for the specified device has already been initialized.
+    case 708:
+      return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
+    // Context current to calling thread has been destroyed or is a primary
+    // context that has not yet been initialized.
+    case 709:
+      return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
+    // Device-side assert triggered during kernel execution. Must reboot all
+    // context.
+    case 710:
+      return "CUDA_ERROR_ASSERT";
+    // Hardware resources to enable peer access have been exhausted.
+    case 711:
+      return "CUDA_ERROR_TOO_MANY_PEERS";
+    // Memory range has already been registered.
+    case 712:
+      return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
+    // Pointer does not correspond to any currently registered memory region.
+    case 713:
+      return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
+    // Due to stack corruption or exceeding stack size limit. Must reboot all
+    // context.
+    case 714:
+      return "CUDA_ERROR_HARDWARE_STACK_ERROR";
+    case 715:
+      return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
+    // Load/store on an unaligned memory address. Must reboot all context.
+    case 716:
+      return "CUDA_ERROR_MISALIGNED_ADDRESS";
+    // Device instruction with specific address space given address not
+    // belonging to allowed address space. Must reboot all context.
+    case 717:
+      return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
+    // Device program counter wrapped its address space. Must reboot all
+    // context.
+    case 718:
+      return "CUDA_ERROR_INVALID_PC";
+    // Exception on device while executing a kernel; e.g. deref invalid device
+    // pointer, accessing OOB shared memory. Must reboot all context.
+    case 719:
+      return "CUDA_ERROR_LAUNCH_FAILED";
+
+    OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE)
+    OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED)
+    OSTREAM_CUDA_ERROR(NOT_PERMITTED)
+    OSTREAM_CUDA_ERROR(NOT_SUPPORTED)
+    OSTREAM_CUDA_ERROR(UNKNOWN)  // Unknown internal error to CUDA.
+    default:
+      return port::StrCat("CUresult(", static_cast<int>(result), ")");
+  }
+#pragma GCC diagnostic pop
+}
+
+// Returns the current context and checks that it is in the set of CUDA
+// contexts created by StreamExecutor (to ensure that the CUDA runtime didn't
+// create a context behind our backs).
+CUcontext CurrentContext() {
+  CUcontext current = nullptr;
+  CUresult result = dynload::cuCtxGetCurrent(&current);
+  if (result != CUDA_SUCCESS) {
+    LOG(FATAL) << "failed to query current context: " << ToString(result);
+  }
+  if (current != nullptr && !CreatedContexts::Has(current)) {
+    LOG(FATAL) << "current context was not created by the StreamExecutor "
+                  "cuda_driver API: "
+               << current
+               << "; a CUDA runtime call "
+                  "was likely performed without using a StreamExecutor context";
+  }
+  return current;
+}
+
+// "Pops" the current context, checks that it matches expected, and checks the
+// postcondition that the current context is nullptr.
+//
+// This is not done when we're nested within a MultiOpActivation, as we want to
+// persist the active context until the MultiOpActivation is popped.
+void PopContextAndCheckNowNull(CUcontext expected) {
+  CUcontext actual = CurrentContext();
+  CHECK_EQ(expected, actual) << "would pop unexpected context";
+  CUcontext popped;
+  CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxPopCurrent_v2(&popped));
+  CHECK_EQ(expected, popped);
+  CHECK(nullptr == CurrentContext());
+  VLOG(3) << "popped context " << expected
+          << " and current context is now null";
+}
+
+// CUDA driver routines may require a large amount of stack (particularly
+// cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
+// stack-limited threads (such as those spawned by a default-argument
+// thread::ThreadPool on some platforms), we run certain routines in this pool
+// and wait for completion.
+static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
+static port::ThreadPool *InitializeDriverExecutor() {
+  return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
+                              "cuda_driver", 1);
+}
+
+port::ThreadPool *GetDriverExecutor() {
+  mutex_lock lock(driver_executor_threadpool_mu);
+  static port::ThreadPool *thread_pool = InitializeDriverExecutor();
+  return thread_pool;
+}
+
+}  // namespace
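LoadPtx below hops onto this dedicated, full-stack thread and blocks the caller via a port::Notification. A self-contained sketch of that hand-off pattern using only the standard library (illustrative only; the patch itself uses port::ThreadPool plus port::Notification):

#include <future>
#include <thread>
#include <utility>

// Run fn on a fresh thread (which gets a full-sized stack) and block the
// caller until it completes, returning fn's result.
template <typename Fn>
auto RunOnDedicatedThread(Fn fn) -> decltype(fn()) {
  std::packaged_task<decltype(fn())()> task(std::move(fn));
  auto result = task.get_future();
  std::thread worker(std::move(task));
  worker.join();  // analogous to notification.WaitForNotification()
  return result.get();
}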
+
+// Thread-local storage that indicates whether a CUDA context activation is
+// being nested within an outer, MultiOpActivation. In that case, we should not
+// pop the context to nullptr when we are done with the current activation.
+SE_STATIC_THREAD_LOCAL_POD(bool, tls_in_multi_op_activation);
+
+string MemorySpaceString(MemorySpace memory_space) {
+  switch (memory_space) {
+    case MemorySpace::kHost:
+      return "host";
+    case MemorySpace::kDevice:
+      return "device";
+    default:
+      LOG(FATAL) << "impossible memory space";
+  }
+}
+
+// Implementation note: the CUDA context is held, per-thread, in TLS. We avoid
+// setting it all the time because it's not clear what side effects might occur
+// for a "set" operation, whereas a "get" operation we can reasonably assume is
+// a TLS read.
+//
+// We cannot race here because CUcontext is associated with a particular thread
+// and stored in TLS; and these interfaces should not be used from signal
+// handlers.
+ScopedActivateContext::ScopedActivateContext(CUcontext context,
+                                             MultiOpActivation moa)
+    : context_(CHECK_NOTNULL(context)),
+      previously_in_multi_op_activation_(tls_in_multi_op_activation.get()) {
+  if (static_cast<bool>(moa)) {
+    tls_in_multi_op_activation.get() = true;
+  }
+
+  CUcontext current = prior_context_ = CurrentContext();
+  if (current != context) {
+    VLOG(3) << "ScopedActivateContext switching context from " << current
+            << " to " << context;
+    CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(context));
+    if (FLAGS_gpuexec_cuda_sync_around_driver_calls) {
+      auto res = dynload::cuCtxSynchronize();
+      if (res != CUDA_SUCCESS) {
+        LOG(FATAL) << "gpuexec_cuda_sync_around_driver_calls found "
+                   << ToString(res)
+                   << " immediately after establishing the device context "
+                   << context << " :: " << port::CurrentStackTrace();
+      }
+    }
+  }
+}
+
+ScopedActivateContext::~ScopedActivateContext() {
+  if (tls_in_multi_op_activation.get()) {
+    CHECK_EQ(context_, CurrentContext());
+    if (FLAGS_gpuexec_cuda_sync_around_driver_calls) {
+      auto res = dynload::cuCtxSynchronize();
+      if (res != CUDA_SUCCESS) {
+        LOG(FATAL) << "gpuexec_cuda_sync_around_driver_calls found "
+                   << ToString(res)
+                   << " immediately after de-establishing the device context "
+                   << context_ << " :: " << port::CurrentStackTrace();
+      }
+    }
+    CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(prior_context_));
+  } else {
+    PopContextAndCheckNowNull(context_);
+  }
+  tls_in_multi_op_activation.get() = previously_in_multi_op_activation_;
+}
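A usage sketch for the activation scopes above (illustrative; assumes MultiOpActivation is the two-state enum from cuda_activation.h with a kYes value and that the constructor's second parameter defaults to the "no" state):

void TwoDriverOps(CUcontext context) {
  {
    // Plain activation: sets `context` current, pops back to nullptr on exit.
    ScopedActivateContext activation{context};
    // ... single driver call ...
  }
  {
    // MultiOpActivation: inner scopes see the context already current, and
    // the destructor restores the prior context instead of popping to null.
    ScopedActivateContext outer{context, MultiOpActivation::kYes};
    ScopedActivateContext inner{context};  // cheap: TLS get, no context switch
    // ... several driver calls ...
  }
}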
+
+namespace {
+
+// Returns a stringified device number associated with pointer, primarily for
+// logging purposes. Returns "?" if the device could not be successfully
+// queried.
+string CUDAPointerToDeviceString(CUdeviceptr pointer) {
+  auto value = CUDADriver::GetPointerDevice(pointer);
+  if (value.ok()) {
+    return port::StrCat(value.ValueOrDie());
+  }
+  LOG(ERROR) << "could not query device: " << value.status();
+  return "?";
+}
+
+// Returns a stringified memory space associated with pointer, primarily for
+// logging purposes. Returns "?" if the memory space could not be successfully
+// queried.
+string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
+  auto value = CUDADriver::GetPointerMemorySpace(pointer);
+  if (value.ok()) {
+    return MemorySpaceString(value.ValueOrDie());
+  }
+  LOG(ERROR) << "could not query memory space: " << value.status();
+  return "?";
+}
+
+// Returns a stringified representation of whether or not peer access is
+// permitted between the "from" and "to" pointers' associated contexts,
+// primarily for logging purposes. Returns "error" if an error is encountered
+// in the process of querying.
+string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
+  auto from_context = CUDADriver::GetPointerContext(from);
+  if (!from_context.ok()) {
+    LOG(ERROR) << "could not retrieve source pointer's context: "
+               << from_context.status();
+    return "error";
+  }
+  auto to_context = CUDADriver::GetPointerContext(to);
+  if (!to_context.ok()) {
+    LOG(ERROR) << "could not retrieve destination pointer's context: "
+               << to_context.status();
+    return "error";
+  }
+  return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(),
+                                         to_context.ValueOrDie())
+             ? "true"
+             : "false";
+}
+
+// Actually performs the work of CUDA initialization. Wrapped up in one-time
+// execution guard.
+static port::Status InternalInit() {
+  CUresult res = CUDA_ERROR_NO_DEVICE;
+  if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
+    LOG(ERROR) << "injecting CUDA init error; initialization will fail";
+  } else if (internal::CachedDsoLoader::GetLibcudaDsoHandle().ok()) {
+    // We only call cuInit if we can dynload libcuda.
+    res = dynload::cuInit(0 /* = flags */);
+  }
+
+  if (res == CUDA_SUCCESS) {
+    return port::Status::OK();
+  }
+
+  LOG(ERROR) << "failed call to cuInit: " << ToString(res);
+  Diagnostician::LogDiagnosticInformation();
+  return port::Status{port::error::ABORTED,
+                      port::StrCat("failed call to cuInit: ", ToString(res))};
+}
+
+}  // namespace
+
+/* static */ port::Status CUDADriver::Init() {
+  // Cached return value from calling InternalInit(), as cuInit need only be
+  // called once, but CUDADriver::Init may be called many times.
+  static port::Status init_retval;
+  static bool set = false;
+  static mutex init_mu(LINKER_INITIALIZED);
+
+  mutex_lock lock(init_mu);
+  if (!set) {
+    init_retval = InternalInit();
+    set = true;
+  }
+
+  return init_retval;
+}
+
+/* static */ port::Status CUDADriver::GetDevice(int device_ordinal,
+                                                CUdevice *device) {
+  CUresult res = dynload::cuDeviceGet(device, device_ordinal);
+  if (res == CUDA_SUCCESS) {
+    return port::Status::OK();
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      port::StrCat("failed call to cuDeviceGet: ", ToString(res))};
+}
+
+/* static */ bool CUDADriver::GetDeviceName(CUdevice device,
+                                            string *device_name) {
+  static const size_t kCharLimit = 64;
+  port::InlinedVector<char, 4> chars(kCharLimit);
+  CUresult res =
+      dynload::cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to get device name for " << device << ": "
+               << ToString(res);
+    return false;
+  }
+  chars[kCharLimit - 1] = '\0';
+  *device_name = chars.begin();
+  return true;
+}
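A hedged usage sketch of the initialization and device-lookup entry points above (hypothetical helper, not part of the patch; device ordinal 0 is assumed to exist):

port::StatusOr<CUdevice> OpenDeviceZero() {
  // Init() memoizes the underlying cuInit call, so it is cheap to call from
  // every entry point that needs the driver.
  port::Status init_status = CUDADriver::Init();
  if (!init_status.ok()) {
    return init_status;
  }
  CUdevice device;
  port::Status get_status = CUDADriver::GetDevice(0, &device);
  if (!get_status.ok()) {
    return get_status;
  }
  return device;
}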
+
+bool DeviceOptionsToContextFlags(DeviceOptions device_options, int *flags) {
+  static_assert(DeviceOptions::kMask == 0xf,
+                "needs update for new device options");
+
+  if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) {
+    *flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
+  }
+
+  // If no flags are set the default is CU_CTX_SCHED_AUTO, which
+  // in Google environments is very likely to mean SPIN.
+  if (device_options.flags() & DeviceOptions::kScheduleSpin) {
+    *flags |= CU_CTX_SCHED_SPIN;
+  }
+  if (device_options.flags() & DeviceOptions::kScheduleYield) {
+    *flags |= CU_CTX_SCHED_YIELD;
+  }
+  if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) {
+    *flags |= CU_CTX_SCHED_BLOCKING_SYNC;
+  }
+
+  return true;
+}
+
+/* static */ port::Status CUDADriver::CreateContext(
+    CUdevice device, DeviceOptions device_options, CUcontext *context) {
+  CUcontext former_context = CurrentContext();
+  if (former_context != nullptr) {
+    LOG(WARNING) << "creating context when one is currently active; existing: "
+                 << former_context;
+  }
+
+  int flags = 0;
+  if (!DeviceOptionsToContextFlags(device_options, &flags)) {
+    LOG(WARNING) << "could not convert all device options into context flags";
+  }
+
+  CUresult res;
+  {
+    // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
+    // context creation: see http://b/13248943
+    res = dynload::cuCtxCreate_v2(context, flags, device);
+  }
+  if (res == CUDA_SUCCESS) {
+    CreatedContexts::Add(*context);
+    PopContextAndCheckNowNull(*context);
+    CHECK(*context != nullptr)
+        << "success in this call must entail non-null result";
+    VLOG(2) << "created context " << *context << " for this thread";
+    return port::Status::OK();
+  }
+
+  string message = "failed call to cuCtxCreate: " + ToString(res);
+  if (res == CUDA_ERROR_OUT_OF_MEMORY) {
+    uint64 total_memory;
+    if (GetDeviceTotalMemory(device, &total_memory)) {
+      port::StrAppend(&message, "; total memory reported: ", total_memory);
+    } else {
+      port::StrAppend(&message, "; could not query total memory");
+    }
+  }
+
+  return port::Status{port::error::INTERNAL, message};
+}
+
+/* static */ void CUDADriver::DestroyContext(CUcontext context) {
+  if (context == nullptr) {
+    return;
+  }
+
+  CUresult res = dynload::cuCtxDestroy_v2(context);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to destroy CUDA context; leaking: " << ToString(res);
+  }
+
+  CreatedContexts::Remove(context);
+}
+
+/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute,
+                                               CUfunction func,
+                                               int *attribute_value) {
+  CUresult res = dynload::cuFuncGetAttribute(attribute_value, attribute, func);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
+               << ", attribute: " << attribute
+               << ", result: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function,
+                                                 CUfunc_cache cache_config) {
+  CUresult res = dynload::cuFuncSetCacheConfig(function, cache_config);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: "
+               << function << ", config: " << cache_config
+               << ", result: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
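An illustrative context lifecycle built on the functions above; assumes DeviceOptions can be constructed directly from its flag constants (hypothetical call site, not part of the patch):

void WithOwnedContext(CUdevice device) {
  DeviceOptions options(DeviceOptions::kScheduleBlockingSync);
  CUcontext context = nullptr;
  port::Status status = CUDADriver::CreateContext(device, options, &context);
  if (!status.ok()) {
    LOG(ERROR) << status;
    return;
  }
  // CreateContext pops the new context off this thread, so activate it
  // explicitly for work, then balance creation with destruction.
  {
    ScopedActivateContext activation{context};
    // ... driver calls ...
  }
  CUDADriver::DestroyContext(context);
}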
" + << "Context device ID: " << device + << ", result: " << ToString(result); + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to get shared memory config: ", ToString(result))}; + } + return shared_mem_config; +} + +/* static */ port::Status CUDADriver::ContextSetSharedMemConfig( + CUcontext context, CUsharedconfig shared_mem_config) { + ScopedActivateContext activation{context}; + CUresult result = dynload::cuCtxSetSharedMemConfig(shared_mem_config); + if (result != CUDA_SUCCESS) { + CUdevice device; + dynload::cuCtxGetDevice(&device); + LOG(ERROR) << "failed to set CUDA device shared memory config. " + << "Context device ID: " << device + << ", config: " << shared_mem_config + << ", result: " << ToString(result); + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to set shared memory config: ", ToString(result))}; + } + return port::Status::OK(); +} + +/* static */ bool CUDADriver::LaunchKernel( + CUcontext context, CUfunction function, unsigned int grid_dim_x, + unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x, + unsigned int block_dim_y, unsigned int block_dim_z, + unsigned int shared_mem_bytes, CUstream stream, void **kernel_params, + void **extra) { + ScopedActivateContext activation{context}; + VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x + << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z + << " bdx: " << block_dim_x << " bdy: " << block_dim_y + << " bdz: " << block_dim_z; + CUresult res = dynload::cuLaunchKernel( + function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y, + block_dim_z, shared_mem_bytes, stream, kernel_params, extra); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to launch CUDA kernel: " << function + << "; result: " << ToString(res); + return false; + } + VLOG(2) << "successfully launched kernel"; + return true; +} + +/* static */ port::Status CUDADriver::LoadCubin(CUcontext context, + const char *cubin_bytes, + CUmodule *module) { + ScopedActivateContext activation{context}; + CUresult result = dynload::cuModuleLoadFatBinary(module, cubin_bytes); + if (result != CUDA_SUCCESS) { + return port::Status{port::error::INTERNAL, + "failed to load in-memory CUBIN: " + ToString(result)}; + } + + return port::Status::OK(); +} + +/* static */ bool CUDADriver::LoadPtx(CUcontext context, + const char *ptx_contents, + CUmodule *module) { + port::Notification notification; + bool ret = true; + GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret, + ¬ification]() { + ScopedActivateContext activation{context}; + void *ptx_data = const_cast<char *>(ptx_contents); + static const unsigned int kLogBufferBytesLimit = 1024; + unsigned int error_log_buffer_bytes = kLogBufferBytesLimit; + unsigned int info_log_buffer_bytes = kLogBufferBytesLimit; + port::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes); + port::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes); + bool log_verbose = true; + CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE}; + // Note that the driver API wants the contents of this values to be stored + // in an array of void*s, so we coerce them accordingly. 
+    void *option_values[] = {
+        port::bit_cast<void *>(uintptr_t(error_log_buffer_bytes)),
+        port::bit_cast<void *>(error_log_buffer.data()),
+        port::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
+        port::bit_cast<void *>(info_log_buffer.data()),
+        port::bit_cast<void *>(uintptr_t(log_verbose))};
+    CHECK(ARRAYSIZE(options) == ARRAYSIZE(option_values));
+
+    CUresult res;
+    {
+      // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
+      // module loading: see http://b/13248943
+      res = dynload::cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options),
+                                        options, option_values);
+    }
+
+    // The PTX JIT mutates the values in the option values array to reflect
+    // the sizes of the logs it outputs; now that we've made the call, read
+    // the values back out.
+    error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
+    info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
+    CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
+    CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);
+
+    if (res != CUDA_SUCCESS) {
+      LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
+      // As a precaution for null termination of the API-provided value,
+      // ensure that at least the last byte is null.
+      error_log_buffer[error_log_buffer_bytes ? error_log_buffer_bytes - 1
+                                              : 0] = '\0';
+      LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
+                 << " bytes): " << error_log_buffer.data();
+      ret = false;
+      notification.Notify();
+      // Return early: notifying twice would be an error, and the CHECK on
+      // module below does not hold on the failure path.
+      return;
+    }
+
+    VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
+            << " bytes): " << info_log_buffer.data();
+    VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
+            << " bytes): " << error_log_buffer.data();
+    CHECK(module != nullptr);
+    notification.Notify();
+  });
+  notification.WaitForNotification();
+
+  return ret;
+}
+
+/* static */ bool CUDADriver::SynchronousMemsetUint8(CUcontext context,
+                                                     CUdeviceptr location,
+                                                     uint8 value,
+                                                     size_t size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemsetD8_v2(location, value, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemsetUint32(CUcontext context,
+                                                      CUdeviceptr location,
+                                                      uint32 value,
+                                                      size_t uint32_count) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemsetD32_v2(location, value, uint32_count);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool CUDADriver::AsynchronousMemsetUint32(CUcontext context,
+                                                       CUdeviceptr location,
+                                                       uint32 value,
+                                                       size_t uint32_count,
+                                                       CUstream stream) {
+  ScopedActivateContext activation{context};
+  CUresult res =
+      dynload::cuMemsetD32Async(location, value, uint32_count, stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to enqueue async memset operation: "
+               << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memset operation";
+  return true;
+}
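An end-to-end sketch tying LoadPtx to GetModuleFunction and LaunchKernel (both defined below); the kernel name "my_kernel" and its single pointer parameter are hypothetical:

bool CompileAndLaunch(CUcontext context, const char *ptx, CUstream stream,
                      CUdeviceptr out_buffer) {
  CUmodule module;
  if (!CUDADriver::LoadPtx(context, ptx, &module)) {
    return false;  // JIT error log already emitted by LoadPtx
  }
  CUfunction kernel;
  if (!CUDADriver::GetModuleFunction(context, module, "my_kernel", &kernel)) {
    return false;
  }
  void *params[] = {&out_buffer};
  return CUDADriver::LaunchKernel(context, kernel,
                                  /*grid_dim=*/1, 1, 1,
                                  /*block_dim=*/128, 1, 1,
                                  /*shared_mem_bytes=*/0, stream, params,
                                  /*extra=*/nullptr);
}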
+
+/* static */ bool CUDADriver::AddStreamCallback(CUcontext context,
+                                                CUstream stream,
+                                                StreamCallback callback,
+                                                void *data) {
+  // Note: flags param is required to be zero according to CUDA 6.0.
+  CUresult res =
+      dynload::cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "unable to add host callback: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool CUDADriver::GetModuleFunction(CUcontext context,
+                                                CUmodule module,
+                                                const char *kernel_name,
+                                                CUfunction *function) {
+  ScopedActivateContext activated{context};
+  CHECK(module != nullptr && kernel_name != nullptr);
+  CUresult res = dynload::cuModuleGetFunction(function, module, kernel_name);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
+               << "\" from module: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool CUDADriver::GetModuleSymbol(CUcontext context,
+                                              CUmodule module,
+                                              const char *symbol_name,
+                                              CUdeviceptr *dptr,
+                                              size_t *bytes) {
+  ScopedActivateContext activated{context};
+  CHECK(module != nullptr && symbol_name != nullptr &&
+        (dptr != nullptr || bytes != nullptr));
+  CUresult res =
+      dynload::cuModuleGetGlobal_v2(dptr, bytes, module, symbol_name);
+  if (res != CUDA_SUCCESS) {
+    // symbol may not be found in the current module, but it may reside in
+    // another module.
+    VLOG(2) << "failed to get symbol \"" << symbol_name
+            << "\" from module: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ void CUDADriver::UnloadModule(CUcontext context,
+                                           CUmodule module) {
+  ScopedActivateContext activated{context};
+  CUresult res = dynload::cuModuleUnload(module);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to unload module " << module
+               << "; leaking: " << ToString(res);
+  }
+}
+
+/* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext(
+    CUcontext context) {
+  ScopedActivateContext activated{context};
+  CUdevice device = -1;
+  CUresult result = dynload::cuCtxGetDevice(&device);
+  if (result == CUDA_SUCCESS) {
+    return device;
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      port::StrCat("failed to get device for context: ", ToString(result))};
+}
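A usage sketch for AddStreamCallback above, assuming StreamCallback matches the driver's CUstreamCallback signature; the callback fires once prior work on the stream completes and must not re-enter the CUDA API:

void NotifyWhenDone(CUstream stream, CUresult status, void *data) {
  static_cast<port::Notification *>(data)->Notify();
}

void BlockHostUntilStreamDone(CUcontext context, CUstream stream) {
  port::Notification done;
  if (CUDADriver::AddStreamCallback(context, stream, NotifyWhenDone, &done)) {
    done.WaitForNotification();
  }
}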
+
+/* static */ bool CUDADriver::CreateStream(CUcontext context, CUstream *out) {
+  // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
+  // up synchronization with respect to memsets and any other things that have
+  // to occur on the default stream?
+  ScopedActivateContext activated{context};
+  CUresult res = dynload::cuStreamCreate(out, 0);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "could not allocate CUDA stream for context " << context
+               << ": " << ToString(res);
+    return false;
+  }
+
+  VLOG(2) << "successfully created stream " << *out << " for context "
+          << context << " on thread";
+  return true;
+}
+
+/* static */ void CUDADriver::DestroyStream(CUcontext context,
+                                            CUstream *stream) {
+  if (*stream == nullptr) {
+    return;
+  }
+
+  ScopedActivateContext activated{context};
+  CUresult res = dynload::cuStreamDestroy_v2(*stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to destroy CUDA stream for context " << context
+               << ": " << ToString(res);
+  } else {
+    VLOG(2) << "successfully destroyed stream " << *stream << " for context "
+            << context;
+    *stream = nullptr;
+  }
+}
+
+/* static */ void *CUDADriver::DeviceAllocate(CUcontext context,
+                                              uint64 bytes) {
+  ScopedActivateContext activated{context};
+  CUdeviceptr result = 0;
+  CUresult res = dynload::cuMemAlloc_v2(&result, bytes);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to allocate "
+               << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
+               << " bytes) from device: " << ToString(res);
+    return nullptr;
+  }
+  void *ptr = reinterpret_cast<void *>(result);
+  VLOG(2) << "allocated " << ptr << " for context " << context << " of "
+          << bytes << " bytes";
+  return ptr;
+}
+
+/* static */ void CUDADriver::DeviceDeallocate(CUcontext context,
+                                               void *location) {
+  ScopedActivateContext activation{context};
+  CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
+  CUresult res = dynload::cuMemFree_v2(pointer);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to free device memory at " << location
+               << "; result: " << ToString(res);
+  } else {
+    VLOG(2) << "deallocated " << location << " for context " << context;
+  }
+}
+
+/* static */ void *CUDADriver::HostAllocate(CUcontext context, uint64 bytes) {
+  ScopedActivateContext activation{context};
+  void *host_mem = nullptr;
+  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
+  CUresult res =
+      dynload::cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to alloc " << bytes
+               << " bytes on host: " << ToString(res);
+  }
+  return host_mem;
+}
+
+/* static */ void CUDADriver::HostDeallocate(CUcontext context,
+                                             void *location) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemFreeHost(location);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "error deallocating host memory at " << location << ": "
+               << ToString(res);
+  }
+}
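An illustrative pairing of the allocation entry points above (hypothetical helper, not part of the patch):

void AllocationRoundTrip(CUcontext context) {
  constexpr uint64 kBytes = 1ULL << 20;  // 1 MiB
  void *device_mem = CUDADriver::DeviceAllocate(context, kBytes);
  if (device_mem == nullptr) {
    return;  // failure already logged by DeviceAllocate
  }
  // Pinned, portable host staging buffer for DMA transfers.
  void *host_mem = CUDADriver::HostAllocate(context, kBytes);
  if (host_mem != nullptr) {
    // ... stage copies between host_mem and device_mem ...
    CUDADriver::HostDeallocate(context, host_mem);
  }
  CUDADriver::DeviceDeallocate(context, device_mem);
}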
+
+/* static */ bool CUDADriver::HostRegister(CUcontext context, void *location,
+                                           uint64 bytes) {
+  ScopedActivateContext activation{context};
+  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
+  CUresult res =
+      dynload::cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "error registering host memory at " << location << ": "
+               << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool CUDADriver::HostUnregister(CUcontext context,
+                                             void *location) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemHostUnregister(location);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "error unregistering host memory at " << location << ": "
+               << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ port::Status CUDADriver::DestroyEvent(CUcontext context,
+                                                   CUevent *event) {
+  if (*event == nullptr) {
+    return port::Status{port::error::INVALID_ARGUMENT,
+                        "input event cannot be null"};
+  }
+
+  ScopedActivateContext activated{context};
+  CUresult res = dynload::cuEventDestroy_v2(*event);
+  *event = nullptr;
+
+  switch (res) {
+    case CUDA_SUCCESS:
+      return port::Status::OK();
+    case CUDA_ERROR_DEINITIALIZED:
+    case CUDA_ERROR_NOT_INITIALIZED:
+      return port::Status{
+          port::error::FAILED_PRECONDITION,
+          port::Printf("error destroying CUDA event in context %p: %s",
+                       context, ToString(res).c_str())};
+    default:
+      return port::Status{
+          port::error::INTERNAL,
+          port::Printf("error destroying CUDA event in context %p: %s",
+                       context, ToString(res).c_str())};
+  }
+}
+
+/* static */ port::Status CUDADriver::RecordEvent(CUcontext context,
+                                                  CUevent event,
+                                                  CUstream stream) {
+  ScopedActivateContext activated{context};
+  CUresult res = dynload::cuEventRecord(event, stream);
+  switch (res) {
+    case CUDA_SUCCESS:
+      return port::Status::OK();
+    case CUDA_ERROR_DEINITIALIZED:
+    case CUDA_ERROR_NOT_INITIALIZED:
+      return port::Status{
+          port::error::FAILED_PRECONDITION,
+          port::Printf("error recording CUDA event on stream %p: %s", stream,
+                       ToString(res).c_str())};
+    default:
+      return port::Status{
+          port::error::INVALID_ARGUMENT,
+          port::Printf("error recording CUDA event on stream %p: %s", stream,
+                       ToString(res).c_str())};
+  }
+}
+
+/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(
+    CUcontext context, CUevent event) {
+  ScopedActivateContext activated{context};
+  CUresult res = dynload::cuEventQuery(event);
+  if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
+    return port::Status{
+        port::error::INTERNAL,
+        port::Printf("failed to query event: %s", ToString(res).c_str())};
+  }
+
+  return res;
+}
+
+/* static */ bool CUDADriver::GetEventElapsedTime(CUcontext context,
+                                                  float *elapsed_milliseconds,
+                                                  CUevent start,
+                                                  CUevent stop) {
+  ScopedActivateContext activated{context};
+  CUresult res =
+      dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to get elapsed time between events: "
+               << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool CUDADriver::WaitStreamOnEvent(CUcontext context,
+                                                CUstream stream,
+                                                CUevent event) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuStreamWaitEvent(stream, event, 0 /* = flags */);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "could not wait stream on event: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
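A sketch of GPU-side interval timing with the event routines above (illustrative; CreateEvent is defined further below, and Status returns are checked only where the sketch needs them):

bool TimeEnqueuedWork(CUcontext context, CUstream stream, float *elapsed_ms) {
  CUevent start = nullptr;
  CUevent stop = nullptr;
  bool ok =
      CUDADriver::CreateEvent(context, &start, EventFlags::kDefault).ok() &&
      CUDADriver::CreateEvent(context, &stop, EventFlags::kDefault).ok();
  ok = ok && CUDADriver::RecordEvent(context, start, stream).ok();
  // ... enqueue the work to be timed on `stream` here ...
  ok = ok && CUDADriver::RecordEvent(context, stop, stream).ok();
  ok = ok && CUDADriver::SynchronizeStream(context, stream);
  ok = ok && CUDADriver::GetEventElapsedTime(context, elapsed_ms, start, stop);
  // DestroyEvent tolerates never-created (null) events by returning an error
  // status, which this sketch deliberately ignores.
  CUDADriver::DestroyEvent(context, &start);
  CUDADriver::DestroyEvent(context, &stop);
  return ok;
}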
+
+/* static */ bool CUDADriver::SynchronizeContext(CUcontext context) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuCtxSynchronize();
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronizeStream(CUcontext context,
+                                                CUstream stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  CUresult res = dynload::cuStreamSynchronize(stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "could not synchronize on CUDA stream: " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
+    return false;
+  }
+  VLOG(2) << "successfully synchronized stream " << stream << " on context "
+          << context;
+  return true;
+}
+
+/* static */ bool CUDADriver::IsStreamIdle(CUcontext context,
+                                           CUstream stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  CUresult res = dynload::cuStreamQuery(stream);
+  if (res == CUDA_SUCCESS) {
+    return true;
+  }
+
+  if (res != CUDA_ERROR_NOT_READY) {
+    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
+  }
+  return false;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyD2H(CUcontext context,
+                                                   void *host_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyDtoH_v2(host_dst, gpu_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to synchronously memcpy from device to host: %s; "
+        "host dst: %p; GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src),
+        size, size);
+    return false;
+  }
+  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
+          << host_dst;
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyH2D(CUcontext context,
+                                                   CUdeviceptr gpu_dst,
+                                                   const void *host_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyHtoD_v2(gpu_dst, host_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to synchronously memcpy from host to device: %s; GPU dst: %p;"
+        " host src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src,
+        size, size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyD2D(CUcontext context,
+                                                   CUdeviceptr gpu_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyDtoD_v2(gpu_dst, gpu_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to synchronously memcpy from device to device: %s; "
+        "GPU dst: %p; GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst),
+        port::bit_cast<void *>(gpu_src), size, size);
+    return false;
+  }
+  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ bool CUDADriver::AsynchronousMemcpyD2H(CUcontext context,
+                                                    void *host_dst,
+                                                    CUdeviceptr gpu_src,
+                                                    uint64 size,
+                                                    CUstream stream) {
+  ScopedActivateContext activation{context};
+  CUresult res =
+      dynload::cuMemcpyDtoHAsync_v2(host_dst, gpu_src, size, stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to enqueue async memcpy from device to host: %s; "
+        "host dst: %p; GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src),
+        size, size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
+          << " bytes from " << port::bit_cast<void *>(gpu_src) << " to "
+          << host_dst << " on stream " << stream;
+  return true;
+}
+
+/* static */ bool CUDADriver::AsynchronousMemcpyH2D(CUcontext context,
+                                                    CUdeviceptr gpu_dst,
+                                                    const void *host_src,
+                                                    uint64 size,
+                                                    CUstream stream) {
+  ScopedActivateContext activation{context};
+  CUresult res =
+      dynload::cuMemcpyHtoDAsync_v2(gpu_dst, host_src, size, stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to enqueue async memcpy from host to device: %s; "
+        "GPU dst: %p; host src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src,
+        size, size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
+          << " on stream " << stream;
+  return true;
+}
+
+/* static */ bool CUDADriver::AsynchronousMemcpyD2D(CUcontext context,
+                                                    CUdeviceptr gpu_dst,
+                                                    CUdeviceptr gpu_src,
+                                                    uint64 size,
+                                                    CUstream stream) {
+  ScopedActivateContext activation{context};
+  CUresult result =
+      dynload::cuMemcpyDtoDAsync_v2(gpu_dst, gpu_src, size, stream);
+  if (result != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to enqueue async memcpy from device to device: %s"
+        "; GPU dst: %p on %s %s"
+        "; GPU src: %p on %s %s"
+        "; can access? %s; size: %llu=0x%llx",
+        ToString(result).c_str(), port::bit_cast<void *>(gpu_dst),
+        CUDAPointerToMemorySpaceString(gpu_dst).c_str(),
+        CUDAPointerToDeviceString(gpu_dst).c_str(),
+        port::bit_cast<void *>(gpu_src),
+        CUDAPointerToMemorySpaceString(gpu_src).c_str(),
+        CUDAPointerToDeviceString(gpu_src).c_str(),
+        CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ port::Status CUDADriver::CreateEvent(CUcontext context,
+                                                  CUevent *result,
+                                                  EventFlags flags) {
+  int cuflags;
+  switch (flags) {
+    case EventFlags::kDefault:
+      cuflags = CU_EVENT_DEFAULT;
+      break;
+    case EventFlags::kDisableTiming:
+      cuflags = CU_EVENT_DISABLE_TIMING;
+      break;
+    default:
+      LOG(FATAL) << "impossible event flags: " << int(flags);
+  }
+
+  ScopedActivateContext activated{context};
+  CUresult res = dynload::cuEventCreate(result, cuflags);
+
+  if (res == CUDA_SUCCESS) {
+    return port::Status::OK();
+  } else if (res == CUDA_ERROR_OUT_OF_MEMORY) {
+    return port::Status{port::error::RESOURCE_EXHAUSTED,
+                        "could not create CUDA event: out of device memory"};
+  } else {
+    return port::Status{
+        port::error::FAILED_PRECONDITION,
+        port::StrCat("could not create CUDA event: ", ToString(res))};
+  }
+}
+
+/* static */ int CUDADriver::GetDeviceCount() {
+  int device_count = 0;
+  CUresult res = dynload::cuDeviceGetCount(&device_count);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
+    return 0;
+  }
+
+  if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) {
+    device_count = 1;
+  }
+  return device_count;
+}
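A round-trip sketch using the asynchronous copies above (illustrative, not part of the patch); the key point is that enqueue success says nothing about completion, so the host buffers stay untouchable until the stream drains:

bool AsyncRoundTrip(CUcontext context, CUstream stream, CUdeviceptr scratch,
                    const void *host_src, void *host_dst, uint64 size) {
  if (!CUDADriver::AsynchronousMemcpyH2D(context, scratch, host_src, size,
                                         stream) ||
      !CUDADriver::AsynchronousMemcpyD2H(context, host_dst, scratch, size,
                                         stream)) {
    return false;
  }
  // Only after this synchronization are host_src/host_dst safe to reuse.
  return CUDADriver::SynchronizeStream(context, stream);
}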
+
+/* static */ port::StatusOr<CUcontext> CUDADriver::GetPointerContext(
+    CUdeviceptr pointer) {
+  CUcontext context = nullptr;
+  CUresult result = dynload::cuPointerGetAttribute(
+      &context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
+  if (result == CUDA_SUCCESS) {
+    CHECK(context != nullptr) << "success should entail non-null context";
+    return context;
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      port::StrCat("failed to query device pointer for context: ",
+                   ToString(result))};
+}
+
+/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace(
+    CUdeviceptr pointer) {
+  unsigned int value;
+  CUresult result = dynload::cuPointerGetAttribute(
+      &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
+  if (result == CUDA_SUCCESS) {
+    switch (value) {
+      case CU_MEMORYTYPE_DEVICE:
+        return MemorySpace::kDevice;
+      case CU_MEMORYTYPE_HOST:
+        return MemorySpace::kHost;
+      default:
+        return port::Status{
+            port::error::INTERNAL,
+            port::StrCat("unknown memory space provided by CUDA API: ",
+                         value)};
+    }
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      port::StrCat("failed to query device pointer for memory space: ",
+                   ToString(result))};
+}
+
+/* static */ port::Status CUDADriver::GetPointerAddressRange(
+    CUdeviceptr dptr, CUdeviceptr *base, size_t *size) {
+  CUresult result = dynload::cuMemGetAddressRange(base, size, dptr);
+  if (result == CUDA_SUCCESS) {
+    return port::Status::OK();
+  } else if (result == CUDA_ERROR_NOT_FOUND) {
+    // We differentiate between "this pointer is unknown" (return here) and
+    // "there was an internal error while performing this operation" (return
+    // below).
+    return port::Status{
+        port::error::NOT_FOUND,
+        port::Printf("not a device pointer %p; %s",
+                     reinterpret_cast<void *>(dptr),
+                     ToString(result).c_str())};
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      port::Printf("failed to get pointer info for device pointer %p; %s",
+                   reinterpret_cast<void *>(dptr), ToString(result).c_str())};
+}
+
+/* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice(
+    CUdeviceptr pointer) {
+  auto result = GetPointerContext(pointer);
+  if (!result.ok()) {
+    return result.status();
+  }
+
+  return DeviceFromContext(result.ValueOrDie());
+}
+
+/* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major,
+                                                           int *cc_minor,
+                                                           CUdevice device) {
+  *cc_major = 0;
+  *cc_minor = 0;
+  CUresult result =
+      dynload::cuDeviceComputeCapability(cc_major, cc_minor, device);
+  if (result == CUDA_SUCCESS) {
+    return port::Status::OK();
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      port::Printf("failed to get compute capability for device: %s; %d",
+                   ToString(result).c_str(), device)};
+}
+
+// Helper function that turns the integer output of cuDeviceGetAttribute into
+// type T and wraps it in a StatusOr.
+template <typename T>
+static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
+                                            CUdevice_attribute attribute) {
+  int value = -1;
+  CUresult result = dynload::cuDeviceGetAttribute(&value, attribute, device);
+  if (result != CUDA_SUCCESS) {
+    return port::Status{
+        port::error::NOT_FOUND,
+        port::StrCat("could not retrieve CUDA device attribute (", attribute,
+                     "): ", ToString(result))};
+  }
+  T converted = value;
+  return converted;
+}
+
+/* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount(
+    CUdevice device) {
+  return GetSimpleAttribute<int>(device,
+                                 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
+}
+
+/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore(
+    CUdevice device) {
+  return GetSimpleAttribute<int64>(
+      device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
+}
+
+/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock(
+    CUdevice device) {
+  return GetSimpleAttribute<int64>(
+      device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
+}
+
+/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor(
+    CUdevice device) {
+  return GetSimpleAttribute<int64>(
+      device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
+}
+
+/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock(
+    CUdevice device) {
+  return GetSimpleAttribute<int64>(device,
+                                   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
+}
+
+/* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock(
+    CUdevice device) {
+  return GetSimpleAttribute<int64>(
+      device, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
+}
+
+/* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp(
+    CUdevice device) {
+  return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
+}
+
+/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z,
+                                            CUdevice device) {
+  int value;
+  CUresult res = dynload::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
+    return false;
+  }
+  *x = value;
+
+  res = dynload::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
+    return false;
+  }
+  *y = value;
+
+  res = dynload::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
+    return false;
+  }
+  *z = value;
+  return true;
+}
+
+/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) {
+  CUresult res = dynload::cuDriverGetVersion(driver_version);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to query driver version: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool CUDADriver::GetDeviceProperties(
+    CUdevprop *device_properties, int device_ordinal) {
+  CUresult res =
+      dynload::cuDeviceGetProperties(device_properties, device_ordinal);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to query device properties: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
+  int value = -1;
+  CUresult res = dynload::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
+    return false;
+  }
+
+  *result = value;
+  return true;
+}
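An illustrative StatusOr-style consumption of the attribute getters above (hypothetical logging helper, not part of the patch):

void LogDeviceShape(CUdevice device) {
  auto sm_count = CUDADriver::GetMultiprocessorCount(device);
  auto warp_size = CUDADriver::GetThreadsPerWarp(device);
  if (!sm_count.ok() || !warp_size.ok()) {
    LOG(ERROR) << "device attribute query failed";
    return;
  }
  LOG(INFO) << "multiprocessors: " << sm_count.ValueOrDie()
            << "; warp size: " << warp_size.ValueOrDie();
}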
+
+/* static */ bool CUDADriver::GetDeviceMemoryInfo(CUcontext context,
+                                                  int64 *free_out,
+                                                  int64 *total_out) {
+  ScopedActivateContext activation{context};
+  size_t free = 0;
+  size_t total = 0;
+  CUresult res = dynload::cuMemGetInfo_v2(&free, &total);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to query device memory info: " << ToString(res);
+    return false;
+  }
+
+  *free_out = free;
+  *total_out = total;
+  return true;
+}
+
+/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device,
+                                                   uint64 *result) {
+  size_t value = -1;
+  CUresult res = dynload::cuDeviceTotalMem_v2(&value, device);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to query total available memory: " << ToString(res);
+    return false;
+  }
+
+  *result = value;
+  return true;
+}
+
+/* static */ string CUDADriver::GetPCIBusID(CUdevice device) {
+  string pci_bus_id;
+  static const int kBufferSize = 64;
+  port::InlinedVector<char, 4> chars(kBufferSize);
+  chars[kBufferSize - 1] = '\0';
+  CUresult res =
+      dynload::cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
+    return pci_bus_id;
+  }
+  pci_bus_id = chars.begin();
+  return pci_bus_id;
+}
+
+/* static */ bool CUDADriver::CanEnablePeerAccess(CUcontext from,
+                                                  CUcontext to) {
+  if (from == to) {
+    return true;  // A context can always access its own memory.
+  }
+
+  int can_access_peer = -1;
+  auto from_device = DeviceFromContext(from);
+  if (!from_device.ok()) {
+    LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
+               << from_device.status();
+    return false;
+  }
+  auto to_device = DeviceFromContext(to);
+  if (!to_device.ok()) {
+    LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
+               << to_device.status();
+    return false;
+  }
+  CUresult res = dynload::cuDeviceCanAccessPeer(
+      &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
+    return false;
+  }
+
+  return can_access_peer;
+}
+
+/* static */ port::Status CUDADriver::EnablePeerAccess(CUcontext from,
+                                                       CUcontext to) {
+  if (from == to) {
+    return port::Status::OK();  // A context can always access its own memory.
+  }
+
+  ScopedActivateContext activated{from};
+  CUresult result = dynload::cuCtxEnablePeerAccess(to, 0 /* = flags */);
+  if (result != CUDA_SUCCESS &&
+      result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
+    return port::Status{
+        port::error::INTERNAL,
+        port::Printf("failed to enable peer access from %p to %p: %s", from,
+                     to, ToString(result).c_str())};
+  }
+
+  return port::Status::OK();
+}
+
+/* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore(
+    CUcontext context, CUfunction kernel, int threads_per_block,
+    size_t dynamic_shared_memory_bytes) {
+  ScopedActivateContext activation{context};
+
+  int max_blocks;
+  CUresult result = dynload::cuOccupancyMaxActiveBlocksPerMultiprocessor(
+      &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
+  if (result != CUDA_SUCCESS) {
+    return port::Status{
+        port::error::INTERNAL,
+        port::Printf("failed to calculate occupancy of kernel %p: %s", kernel,
+                     ToString(result).c_str())};
+  }
+
+  return max_blocks;
+}
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
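Finally, an illustrative occupancy query against a JIT-loaded kernel (hypothetical call site for GetMaxOccupiedBlocksPerCore, not part of the patch):

void LogOccupancy(CUcontext context, CUfunction kernel) {
  constexpr int kThreadsPerBlock = 256;
  auto blocks = CUDADriver::GetMaxOccupiedBlocksPerCore(
      context, kernel, kThreadsPerBlock, /*dynamic_shared_memory_bytes=*/0);
  if (blocks.ok()) {
    LOG(INFO) << "max resident blocks per SM at " << kThreadsPerBlock
              << " threads/block: " << blocks.ValueOrDie();
  }
}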