[SE] Try again to query the GPU driver for error descriptions

This code has been here since 2014; now the oldest supported version of CUDA is 8, so cuGetErrorName should always be available. Also, the list of errors is (of course) out of sync with upstream CUDA. Also surface the description of the error to the user, if available. PiperOrigin-RevId: 206191424
author: Benjamin Kramer <kramerb@google.com> 2018-07-26 11:25:44 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-07-26 11:29:38 -0700
commit: de31d0185b8ccb1882112dda0ac763f1c869b7a4 (patch)
tree: 41beda71b8113ed196422807682054b202d41c8d /tensorflow/stream_executor
parent: ecd8decac3d9f3c7cd772e1561b9c2d3f23aa830 (diff)
1 files changed, 8 insertions, 109 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index d508f6594a..dbece3adf9 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -102,117 +102,16 @@ class CreatedContexts {
 /* static */ int64 CreatedContexts::next_id_ = 1;  // 0 means "no context"
 
 // Formats CUresult to output prettified values into a log stream.
-// Error summaries taken from:
-// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9
-//
-// TODO(leary) switch to cuGetErrorName when updated cuda.h is available.
 string ToString(CUresult result) {
-#define OSTREAM_CUDA_ERROR(__name) \
-  case CUDA_ERROR_##__name:        \
-    return "CUDA_ERROR_" #__name;
-
-///////////////
-// NOTE: here we specify return code values outside of the enum explicitly
-// because our in-tree cuda.h is from the CUDA 5.5 SDK, but CUDA 6.0+ driver
-// libraries are deployed in the fleet these error codes are backwards
-// compatible, but if we see a "new" one, we want to be able to identify it in
-// the logs.
-//
-// Once we get a cuda.h that has cuGetErrorName (TODO is above) we can
-// eliminate this function and just rely on the driver to provide us these
-// strings.
-//
-// NOTE: "Must reboot all context" below is shorthand for, "must
-// destroy/recreate the offending context and any allocation which come from
-// it if you are to continue using CUDA."
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch"
-  switch (result) {
-    OSTREAM_CUDA_ERROR(INVALID_VALUE)
-    OSTREAM_CUDA_ERROR(OUT_OF_MEMORY)
-    OSTREAM_CUDA_ERROR(NOT_INITIALIZED)
-    OSTREAM_CUDA_ERROR(DEINITIALIZED)
-    OSTREAM_CUDA_ERROR(NO_DEVICE)
-    OSTREAM_CUDA_ERROR(INVALID_DEVICE)
-    OSTREAM_CUDA_ERROR(INVALID_IMAGE)
-    OSTREAM_CUDA_ERROR(INVALID_CONTEXT)
-    OSTREAM_CUDA_ERROR(INVALID_HANDLE)
-    OSTREAM_CUDA_ERROR(NOT_FOUND)
-    OSTREAM_CUDA_ERROR(NOT_READY)
-    OSTREAM_CUDA_ERROR(NO_BINARY_FOR_GPU)
-
-    // Encountered an uncorrectable ECC error during execution.
-    OSTREAM_CUDA_ERROR(ECC_UNCORRECTABLE)
-
-    // Load/store on an invalid address. Must reboot all context.
-    case 700:
-      return "CUDA_ERROR_ILLEGAL_ADDRESS";
-    // Passed too many / wrong arguments, too many threads for register count.
-    case 701:
-      return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
-    // Kernel took too long to execute.
-    case 702:
-      return "CUDA_ERROR_LAUNCH_TIMEOUT";
-    // Kernel launch uses an incompatible texturing mode.
-    case 703:
-      return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
-    // Trying to re-enable peer access that already has it enabled.
-    case 704:
-      return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
-    // Trying to disable peer access that has not yet been enabled.
-    case 705:
-      return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
-    // Primary context for the specified device has already been initialized.
-    case 708:
-      return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
-    // Context current to calling thread has been destroyed or is a primary
-    // context that has not yet been initialized.
-    case 709:
-      return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
-    // Device-side assert triggered during kernel execution. Must reboot all
-    // context.
-    case 710:
-      return "CUDA_ERROR_ASSERT";
-    // Hardware resources to enable peer access have been exhausted.
-    case 711:
-      return "CUDA_ERROR_TOO_MANY_PEERS";
-    // Memory range has already been registered.
-    case 712:
-      return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
-    // Pointer does not correspond to any currently registered memory region.
-    case 713:
-      return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
-    // Due to stack corruption or exceeding stack size limit. Must reboot all
-    // context.
-    case 714:
-      return "CUDA_ERROR_HARDWARE_STACK_ERROR";
-    case 715:
-      return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
-    // Load/store on an unaligned memory address. Must reboot all context.
-    case 716:
-      return "CUDA_ERROR_MISALIGNED_ADDRESS";
-    // Device instruction with specific address space given address not
-    // belonging to allowed address space. Must reboot all context.
-    case 717:
-      return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
-    // Device program counter wrapped its address space. Must reboot all
-    // context.
-    case 718:
-      return "CUDA_ERROR_INVALID_PC";
-    // Exception on device while executing a kernel; e.g. deref invalid device
-    // pointer, accessing OOB shared memory. Must reboot all context.
-    case 719:
-      return "CUDA_ERROR_LAUNCH_FAILED";
-
-      OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE)
-      OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED)
-      OSTREAM_CUDA_ERROR(NOT_PERMITTED)
-      OSTREAM_CUDA_ERROR(NOT_SUPPORTED)
-      OSTREAM_CUDA_ERROR(UNKNOWN)  // Unknown internal error to CUDA.
-    default:
-      return port::StrCat("CUresult(", static_cast<int>(result), ")");
+  const char *error_name;
+  if (cuGetErrorName(result, &error_name)) {
+    return port::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
+  }
+  const char *error_string;
+  if (cuGetErrorString(result, &error_string)) {
+    return error_name;
   }
-#pragma GCC diagnostic pop
+  return port::StrCat(error_name, ": ", error_string);
 }
 
 // Returns the current context and checks that it is in the set of CUDA contexts
author	Benjamin Kramer <kramerb@google.com>	2018-07-26 11:25:44 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-07-26 11:29:38 -0700
commit	de31d0185b8ccb1882112dda0ac763f1c869b7a4 (patch)
tree	41beda71b8113ed196422807682054b202d41c8d /tensorflow/stream_executor
parent	ecd8decac3d9f3c7cd772e1561b9c2d3f23aa830 (diff)