about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
-rw-r--r-- tensorflow/stream_executor/cuda/cuda_driver.cc 117
1 files changed, 8 insertions, 109 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index d508f6594a..dbece3adf9 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -102,117 +102,16 @@ class CreatedContexts {
/* static */ int64 CreatedContexts::next_id_ = 1; // 0 means "no context"
// Formats CUresult to output prettified values into a log stream.
-// Error summaries taken from:
-// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9
-//
-// TODO(leary) switch to cuGetErrorName when updated cuda.h is available.
string ToString(CUresult result) {
-#define OSTREAM_CUDA_ERROR(__name) \
- case CUDA_ERROR_##__name: \
- return "CUDA_ERROR_" #__name;
-
-///////////////
-// NOTE: here we specify return code values outside of the enum explicitly
-// because our in-tree cuda.h is from the CUDA 5.5 SDK, but CUDA 6.0+ driver
-// libraries are deployed in the fleet these error codes are backwards
-// compatible, but if we see a "new" one, we want to be able to identify it in
-// the logs.
-//
-// Once we get a cuda.h that has cuGetErrorName (TODO is above) we can
-// eliminate this function and just rely on the driver to provide us these
-// strings.
-//
-// NOTE: "Must reboot all context" below is shorthand for, "must
-// destroy/recreate the offending context and any allocation which come from
-// it if you are to continue using CUDA."
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch"
- switch (result) {
- OSTREAM_CUDA_ERROR(INVALID_VALUE)
- OSTREAM_CUDA_ERROR(OUT_OF_MEMORY)
- OSTREAM_CUDA_ERROR(NOT_INITIALIZED)
- OSTREAM_CUDA_ERROR(DEINITIALIZED)
- OSTREAM_CUDA_ERROR(NO_DEVICE)
- OSTREAM_CUDA_ERROR(INVALID_DEVICE)
- OSTREAM_CUDA_ERROR(INVALID_IMAGE)
- OSTREAM_CUDA_ERROR(INVALID_CONTEXT)
- OSTREAM_CUDA_ERROR(INVALID_HANDLE)
- OSTREAM_CUDA_ERROR(NOT_FOUND)
- OSTREAM_CUDA_ERROR(NOT_READY)
- OSTREAM_CUDA_ERROR(NO_BINARY_FOR_GPU)
-
- // Encountered an uncorrectable ECC error during execution.
- OSTREAM_CUDA_ERROR(ECC_UNCORRECTABLE)
-
- // Load/store on an invalid address. Must reboot all context.
- case 700:
- return "CUDA_ERROR_ILLEGAL_ADDRESS";
- // Passed too many / wrong arguments, too many threads for register count.
- case 701:
- return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
- // Kernel took too long to execute.
- case 702:
- return "CUDA_ERROR_LAUNCH_TIMEOUT";
- // Kernel launch uses an incompatible texturing mode.
- case 703:
- return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
- // Trying to re-enable peer access that already has it enabled.
- case 704:
- return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
- // Trying to disable peer access that has not yet been enabled.
- case 705:
- return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
- // Primary context for the specified device has already been initialized.
- case 708:
- return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
- // Context current to calling thread has been destroyed or is a primary
- // context that has not yet been initialized.
- case 709:
- return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
- // Device-side assert triggered during kernel execution. Must reboot all
- // context.
- case 710:
- return "CUDA_ERROR_ASSERT";
- // Hardware resources to enable peer access have been exhausted.
- case 711:
- return "CUDA_ERROR_TOO_MANY_PEERS";
- // Memory range has already been registered.
- case 712:
- return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
- // Pointer does not correspond to any currently registered memory region.
- case 713:
- return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
- // Due to stack corruption or exceeding stack size limit. Must reboot all
- // context.
- case 714:
- return "CUDA_ERROR_HARDWARE_STACK_ERROR";
- case 715:
- return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
- // Load/store on an unaligned memory address. Must reboot all context.
- case 716:
- return "CUDA_ERROR_MISALIGNED_ADDRESS";
- // Device instruction with specific address space given address not
- // belonging to allowed address space. Must reboot all context.
- case 717:
- return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
- // Device program counter wrapped its address space. Must reboot all
- // context.
- case 718:
- return "CUDA_ERROR_INVALID_PC";
- // Exception on device while executing a kernel; e.g. deref invalid device
- // pointer, accessing OOB shared memory. Must reboot all context.
- case 719:
- return "CUDA_ERROR_LAUNCH_FAILED";
-
- OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE)
- OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED)
- OSTREAM_CUDA_ERROR(NOT_PERMITTED)
- OSTREAM_CUDA_ERROR(NOT_SUPPORTED)
- OSTREAM_CUDA_ERROR(UNKNOWN) // Unknown internal error to CUDA.
- default:
- return port::StrCat("CUresult(", static_cast<int>(result), ")");
+ const char *error_name;
+ if (cuGetErrorName(result, &error_name)) {
+ return port::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
+ }
+ const char *error_string;
+ if (cuGetErrorString(result, &error_string)) {
+ return error_name;
}
-#pragma GCC diagnostic pop
+ return port::StrCat(error_name, ": ", error_string);
}
// Returns the current context and checks that it is in the set of CUDA contexts