diff options
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_driver.cc | 117 |
1 files changed, 8 insertions, 109 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index d508f6594a..dbece3adf9 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -102,117 +102,16 @@ class CreatedContexts { /* static */ int64 CreatedContexts::next_id_ = 1; // 0 means "no context" // Formats CUresult to output prettified values into a log stream. -// Error summaries taken from: -// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9 -// -// TODO(leary) switch to cuGetErrorName when updated cuda.h is available. string ToString(CUresult result) { -#define OSTREAM_CUDA_ERROR(__name) \ - case CUDA_ERROR_##__name: \ - return "CUDA_ERROR_" #__name; - -/////////////// -// NOTE: here we specify return code values outside of the enum explicitly -// because our in-tree cuda.h is from the CUDA 5.5 SDK, but CUDA 6.0+ driver -// libraries are deployed in the fleet these error codes are backwards -// compatible, but if we see a "new" one, we want to be able to identify it in -// the logs. -// -// Once we get a cuda.h that has cuGetErrorName (TODO is above) we can -// eliminate this function and just rely on the driver to provide us these -// strings. -// -// NOTE: "Must reboot all context" below is shorthand for, "must -// destroy/recreate the offending context and any allocation which come from -// it if you are to continue using CUDA." -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch" - switch (result) { - OSTREAM_CUDA_ERROR(INVALID_VALUE) - OSTREAM_CUDA_ERROR(OUT_OF_MEMORY) - OSTREAM_CUDA_ERROR(NOT_INITIALIZED) - OSTREAM_CUDA_ERROR(DEINITIALIZED) - OSTREAM_CUDA_ERROR(NO_DEVICE) - OSTREAM_CUDA_ERROR(INVALID_DEVICE) - OSTREAM_CUDA_ERROR(INVALID_IMAGE) - OSTREAM_CUDA_ERROR(INVALID_CONTEXT) - OSTREAM_CUDA_ERROR(INVALID_HANDLE) - OSTREAM_CUDA_ERROR(NOT_FOUND) - OSTREAM_CUDA_ERROR(NOT_READY) - OSTREAM_CUDA_ERROR(NO_BINARY_FOR_GPU) - - // Encountered an uncorrectable ECC error during execution. - OSTREAM_CUDA_ERROR(ECC_UNCORRECTABLE) - - // Load/store on an invalid address. Must reboot all context. - case 700: - return "CUDA_ERROR_ILLEGAL_ADDRESS"; - // Passed too many / wrong arguments, too many threads for register count. - case 701: - return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; - // Kernel took too long to execute. - case 702: - return "CUDA_ERROR_LAUNCH_TIMEOUT"; - // Kernel launch uses an incompatible texturing mode. - case 703: - return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; - // Trying to re-enable peer access that already has it enabled. - case 704: - return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; - // Trying to disable peer access that has not yet been enabled. - case 705: - return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; - // Primary context for the specified device has already been initialized. - case 708: - return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; - // Context current to calling thread has been destroyed or is a primary - // context that has not yet been initialized. - case 709: - return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; - // Device-side assert triggered during kernel execution. Must reboot all - // context. - case 710: - return "CUDA_ERROR_ASSERT"; - // Hardware resources to enable peer access have been exhausted. - case 711: - return "CUDA_ERROR_TOO_MANY_PEERS"; - // Memory range has already been registered. - case 712: - return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; - // Pointer does not correspond to any currently registered memory region. - case 713: - return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; - // Due to stack corruption or exceeding stack size limit. Must reboot all - // context. - case 714: - return "CUDA_ERROR_HARDWARE_STACK_ERROR"; - case 715: - return "CUDA_ERROR_ILLEGAL_INSTRUCTION"; - // Load/store on an unaligned memory address. Must reboot all context. - case 716: - return "CUDA_ERROR_MISALIGNED_ADDRESS"; - // Device instruction with specific address space given address not - // belonging to allowed address space. Must reboot all context. - case 717: - return "CUDA_ERROR_INVALID_ADDRESS_SPACE"; - // Device program counter wrapped its address space. Must reboot all - // context. - case 718: - return "CUDA_ERROR_INVALID_PC"; - // Exception on device while executing a kernel; e.g. deref invalid device - // pointer, accessing OOB shared memory. Must reboot all context. - case 719: - return "CUDA_ERROR_LAUNCH_FAILED"; - - OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE) - OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED) - OSTREAM_CUDA_ERROR(NOT_PERMITTED) - OSTREAM_CUDA_ERROR(NOT_SUPPORTED) - OSTREAM_CUDA_ERROR(UNKNOWN) // Unknown internal error to CUDA. - default: - return port::StrCat("CUresult(", static_cast<int>(result), ")"); + const char *error_name; + if (cuGetErrorName(result, &error_name)) { + return port::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")"); + } + const char *error_string; + if (cuGetErrorString(result, &error_string)) { + return error_name; } -#pragma GCC diagnostic pop + return port::StrCat(error_name, ": ", error_string); } // Returns the current context and checks that it is in the set of CUDA contexts |