aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/cuda/cuda_diagnostics.h
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/stream_executor/cuda/cuda_diagnostics.h')
-rw-r--r--tensorflow/stream_executor/cuda/cuda_diagnostics.h85
1 files changed, 85 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.h b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
new file mode 100644
index 0000000000..005b3dc310
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
@@ -0,0 +1,85 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
+
+#include <tuple>
+
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+// Two-integer driver version, e.g. "331.79" parses to DriverVersion{331, 79}.
+using DriverVersion = std::tuple<int, int>;
+
+// Returns the string form of an already-parsed DriverVersion.
+string DriverVersionToString(DriverVersion version);
+
+// Converts either a parsed driver version or a status value to string form.
+string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
+
+// Parses a string of a form like "331.79" into DriverVersion{331, 79}.
+port::StatusOr<DriverVersion> StringToDriverVersion(const string &value);
+
+// Static helpers for diagnosing CUDA driver/kernel-module setup problems.
+class Diagnostician {
+ public:
+ // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
+ // not initializing).
+ //
+ // Note: on a machine that has no GPUs, we don't want to produce very much
+ // log spew beyond saying, "looks like there's no CUDA kernel module running".
+ //
+ // Note: we use non-Google-File:: API here because we may be called before
+ // InitGoogle has completed.
+ static void LogDiagnosticInformation();
+
+ // Given the driver version file contents, finds the kernel module version and
+ // returns it as a DriverVersion (wrapped in a StatusOr).
+ //
+ // This is solely used for more informative log messages when the user is
+ // running on a machine that happens to have a libcuda/kernel driver mismatch.
+ static port::StatusOr<DriverVersion> FindKernelModuleVersion(
+ const string &driver_version_file_contents);
+
+ // Extracts the kernel driver version from the current host.
+ static port::StatusOr<DriverVersion> FindKernelDriverVersion();
+
+ // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
+ // driver-interfacing DSO version number. Returns it as a DriverVersion.
+ static port::StatusOr<DriverVersion> FindDsoVersion();
+
+ // Logs information about the kernel driver version and userspace driver
+ // library version.
+ static void LogDriverVersionInformation();
+
+ private:
+ // Logs information about the loaded nvidia-related kernel modules.
+ static void LogKernelModuleInformation();
+
+ // Given the DSO version number and the kernel driver version, each wrapped in
+ // a StatusOr, compares the two, warning the user in the case of
+ // incompatibility.
+ //
+ // This is solely used for more informative log messages when the user is
+ // running on a machine that happens to have a libcuda/kernel driver mismatch.
+ static void WarnOnDsoKernelMismatch(
+ port::StatusOr<DriverVersion> dso_version,
+ port::StatusOr<DriverVersion> kernel_version);
+
+ // Logs information about the dev nodes present on this machine: their
+ // existence, permissions, accessibility from this uid/gid.
+ static void LogDevNodeDiagnosticInformation();
+
+ // NOTE(review): presumably builds the dev node path for the given ordinal.
+ static string GetDevNodePath(int dev_node_ordinal);
+
+ SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
+};
+
+} // namespace cuda
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_