aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/cuda/cuda_diagnostics.h
blob: 005b3dc3101a0d4ab6f4801ae020bbb5b78cb41d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_

#include <tuple>

#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"

namespace perftools {
namespace gputools {
namespace cuda {

// e.g. DriverVersion{331, 79}
using DriverVersion = std::tuple<int, int>;

// Converts a parsed driver version to string form.
string DriverVersionToString(DriverVersion version);

// Converts a parsed driver version or status value to natural string form.
string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);

// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
port::StatusOr<DriverVersion> StringToDriverVersion(const string &value);

class Diagnostician {
 public:
  // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
  // not initializing).
  //
  // Note: if we're running on a machine that has no GPUs, we don't want to
  // produce very much log spew beyond saying, "looks like there's no CUDA
  // kernel
  // module running".
  //
  // Note: we use non-Google-File:: API here because we may be called before
  // InitGoogle has completed.
  static void LogDiagnosticInformation();

  // Given the driver version file contents, finds the kernel module version and
  // returns it as a string.
  //
  // This is solely used for more informative log messages when the user is
  // running on a machine that happens to have a libcuda/kernel driver mismatch.
  static port::StatusOr<DriverVersion> FindKernelModuleVersion(
      const string &driver_version_file_contents);

  // Extracts the kernel driver version from the current host.
  static port::StatusOr<DriverVersion> FindKernelDriverVersion();

  // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
  // driver-interfacing DSO version number. Returns it as a string.
  static port::StatusOr<DriverVersion> FindDsoVersion();

  // Logs information about the kernel driver version and userspace driver
  // library version.
  static void LogDriverVersionInformation();

 private:
  // Logs information about the loaded nvidia-related kernel modules.
  static void LogKernelModuleInformation();

  // Given the DSO version number and the driver version file contents, extracts
  // the driver version and compares, warning the user in the case of
  // incompatability.
  //
  // This is solely used for more informative log messages when the user is
  // running on a machine that happens to have a libcuda/kernel driver mismatch.
  static void WarnOnDsoKernelMismatch(
      port::StatusOr<DriverVersion> dso_version,
      port::StatusOr<DriverVersion> kernel_version);

  // Logs information about the dev nodes present on this machine: their
  // existence, permissions, accessibility from this uid/gid.
  static void LogDevNodeDiagnosticInformation();

  static string GetDevNodePath(int dev_node_ordinal);

  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
};

}  // namespace cuda
}  // namespace gputools
}  // namespace perftools

#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_