blob: 005b3dc3101a0d4ab6f4801ae020bbb5b78cb41d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
#include <tuple>
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
namespace perftools {
namespace gputools {
namespace cuda {
// A parsed driver version, stored as a two-element integer tuple. The
// elements are the dot-separated components of the version string in order,
// e.g. DriverVersion{331, 79} for "331.79".
using DriverVersion = std::tuple<int, int>;
// Converts a parsed driver version to string form.
// NOTE(review): the exact output format is defined in the implementation
// file; presumably it mirrors the "331.79" form accepted by
// StringToDriverVersion -- confirm there.
string DriverVersionToString(DriverVersion version);
// Converts a parsed driver version or status value to natural string form.
// The argument is a StatusOr, so callers can pass the result of a version
// lookup straight through without unwrapping it first.
string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
// The result is wrapped in a StatusOr; presumably a value that cannot be
// parsed is reported as an error status rather than a version -- confirm
// against the implementation.
port::StatusOr<DriverVersion> StringToDriverVersion(const string &value);
// Collection of static helpers that inspect the CUDA driver setup of the
// current host and log what they find. Every member declared here is static.
class Diagnostician {
 public:
  // Emits diagnostic log output for the situation where CUDA appears to be
  // misconfigured (for example, it is not initializing).
  //
  // Note: on a machine that has no GPUs the output should stay small -- not
  // much beyond "looks like there's no CUDA kernel module running".
  //
  // Note: the non-Google-File:: API is used here on purpose, because this may
  // be called before InitGoogle has completed.
  static void LogDiagnosticInformation();

  // Finds the kernel module version inside the supplied contents of the driver
  // version file and returns it as a parsed DriverVersion wrapped in a
  // StatusOr.
  //
  // Exists purely to make log messages more informative when the user's
  // machine happens to have a libcuda/kernel driver mismatch.
  static port::StatusOr<DriverVersion> FindKernelModuleVersion(
      const string &driver_version_file_contents);

  // Extracts the kernel driver version from the current host.
  static port::StatusOr<DriverVersion> FindKernelDriverVersion();

  // Iterates through the loaded DSOs with DlIteratePhdrCallback to find the
  // version number of the driver-interfacing DSO. The version is returned as a
  // parsed DriverVersion wrapped in a StatusOr.
  static port::StatusOr<DriverVersion> FindDsoVersion();

  // Logs information about both the kernel driver version and the userspace
  // driver library version.
  static void LogDriverVersionInformation();

 private:
  // Logs information about the nvidia-related kernel modules that are loaded.
  static void LogKernelModuleInformation();

  // Compares the DSO version with the kernel driver version and warns the user
  // when the two are incompatible. Both versions arrive as StatusOr values.
  // NOTE(review): how an error status in either argument is treated is decided
  // in the implementation file -- confirm there.
  //
  // Exists purely to make log messages more informative when the user's
  // machine happens to have a libcuda/kernel driver mismatch.
  static void WarnOnDsoKernelMismatch(
      port::StatusOr<DriverVersion> dso_version,
      port::StatusOr<DriverVersion> kernel_version);

  // Logs information about the dev nodes present on this machine: whether
  // they exist, their permissions, and whether they are accessible from this
  // uid/gid.
  static void LogDevNodeDiagnosticInformation();

  // Returns the path of the dev node with the given ordinal.
  // NOTE(review): the path format is defined in the implementation file --
  // confirm there.
  static string GetDevNodePath(int dev_node_ordinal);

  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
};
} // namespace cuda
} // namespace gputools
} // namespace perftools
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
|