#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_
#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/public/tensor.h"
#include "tensorflow/core/public/status.h"
#include "tensorflow/core/common_runtime/gpu/dma_helper.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/stream.h"
namespace tensorflow {
class RecvTensorResponse;
class TensorProto;
namespace gpu = ::perftools::gputools;
// Static helpers for moving tensor data between GPU and CPU memory,
// serializing GPU-resident tensors, synchronizing GPU streams, and
// debugging GPU-resident tensor contents.
class GPUUtil {
 public:
  // "tensor" is GPU-local. "dev" is the hosting GPU.
  // "device_context" should be the context of the GPU "_Send" op
  // which provides the Tensor.
  // Sets all necessary fields of "proto" by transferring value
  // bytes from GPU to CPU RAM. "is_dead" indicates that the
  // tensor is dead with an uninitialized value.
  // "done" is invoked with the final status once the transfer and
  // proto population have completed (or failed).
  static void SetProtoFromGPU(const Tensor& tensor, Device* dev,
                              const DeviceContext* device_context,
                              TensorProto* proto, bool is_dead,
                              StatusCallback done);

  // Copies "input" to "output" between devices accessible to the
  // local process via some DMA-like method. "edge_name" is the name
  // of the tensor being copied, for debugging purposes. Depending on
  // the type of devices and memory in use, the copy may be performed
  // synchronously or asynchronously. 'done' will be invoked only
  // after the copy is actually complete.
  static void CopyViaDMA(const string& edge_name,
                         DeviceContext* send_dev_context,
                         DeviceContext* recv_dev_context, Device* src,
                         Device* dst, const AllocatorAttributes src_alloc_attr,
                         const AllocatorAttributes dst_alloc_attr,
                         const Tensor* input, Tensor* output,
                         StatusCallback done);

  // Copies the data in 'gpu_tensor' into 'cpu_tensor'.
  // 'gpu_tensor''s backing memory must be on 'gpu_device' and
  // 'cpu_tensor' must be allocated to be of the same size as
  // 'gpu_tensor'. Synchronous: may block.
  // NOTE(review): documented as synchronous yet it takes a 'done'
  // StatusCallback — confirm whether completion is signaled via the
  // callback, via blocking, or both.
  static void CopyGPUTensorToCPU(Device* gpu_device,
                                 const DeviceContext* device_context,
                                 const Tensor* gpu_tensor, Tensor* cpu_tensor,
                                 StatusCallback done);

  // Blocks until all operations queued on the stream associated with
  // "gpu_device" at the time of the call have completed. Returns any
  // error pending on the stream at completion.
  static Status Sync(Device* gpu_device);

  // Blocks until all operations queued on all streams associated with the
  // corresponding GPU device at the time of call have completed.
  // Returns any error pending on the stream at completion.
  static Status SyncAll(Device* gpu_device);

  // For debugging purposes, given a "device" and a "tensor" allocated
  // on the device, return a string printing each byte in the tensor
  // (up to a limit). "device" can be either a CPU or a GPU device.
  static string MemoryDebugString(const Device* device, Tensor* tensor);

  // Reinterprets the flat contents of "t" as a DeviceMemory<float>
  // view; no data is copied.
  static perftools::gputools::DeviceMemory<float> AsGPUFloat(const Tensor& t);

  // Computes a checksum over the contents of "tensor", which is allocated
  // on "gpu_device".
  static uint64 Checksum(Device* gpu_device,
                         const DeviceContext* device_context,
                         const Tensor& tensor);

  // Computes a checksum over the contents of "tensor", which is allocated
  // in local CPU RAM.
  static uint64 Checksum(const Tensor& tensor);

  // Copies the data in 'cpu_tensor' into 'gpu_tensor', whose backing
  // memory must be on 'gpu_device'. 'done' is invoked with the final
  // status once the copy has completed (or failed).
  static void CopyCPUTensorToGPU(const Tensor* cpu_tensor,
                                 const DeviceContext* device_context,
                                 Device* gpu_device, Tensor* gpu_tensor,
                                 StatusCallback done);
};
} // namespace tensorflow
#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_