diff options
Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_util.h')
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_util.h | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h new file mode 100644 index 0000000000..1d8c3a054d --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_util.h @@ -0,0 +1,89 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/common_runtime/gpu/dma_helper.h" +#include "tensorflow/stream_executor/device_memory.h" + +#include "tensorflow/stream_executor/stream.h" + +namespace tensorflow { + +class RecvTensorResponse; +class TensorProto; + +namespace gpu = ::perftools::gputools; + +class GPUUtil { + public: + // "tensor" is GPU-local. "dev" is the hosting GPU. + // "device_context" should be the context of the GPU "_Send" op + // which provides the Tensor. + // Sets all necessary fields of "proto" by transferring value + // bytes from GPU to CPU RAM. "is_dead" indicates that the + // tensor is dead with an uninitialized value. + // NOTE(review): "done" is presumably invoked with the final status once + // the transfer has finished -- confirm against the implementation. + static void SetProtoFromGPU(const Tensor& tensor, Device* dev, + const DeviceContext* device_context, + TensorProto* proto, bool is_dead, + StatusCallback done); + + // Copies "input" to "output" between devices accessible to the + // local process via some DMA-like method. "edge_name" is the name + // of the tensor being copied, for debugging purposes. Depending on + // the type of devices and memory in use, the copy may be performed + // synchronously or asynchronously. 'done' will be invoked only + // after the copy is actually complete. 
+ static void CopyViaDMA(const string& edge_name, + DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, + Device* dst, const AllocatorAttributes src_alloc_attr, + const AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, + StatusCallback done); + + // Copies the data in 'gpu_tensor' into 'cpu_tensor'. + // 'gpu_tensor''s backing memory must be on 'gpu_device' and + // 'cpu_tensor' must be allocated to be of the same size as + // 'gpu_tensor'. Synchronous: may block. + // NOTE(review): the signature takes a 'done' callback even though this + // comment says the call is synchronous -- confirm when 'done' is invoked. + // CopyCPUTensorToGPU, declared last in this class, mirrors this parameter + // list; presumably it copies in the opposite direction -- confirm. + static void CopyGPUTensorToCPU(Device* gpu_device, + const DeviceContext* device_context, + const Tensor* gpu_tensor, Tensor* cpu_tensor, + StatusCallback done); + + // Blocks until all operations queued on the stream associated with + // "gpu_device" at the time of the call have completed. Returns any + // error pending on the stream at completion. + static Status Sync(Device* gpu_device); + + // Blocks until all operations queued on all streams associated with the + // corresponding GPU device at the time of the call have completed. + // Returns any error pending on those streams at completion. + static Status SyncAll(Device* gpu_device); + + // For debugging purposes: given a "device" and a "tensor" allocated + // on the device, returns a string printing each byte in the tensor + // (up to a limit). "device" can be either a CPU or a GPU device. + static string MemoryDebugString(const Device* device, Tensor* tensor); + + // Returns a DeviceMemory<float> for "t". + // NOTE(review): presumably wraps the existing buffer of "t" without copying + // and expects "t" to hold float data in GPU memory -- confirm. + static perftools::gputools::DeviceMemory<float> AsGPUFloat(const Tensor& t); + + // Computes a checksum over the contents of "tensor", which is allocated + // on "gpu_device". + static uint64 Checksum(Device* gpu_device, + const DeviceContext* device_context, + const Tensor& tensor); + + // Computes a checksum over the contents of "tensor", which is allocated + // in local CPU RAM. 
+ static uint64 Checksum(const Tensor& tensor); + + static void CopyCPUTensorToGPU(const Tensor* cpu_tensor, + const DeviceContext* device_context, + Device* gpu_device, Tensor* gpu_tensor, + StatusCallback done); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_ |