1 files changed, 89 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
new file mode 100644
index 0000000000..1d8c3a054d
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -0,0 +1,89 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/common_runtime/gpu/dma_helper.h"
+#include "tensorflow/stream_executor/device_memory.h"
+
+#include "tensorflow/stream_executor/stream.h"
+
+namespace tensorflow {
+
+class RecvTensorResponse;
+class TensorProto;
+
+namespace gpu = ::perftools::gputools;
+
+class GPUUtil {
+ public:
+  // "tensor" is GPU-local.  "dev" is the hosting GPU.
+  // "device_context" should be the context of the GPU "_Send" op
+  // which provides the Tensor.
+  // Sets all necessasry fields of "proto" by transferring value
+  // bytes from GPU to CPU RAM. "is_dead" indicates that the
+  // tensor is dead with an uninit value.
+  static void SetProtoFromGPU(const Tensor& tensor, Device* dev,
+                              const DeviceContext* device_context,
+                              TensorProto* proto, bool is_dead,
+                              StatusCallback done);
+
+  // Copies "input" to "output" between devices accessible to the
+  // local process via some DMA-like method.  "edge_name" is the name
+  // of the tensor being copied, for debugging purposes. Depending on
+  // the type of devices and memory in use, the copy may be performed
+  // synchronously or asynchronously.  'done' will be invoked only
+  // after the copy is actually complete.
+  static void CopyViaDMA(const string& edge_name,
+                         DeviceContext* send_dev_context,
+                         DeviceContext* recv_dev_context, Device* src,
+                         Device* dst, const AllocatorAttributes src_alloc_attr,
+                         const AllocatorAttributes dst_alloc_attr,
+                         const Tensor* input, Tensor* output,
+                         StatusCallback done);
+
+  // Copies the data in 'gpu_tensor' into 'cpu_tensor'.
+  // 'gpu_tensor''s backing memory must be on 'gpu_device' and
+  // 'cpu_tensor' must be allocated to be of the same size as
+  // 'gpu_tensor'. Synchronous: may block.
+  static void CopyGPUTensorToCPU(Device* gpu_device,
+                                 const DeviceContext* device_context,
+                                 const Tensor* gpu_tensor, Tensor* cpu_tensor,
+                                 StatusCallback done);
+
+  // Blocks until all operations queued on the stream associated with
+  // "gpu_device" at the time of the call have completed.  Returns any
+  // error pending on the stream at completion.
+  static Status Sync(Device* gpu_device);
+
+  // Blocks until all operations queued on all streams associated with the
+  // corresponding GPU device at the time of call have completed.
+  // Returns any error pending on the stream at completion.
+  static Status SyncAll(Device* gpu_device);
+
+  // For debugging purpose, given a "device" and a "tensor" allocated
+  // on the device, return a string printing each byte in the tensor
+  // (up to a limit).  "device" can be either a CPU or a GPU device.
+  static string MemoryDebugString(const Device* device, Tensor* tensor);
+
+  static perftools::gputools::DeviceMemory<float> AsGPUFloat(const Tensor& t);
+
+  // Computes a checksum over the contents of "tensor", which is allocated
+  // on "gpu_device".
+  static uint64 Checksum(Device* gpu_device,
+                         const DeviceContext* device_context,
+                         const Tensor& tensor);
+
+  // Computes a checksum over the contents of "tensor", which is allocated
+  // in local CPU RAM.
+  static uint64 Checksum(const Tensor& tensor);
+
+  static void CopyCPUTensorToGPU(const Tensor* cpu_tensor,
+                                 const DeviceContext* device_context,
+                                 Device* gpu_device, Tensor* gpu_tensor,
+                                 StatusCallback done);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_