aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2017-04-13 00:52:08 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-04-13 02:09:42 -0700
commitb4396632f78624057eefc79721e5081254068d48 (patch)
tree9452e44fecbc0c90e56bda5dbe137ec5ce290c8f
parentcf1a098bc78897d04ec14c411ddf2a158fdb0851 (diff)
Add a GPUOPTIONS option to force all tensors to be gpu_compatible
Change: 153039058
-rw-r--r--tensorflow/core/common_runtime/gpu/gpu_device.h1
-rw-r--r--tensorflow/core/common_runtime/gpu/gpu_device_factory.cc16
-rw-r--r--tensorflow/core/protobuf/config.proto12
-rw-r--r--tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt4
4 files changed, 29 insertions, 4 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 370b3cc4f6..2525931301 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -108,6 +108,7 @@ class BaseGPUDevice : public LocalDevice {
mutex trace_mu_;
int gpu_id_ = -1;
const bool sync_every_op_ = false;
+ bool force_gpu_compatible_ = false;
const int32 max_streams_;
std::unique_ptr<EventMgr> em_;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
index 94143a55d5..d9fa5a6b96 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -31,12 +31,16 @@ class GPUDevice : public BaseGPUDevice {
Allocator* cpu_allocator)
: BaseGPUDevice(options, name, memory_limit, locality, gpu_id,
physical_device_desc, gpu_allocator, cpu_allocator,
- false /* sync every op */, 1 /* max_streams */) {}
+ false /* sync every op */, 1 /* max_streams */) {
+ if (options.config.has_gpu_options()) {
+ force_gpu_compatible_ = options.config.gpu_options().force_gpu_compatible();
+ }
+ }
Allocator* GetAllocator(AllocatorAttributes attr) override {
if (attr.on_host()) {
ProcessState* ps = ProcessState::singleton();
- if (attr.gpu_compatible()) {
+ if (attr.gpu_compatible() || force_gpu_compatible_) {
return ps->GetCUDAHostAllocator(0);
} else {
return cpu_allocator_;
@@ -71,12 +75,16 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
GPUCompatibleCPUDevice(const SessionOptions& options, const string& name,
Bytes memory_limit, const DeviceLocality& locality,
Allocator* allocator)
- : ThreadPoolDevice(options, name, memory_limit, locality, allocator) {}
+ : ThreadPoolDevice(options, name, memory_limit, locality, allocator) {
+ if (options.config.has_gpu_options()) {
+ force_gpu_compatible_ = options.config.gpu_options().force_gpu_compatible();
+ }
+ }
~GPUCompatibleCPUDevice() override {}
Allocator* GetAllocator(AllocatorAttributes attr) override {
ProcessState* ps = ProcessState::singleton();
- if (attr.gpu_compatible()) {
+ if (attr.gpu_compatible() || force_gpu_compatible_) {
return ps->GetCUDAHostAllocator(0);
} else {
// Call the parent's implementation.
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 98e7b171d2..5c0f7232eb 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -64,6 +64,18 @@ message GPUOptions {
// PollEvents calls, when the queue is empty. If value is not
// set or set to 0, gets set to a non-zero default.
int32 polling_inactive_delay_msecs = 7;
+
+ // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
+ // enabling this option forces all CPU tensors to be allocated with Cuda
+ // pinned memory. Normally, TensorFlow will infer which tensors should be
+ // allocated as the pinned memory. But in case where the inference is
+ // incomplete, this option can significantly speed up the cross-device memory
+ // copy performance as long as it fits the memory.
+ // Note that this option is not something that should be
+ // enabled by default for unknown or very large models, since all Cuda pinned
+ // memory is unpageable, having too much pinned memory might negatively impact
+ // the overall host system performance.
+ bool force_gpu_compatible = 8;
};
// Options passed to the graph optimizer
diff --git a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
index 48cda623f7..30f7e4e116 100644
--- a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
@@ -23,6 +23,10 @@ tf_class {
mtype: "<type \'getset_descriptor\'>"
}
member {
+ name: "FORCE_GPU_COMPATIBLE_FIELD_NUMBER"
+ mtype: "<type \'int\'>"
+ }
+ member {
name: "PER_PROCESS_GPU_MEMORY_FRACTION_FIELD_NUMBER"
mtype: "<type \'int\'>"
}