aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2017-04-13 00:52:08 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-04-13 02:09:42 -0700
commitb4396632f78624057eefc79721e5081254068d48 (patch)
tree9452e44fecbc0c90e56bda5dbe137ec5ce290c8f
parentcf1a098bc78897d04ec14c411ddf2a158fdb0851 (diff)
Add a GPUOPTIONS option to force all tensors to be gpu_compatible
Change: 153039058
-rw-r--r--tensorflow/core/common_runtime/gpu/gpu_device.h1
-rw-r--r--tensorflow/core/common_runtime/gpu/gpu_device_factory.cc16
-rw-r--r--tensorflow/core/protobuf/config.proto12
-rw-r--r--tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt4
4 files changed, 29 insertions, 4 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 370b3cc4f6..2525931301 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -108,6 +108,7 @@ class BaseGPUDevice : public LocalDevice {
mutex trace_mu_;
int gpu_id_ = -1;
const bool sync_every_op_ = false;
+ bool force_gpu_compatible_ = false;
const int32 max_streams_;
std::unique_ptr<EventMgr> em_;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
index 94143a55d5..d9fa5a6b96 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -31,12 +31,16 @@ class GPUDevice : public BaseGPUDevice {
Allocator* cpu_allocator)
: BaseGPUDevice(options, name, memory_limit, locality, gpu_id,
physical_device_desc, gpu_allocator, cpu_allocator,
- false /* sync every op */, 1 /* max_streams */) {}
+ false /* sync every op */, 1 /* max_streams */) {
+ if (options.config.has_gpu_options()) {
+ force_gpu_compatible_ = options.config.gpu_options().force_gpu_compatible();
+ }
+ }
Allocator* GetAllocator(AllocatorAttributes attr) override {
if (attr.on_host()) {
ProcessState* ps = ProcessState::singleton();
- if (attr.gpu_compatible()) {
+ if (attr.gpu_compatible() || force_gpu_compatible_) {
return ps->GetCUDAHostAllocator(0);
} else {
return cpu_allocator_;
@@ -71,12 +75,16 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
GPUCompatibleCPUDevice(const SessionOptions& options, const string& name,
Bytes memory_limit, const DeviceLocality& locality,
Allocator* allocator)
- : ThreadPoolDevice(options, name, memory_limit, locality, allocator) {}
+ : ThreadPoolDevice(options, name, memory_limit, locality, allocator) {
+ if (options.config.has_gpu_options()) {
+ force_gpu_compatible_ = options.config.gpu_options().force_gpu_compatible();
+ }
+ }
~GPUCompatibleCPUDevice() override {}
Allocator* GetAllocator(AllocatorAttributes attr) override {
ProcessState* ps = ProcessState::singleton();
- if (attr.gpu_compatible()) {
+ if (attr.gpu_compatible() || force_gpu_compatible_) {
return ps->GetCUDAHostAllocator(0);
} else {
// Call the parent's implementation.
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 98e7b171d2..5c0f7232eb 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -64,6 +64,18 @@ message GPUOptions {
// PollEvents calls, when the queue is empty. If value is not
// set or set to 0, gets set to a non-zero default.
int32 polling_inactive_delay_msecs = 7;
+
+ // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
+ // enabling this option forces all CPU tensors to be allocated with Cuda
+ // pinned memory. Normally, TensorFlow will infer which tensors should be
+ // allocated as the pinned memory. But in case where the inference is
+ // incomplete, this option can significantly speed up the cross-device memory
+ // copy performance as long as it fits the memory.
+ // Note that this option is not something that should be
+ // enabled by default for unknown or very large models, since all Cuda pinned
+ // memory is unpageable, having too much pinned memory might negatively impact
+ // the overall host system performance.
+ bool force_gpu_compatible = 8;
};
// Options passed to the graph optimizer
diff --git a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
index 48cda623f7..30f7e4e116 100644
--- a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
@@ -23,6 +23,10 @@ tf_class {
mtype: "<type \'getset_descriptor\'>"
}
member {
+ name: "FORCE_GPU_COMPATIBLE_FIELD_NUMBER"
+ mtype: "<type \'int\'>"
+ }
+ member {
name: "PER_PROCESS_GPU_MEMORY_FRACTION_FIELD_NUMBER"
mtype: "<type \'int\'>"
}