aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core
diff options
context:
space:
mode:
authorGravatar Akshay Modi <nareshmodi@google.com>2018-10-04 16:10:21 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-10-04 16:14:56 -0700
commitcf8e7cf89abb4a7783b9a99f17574ea128fa767a (patch)
tree52e733a0ec849c70356ed51675e8ac46916bbc18 /tensorflow/core
parentd6a2e7bcca5683c377b592f177bcac9aeb1c550f (diff)
Pin ops with small integer inputs (already on the cpu) to the cpu in eager.
An environment variable (TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING) is provided to turn this off if necessary (it's on by default). PiperOrigin-RevId: 215821915
Diffstat (limited to 'tensorflow/core')
-rw-r--r--tensorflow/core/common_runtime/eager/context.cc4
-rw-r--r--tensorflow/core/common_runtime/eager/context.h2
-rw-r--r--tensorflow/core/common_runtime/eager/execute.cc67
3 files changed, 63 insertions, 10 deletions
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 18420b60fd..f23cefb33d 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -70,7 +70,9 @@ EagerContext::EagerContext(const SessionOptions& opts,
async_default_(async),
log_memory_(LogMemory::IsEnabled()),
env_(opts.env),
- use_send_tensor_rpc_(false) {
+ use_send_tensor_rpc_(false),
+ pin_small_ops_to_cpu_(ReadBoolFromEnvVar(
+ "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING", true)) {
if (device_mgr_owned) {
local_device_manager_.reset(device_mgr);
local_unowned_device_manager_ = nullptr;
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 5ed6057ec6..15eeaa8066 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -202,6 +202,7 @@ class EagerContext {
// EagerService.SendTensor RPC. If false, _Send/_Recv ops should be used
// instead (which in-turn use WorkerService.RecvTensor RPCs).
bool UseSendTensorRPC() { return use_send_tensor_rpc_; }
+ bool PinSmallOpsToCPU() { return pin_small_ops_to_cpu_; }
private:
void InitDeviceMapAndAsync();
@@ -293,6 +294,7 @@ class EagerContext {
#endif
bool use_send_tensor_rpc_;
+ const bool pin_small_ops_to_cpu_;
};
} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 1bc63616d0..a52f933d75 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -579,19 +579,23 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
return Status::OK();
#endif
}
-} // namespace
-Status EagerExecute(EagerOperation* op,
- gtl::InlinedVector<TensorHandle*, 2>* retvals,
- int* num_retvals) {
- // Ensure all resource-touching ops run in the device the resource is,
- // regardless of anything else that has been specified. This is identical to
- // the graph mode behavior.
+// The Op device may be updated if:
+// - A resource touching input is specified: all resource-touching ops run in
+// the device the resource is, regardless of anything else that has been
+// specified. This is identical to the graph mode behavior.
+//
+// - All op inputs are on the CPU, small (<64 elements) and integers
+// (int32/int64). This can be disabled by setting the environment variable
+// "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING" to "0" or "false".
+Status MaybeUpdateOpDevice(EagerOperation* op) {
EagerContext* ctx = op->EagerContext();
+ bool device_set_for_resource_variable = false;
+ bool all_inputs_eligible_for_cpu_pinning = ctx->PinSmallOpsToCPU();
+
for (int i = 0; i < op->Inputs().size(); ++i) {
Device* input_op_device = nullptr;
- auto status = op->Inputs()[i]->OpDevice(&input_op_device);
- if (!status.ok()) return status;
+ TF_RETURN_IF_ERROR(op->Inputs()[i]->OpDevice(&input_op_device));
VLOG(2) << "for op " << op->Name() << " input " << i << " "
<< DataTypeString(op->Inputs()[i]->dtype) << " "
<< (input_op_device == nullptr ? "cpu" : input_op_device->name())
@@ -603,8 +607,53 @@ Status EagerExecute(EagerOperation* op,
<< d->name() << " because input #" << i
<< " is a resource in this device.";
op->SetDevice(d);
+
+ device_set_for_resource_variable = true;
+ all_inputs_eligible_for_cpu_pinning = false;
+ } else if (all_inputs_eligible_for_cpu_pinning) {
+ TensorHandle* handle = op->Inputs()[i];
+
+ // Input is on CPU.
+ if (input_op_device != nullptr && input_op_device != ctx->HostCPU()) {
+ all_inputs_eligible_for_cpu_pinning = false;
+ continue;
+ }
+
+ if (handle->dtype != DataType::DT_INT32 &&
+ handle->dtype != DataType::DT_INT64) {
+ all_inputs_eligible_for_cpu_pinning = false;
+ continue;
+ }
+
+ int64 num_elements;
+ TF_RETURN_IF_ERROR(handle->NumElements(&num_elements));
+ if (num_elements > 64) {
+ all_inputs_eligible_for_cpu_pinning = false;
+ }
}
}
+
+ // Ops without inputs are usually ops that generate a tensor in some way and
+ // usually require being present on whatever device they are scheduled on
+ // (e.g. VarHandleOp or _Recv).
+ // TODO(nareshmodi): Is it possible there is no int32/int64 CPU kernel for
+ // an op, but there is a GPU kernel?
+ if (!op->Inputs().empty() && all_inputs_eligible_for_cpu_pinning) {
+ VLOG(1) << "Forcing op " << op->Name()
+ << " to be on the CPU since all input tensors have an "
+ "int32/int64 dtype, and are small (less than 64 elements).";
+ op->SetDevice(ctx->HostCPU());
+ }
+
+ return Status::OK();
+}
+} // namespace
+
+Status EagerExecute(EagerOperation* op,
+ gtl::InlinedVector<TensorHandle*, 2>* retvals,
+ int* num_retvals) {
+ TF_RETURN_IF_ERROR(MaybeUpdateOpDevice(op));
+
bool op_is_local = IsLocal(op->EagerContext(), op->Device());
if (op_is_local) {