diff options
author | Akshay Modi <nareshmodi@google.com> | 2018-10-04 16:10:21 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-10-04 16:14:56 -0700 |
commit | cf8e7cf89abb4a7783b9a99f17574ea128fa767a (patch) | |
tree | 52e733a0ec849c70356ed51675e8ac46916bbc18 /tensorflow/core/common_runtime | |
parent | d6a2e7bcca5683c377b592f177bcac9aeb1c550f (diff) |
Pin ops with small integer inputs (already on the cpu) to the cpu in eager.
An environment variable (TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING) is provided to turn this off if necessary (it's on by default).
PiperOrigin-RevId: 215821915
Diffstat (limited to 'tensorflow/core/common_runtime')
-rw-r--r-- | tensorflow/core/common_runtime/eager/context.cc | 4 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/eager/context.h | 2 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/eager/execute.cc | 67 |
3 files changed, 63 insertions, 10 deletions
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index 18420b60fd..f23cefb33d 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -70,7 +70,9 @@ EagerContext::EagerContext(const SessionOptions& opts, async_default_(async), log_memory_(LogMemory::IsEnabled()), env_(opts.env), - use_send_tensor_rpc_(false) { + use_send_tensor_rpc_(false), + pin_small_ops_to_cpu_(ReadBoolFromEnvVar( + "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING", true)) { if (device_mgr_owned) { local_device_manager_.reset(device_mgr); local_unowned_device_manager_ = nullptr; diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index 5ed6057ec6..15eeaa8066 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -202,6 +202,7 @@ class EagerContext { // EagerService.SendTensor RPC. If false, _Send/_Recv ops should be used // instead (which in-turn use WorkerService.RecvTensor RPCs). 
bool UseSendTensorRPC() { return use_send_tensor_rpc_; } + bool PinSmallOpsToCPU() { return pin_small_ops_to_cpu_; } private: void InitDeviceMapAndAsync(); @@ -293,6 +294,7 @@ class EagerContext { #endif bool use_send_tensor_rpc_; + const bool pin_small_ops_to_cpu_; }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 1bc63616d0..a52f933d75 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -579,19 +579,23 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, return Status::OK(); #endif } -} // namespace -Status EagerExecute(EagerOperation* op, - gtl::InlinedVector<TensorHandle*, 2>* retvals, - int* num_retvals) { - // Ensure all resource-touching ops run in the device the resource is, - // regardless of anything else that has been specified. This is identical to - // the graph mode behavior. +// The Op device may be updated if: +// - A resource touching input is specified: all resource-touching ops run in +// the device the resource is, regardless of anything else that has been +// specified. This is identical to the graph mode behavior. +// +// - All op inputs are on the CPU, small (<64 elements) and integers +// (int32/int64). This can be disabled by setting the environment variable +// "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING" to "0" or "false". 
+Status MaybeUpdateOpDevice(EagerOperation* op) { EagerContext* ctx = op->EagerContext(); + bool device_set_for_resource_variable = false; + bool all_inputs_eligible_for_cpu_pinning = ctx->PinSmallOpsToCPU(); + for (int i = 0; i < op->Inputs().size(); ++i) { Device* input_op_device = nullptr; - auto status = op->Inputs()[i]->OpDevice(&input_op_device); - if (!status.ok()) return status; + TF_RETURN_IF_ERROR(op->Inputs()[i]->OpDevice(&input_op_device)); VLOG(2) << "for op " << op->Name() << " input " << i << " " << DataTypeString(op->Inputs()[i]->dtype) << " " << (input_op_device == nullptr ? "cpu" : input_op_device->name()) @@ -603,8 +607,53 @@ Status EagerExecute(EagerOperation* op, << d->name() << " because input #" << i << " is a resource in this device."; op->SetDevice(d); + + device_set_for_resource_variable = true; + all_inputs_eligible_for_cpu_pinning = false; + } else if (all_inputs_eligible_for_cpu_pinning) { + TensorHandle* handle = op->Inputs()[i]; + + // Input is on CPU. + if (input_op_device != nullptr && input_op_device != ctx->HostCPU()) { + all_inputs_eligible_for_cpu_pinning = false; + continue; + } + + if (handle->dtype != DataType::DT_INT32 && + handle->dtype != DataType::DT_INT64) { + all_inputs_eligible_for_cpu_pinning = false; + continue; + } + + int64 num_elements; + TF_RETURN_IF_ERROR(handle->NumElements(&num_elements)); + if (num_elements > 64) { + all_inputs_eligible_for_cpu_pinning = false; + } } } + + // Ops without inputs are usually ops that generate a tensor in some way and + // usually require being present on whatever device they are scheduled on + // - for e.g. VarHandleOp or _Recv). + // TODO(nareshmodi): Is it possible there is no int32/int64 CPU kernel for + // an op, but there is a GPU kernel? 
+ if (!op->Inputs().empty() && all_inputs_eligible_for_cpu_pinning) { + VLOG(1) << "Forcing op " << op->Name() + << " to be on the CPU since all input tensors have an " + "int32/int64 dtype, and are small (less than 64 elements)."; + op->SetDevice(ctx->HostCPU()); + } + + return Status::OK(); +} +} // namespace + +Status EagerExecute(EagerOperation* op, + gtl::InlinedVector<TensorHandle*, 2>* retvals, + int* num_retvals) { + TF_RETURN_IF_ERROR(MaybeUpdateOpDevice(op)); + bool op_is_local = IsLocal(op->EagerContext(), op->Device()); if (op_is_local) { |