about summary refs log tree commit diff homepage
path: root/tensorflow/c/eager
diff options
context:
space:
mode:
authorGravatar Alexandre Passos <apassos@google.com>2018-03-29 13:35:34 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-03-29 13:38:24 -0700
commitc6911faaf4702096064542790d8c9e8e6f938d52 (patch)
tree3a0349eaf30a69315c972c3be57bf5d44abd34f8 /tensorflow/c/eager
parent9d1d379bcdd19d496fd8d2659c21a5510e045c5a (diff)
Turns eager device placement on by default.
Change the device policy to have silent copies, which are logged when RunMetadata tracking is enabled. In the process, changed TensorHandle to always keep its context around if it gets one. Changed TFE_TensorHandleResolve to, if necessary, copy to the CPU (since the user has no control as to whether this copy is needed by default). PiperOrigin-RevId: 190978086
Diffstat (limited to 'tensorflow/c/eager')
-rw-r--r--tensorflow/c/eager/c_api.cc100
-rw-r--r--tensorflow/c/eager/c_api.h18
-rw-r--r--tensorflow/c/eager/c_api_internal.h5
-rw-r--r--tensorflow/c/eager/c_api_test.cc10
4 files changed, 90 insertions, 43 deletions
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 028865d360..bb1492fca2 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -201,18 +201,24 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
const tensorflow::Tensor* t = nullptr;
status->status = h->handle->TensorAndDevice(&t, &d, &op_device);
if (!status->status.ok()) return nullptr;
+ tensorflow::TensorHandle* h_cpu = nullptr;
if (!IsCPU(d)) {
- TF_SetStatus(status, TF_UNIMPLEMENTED,
- tensorflow::strings::StrCat(
- "TFE_TensorHandle can be resolved iff it is on CPU (this "
- "handle is on ",
- d->name(),
- "). Consider using TFE_TensorHandleCopyToDevice to get a "
- "copy of the tensor on CPU")
- .c_str());
- return nullptr;
+ status->status = h->handle->CopyToDevice(
+ h->handle->Context(), h->handle->Context()->HostCPU(), &h_cpu);
+ if (!status->status.ok()) {
+ return nullptr;
+ }
+ status->status = h_cpu->TensorAndDevice(&t, &d, &op_device);
+ if (!status->status.ok()) {
+ h_cpu->Unref();
+ return nullptr;
+ }
}
- return tensorflow::TF_TensorFromTensor(*t, status);
+ TF_Tensor* retval = tensorflow::TF_TensorFromTensor(*t, status);
+ if (h_cpu != nullptr) {
+ h_cpu->Unref();
+ }
+ return retval;
}
} // extern "C"
@@ -258,17 +264,6 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
}
void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
- if (op->device == nullptr) {
- // Questionable heuristic ...
- // - If a device was explicitly set on the op, always use that.
- // - If not, place on the first non-host device seen.
- tensorflow::Device* d = nullptr;
- // TODO(agarwal): This call may block if h is not ready. Avoid this if
- // possible.
- status->status = h->handle->Device(&d);
- if (!status->status.ok()) return;
- if (!IsCPU(d)) op->device = d;
- }
h->handle->Ref();
op->inputs.push_back(h->handle);
op->attrs.NumInputs(op->inputs.size());
@@ -436,10 +431,39 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
namespace {
+// Initializes the step stats if needed.
+void MaybeInitializeStepStats(tensorflow::StepStats* step_stats,
+ tensorflow::EagerContext* ctx) {
+ // Lazily initialize the RunMetadata with information about all devices if
+ // this is the first call.
+ while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+ int device_idx = step_stats->dev_stats_size();
+ auto* dev_stats = step_stats->add_dev_stats();
+ dev_stats->set_device(ctx->devices()->at(device_idx)->name());
+ }
+}
+
+int StepStatsDeviceIndex(tensorflow::StepStats* step_stats,
+ tensorflow::EagerContext* ctx,
+ tensorflow::Device* device) {
+ // Find the current device's index.
+ if (device == nullptr) {
+ device = ctx->HostCPU();
+ }
+ for (int i = 0; i < ctx->devices()->size(); ++i) {
+ if (ctx->devices()->at(i) == device ||
+ ctx->devices()->at(i)->name() == device->name()) {
+ return i;
+ }
+ }
+ // TODO(apassos) do not fall back to host CPU if device is unknown.
+ return 0;
+}
+
tensorflow::Status ValidateInputTypeAndPlacement(
- tensorflow::EagerContext* ctx, tensorflow::Device* host_device,
- tensorflow::Device* op_device, TFE_Op* op,
- const tensorflow::OpKernel* kernel) {
+ tensorflow::EagerContext* ctx, tensorflow::Device* op_device, TFE_Op* op,
+ const tensorflow::OpKernel* kernel, tensorflow::RunMetadata* run_metadata) {
+ tensorflow::Device* host_device = ctx->HostCPU();
const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types();
if (memtypes.size() != op->inputs.size()) {
return tensorflow::errors::InvalidArgument(
@@ -489,9 +513,22 @@ tensorflow::Status ValidateInputTypeAndPlacement(
}
// We are only here if the policy is warn or silent copies, so we should
// trigger a copy.
+ auto pre_time = tensorflow::Env::Default()->NowMicros();
tensorflow::TensorHandle* copied_tensor = nullptr;
tensorflow::Status status = tensorflow::EagerCopyToDevice(
handle, ctx, expected_device->name().c_str(), &copied_tensor);
+ if (run_metadata != nullptr) {
+ auto* step_stats = run_metadata->mutable_step_stats();
+ MaybeInitializeStepStats(step_stats, ctx);
+ // Record the sending on the source device for now.
+ int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
+ auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+ auto* node_stats = dev_stats->add_node_stats();
+ node_stats->set_node_name("_Send");
+ node_stats->set_all_start_micros(pre_time);
+ node_stats->set_op_end_rel_micros(
+ tensorflow::Env::Default()->NowMicros() - pre_time);
+ }
if (!status.ok()) {
if (copied_tensor != nullptr) copied_tensor->Unref();
return tensorflow::errors::Internal(
@@ -785,8 +822,12 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
tensorflow::Device* input_op_device = nullptr;
status->status = op->inputs[i]->OpDevice(&input_op_device);
if (!status->status.ok()) return;
+ VLOG(2) << "for op " << op->name << " input " << i << " "
+ << tensorflow::DataTypeString(op->inputs[i]->dtype) << " "
+ << (input_op_device == nullptr ? "cpu" : input_op_device->name())
+ << " " << (op->device == nullptr ? "cpu" : op->device->name());
if (op->inputs[i]->dtype == tensorflow::DT_RESOURCE &&
- input_op_device != op->device) {
+ (input_op_device != op->device || input_op_device == nullptr)) {
tensorflow::Device* d =
input_op_device == nullptr ? ctx->context.HostCPU() : input_op_device;
VLOG(1) << "Changing device of operation " << op->name << " to "
@@ -796,16 +837,13 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
}
}
tensorflow::Device* device = op->device;
- if (!ctx->context.SoftPlacement() && device == nullptr) {
- device = ctx->context.HostCPU();
- }
tensorflow::Fprint128 cache_key =
op->attrs.CacheKey(device == nullptr ? "unspecified" : device->name());
tensorflow::KernelAndDevice* kernel = ctx->context.GetCachedKernel(cache_key);
if (kernel == nullptr) {
const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
- if (ctx->context.SoftPlacement() && device == nullptr) {
+ if (device == nullptr) {
device = SelectDevice(ndef, ctx, status);
if (!status->status.ok()) {
return;
@@ -867,7 +905,9 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
device = kernel->device();
}
status->status = ValidateInputTypeAndPlacement(
- &ctx->context, ctx->context.HostCPU(), device, op, kernel->kernel());
+ &ctx->context, device, op, kernel->kernel(),
+ ctx->context.ShouldStoreMetadata() ? ctx->context.RunMetadataProto()
+ : nullptr);
if (!status->status.ok()) return;
std::unique_ptr<tensorflow::NodeExecStats> maybe_stats;
if (ctx->context.ShouldStoreMetadata()) {
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index a5029bf211..3926c22ce1 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -61,17 +61,15 @@ TF_CAPI_EXPORT extern void TFE_ContextOptionsSetConfig(
// Controls how to act when we try to run an operation on a given device but
// some input tensors are not on that device.
typedef enum TFE_ContextDevicePlacementPolicy {
- // Running operations with input tensors on the wrong device will fail. When
- // soft placement is enabled acts like TFE_DEVICE_PLACEMENT_SILENT.
+ // Running operations with input tensors on the wrong device will fail.
TFE_DEVICE_PLACEMENT_EXPLICIT = 0,
// Copy the tensor to the right device but log a warning.
TFE_DEVICE_PLACEMENT_WARN = 1,
- // Silently copy the tensor, which has a performance cost since the
- // operation will be blocked till the copy completes.
+ // Silently copy the tensor, which has a performance cost since the operation
+ // will be blocked till the copy completes. This is the default placement
+ // policy.
TFE_DEVICE_PLACEMENT_SILENT = 2,
- // Default placement policy which silently copies int32 tensors but not other
- // dtypes. When soft placement is enabled acts like
- // TFE_DEVICE_PLACEMENT_SILENT.
+ // Placement policy which silently copies int32 tensors but not other dtypes.
TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
} TFE_ContextDevicePlacementPolicy;
@@ -162,7 +160,11 @@ TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h,
TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(
TFE_TensorHandle* h, TF_Status* status);
-// This function will block till the operation that produces `h` has completed.
+// This function will block till the operation that produces `h` has
+// completed. The memory returned might alias the internal memory used by
+// TensorFlow. Hence, callers should not mutate this memory (for example by
+// modifying the memory region pointed to by TF_TensorData() on the returned
+// TF_Tensor).
TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h,
TF_Status* status);
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index e6d2ab75ff..05dc64f521 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -50,8 +50,7 @@ struct TFE_ContextOptions {
TF_SessionOptions session_options;
// true if async execution is enabled.
bool async = false;
- TFE_ContextDevicePlacementPolicy policy{
- TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32};
+ TFE_ContextDevicePlacementPolicy policy{TFE_DEVICE_PLACEMENT_SILENT};
};
struct TFE_Context {
@@ -71,7 +70,7 @@ struct TFE_Context {
struct TFE_TensorHandle {
TFE_TensorHandle(const tensorflow::Tensor& t, tensorflow::Device* d,
tensorflow::Device* op_device)
- : handle(new tensorflow::TensorHandle(t, d, op_device)) {}
+ : handle(new tensorflow::TensorHandle(t, d, op_device, nullptr)) {}
TFE_TensorHandle(tensorflow::uint64 node_id, tensorflow::DataType dtype,
tensorflow::EagerContext* ctx)
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index d88a6c1dda..701175e494 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -590,7 +590,13 @@ void Execute_MatMul_CPU_Runtime_Error(bool async) {
TFE_TensorHandle* m1 = TestMatrixTensorHandle();
TFE_TensorHandle* m2 = TestMatrixTensorHandle3X2();
TFE_Op* matmul = MatMulOp(ctx, m1, m2);
+ TFE_OpSetDevice(matmul, "/job:localhost/replica:0/task:0/device:CPU:0",
+ status);
+ ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_Op* matmul2 = MatMulOp(ctx, m1, m1);
+ TFE_OpSetDevice(matmul2, "/job:localhost/replica:0/task:0/device:CPU:0",
+ status);
+ ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_TensorHandle* retvals[1] = {nullptr};
int num_retvals = 1;
TFE_Execute(matmul, &retvals[0], &num_retvals, status);
@@ -693,14 +699,14 @@ TEST(CAPI, Execute_Min_CPU) {
TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_DeleteTensorHandle(retvals[0]);
- TFE_DeleteContext(ctx, status);
- ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
float output[2] = {0};
EXPECT_EQ(sizeof(output), TF_TensorByteSize(t));
memcpy(&output[0], TF_TensorData(t), TF_TensorByteSize(t));
TF_DeleteTensor(t);
EXPECT_EQ(1, output[0]);
EXPECT_EQ(3, output[1]);
+ TFE_DeleteContext(ctx, status);
+ ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
}