diff options
author | Asim Shankar <ashankar@google.com> | 2018-06-15 16:26:35 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-06-15 16:31:01 -0700 |
commit | 4d8a66c5b29428b709f4f54b566a44902ea8173e (patch) | |
tree | 717b000e9284c2c9ae0710c32df14bc36f86dda7 /tensorflow/python/lib | |
parent | 96100f90a90bb2db905f50617cbb5e7928480667 (diff) |
[py_func]: Fix #20021
* EagerPyFunc now validates its assumption that returned tensors are backed by memory on the same device that the EagerPyFunc kernel executed on.
* Make the Python trampolining mechanism ensure that this requirement of the kernel is met.
* Allow tf.contrib.eager.py_func to execute correctly on devices other than CPU and GPU:0.
Prior to this change, tf.contrib.eager.py_func() would copy data from CPU to GPU:0 if necessary, but not the other way around. As a result, the assumptions made by the EagerPyFunc kernel implementation about the placement of returned tensors would be violated.
The test added in py_func_test.py, when executed on a machine with a GPU, will:
- Fail with a segmentation fault (dereferencing GPU memory) without the changes to py_func.cc and script_ops.py
- Fail with an error message with the change to py_func.cc but without the change to script_ops.py
- Pass with changes to py_func.cc and script_ops.py
PiperOrigin-RevId: 200792057
Diffstat (limited to 'tensorflow/python/lib')
-rw-r--r-- | tensorflow/python/lib/core/py_func.cc | 70 |
1 files changed, 48 insertions, 22 deletions
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc index 30c1a9c759..57139986af 100644 --- a/tensorflow/python/lib/core/py_func.cc +++ b/tensorflow/python/lib/core/py_func.cc @@ -55,37 +55,35 @@ struct PyCall { string token; // The device on which Tensors are stored; only used for EagerPyFunc. - Device* device; - - // True if and only if the op has been placed on a GPU. - bool gpu; + Device* device = nullptr; // True if the call is associated with an EagerPyFunc. - bool eager; + bool eager = false; // Inputs and outputs of this function invocation. std::vector<Tensor> ins; std::vector<Tensor> out; }; +bool IsCPUDevice(const Device* d) { + return d == nullptr || d->tensorflow_gpu_device_info() == nullptr; +} + // Givens the 'call', prepares the token and inputs as a python tuple // that is appropriate for calling the trampoline. Status MakeArgTuple(const PyCall* call, PyObject** tuple) { int64 n = call->ins.size(); PyObject* lst = PyList_New(n); CHECK(lst); + // TFE_TensorHandle assumes that CPU is identified by nullptr. + Device* device = IsCPUDevice(call->device) ? nullptr : call->device; for (int64 i = 0; i < n; ++i) { PyObject* arg = nullptr; const Tensor& t = call->ins[i]; if (call->eager) { - if (call->gpu) { - arg = EagerTensorFromHandle( - new TFE_TensorHandle(t, call->device, call->device)); - } else { - // TFE_TensorHandle assumes that CPU is identified by `nullptr`. - arg = EagerTensorFromHandle(new TFE_TensorHandle(t, nullptr, nullptr)); - } + arg = EagerTensorFromHandle(new TFE_TensorHandle(t, device, device)); if (arg == nullptr) { + Py_DECREF(lst); return errors::Internal("Unable to procure EagerTensor from Tensor."); } } else { @@ -97,8 +95,9 @@ Status MakeArgTuple(const PyCall* call, PyObject** tuple) { } PyList_SetItem(lst, i, arg); } - *tuple = Py_BuildValue("(sON)", call->token.c_str(), - call->gpu ? Py_True : Py_False, lst); + const char* device_name = + device == nullptr ? 
nullptr : device->attributes().name().c_str(); + *tuple = Py_BuildValue("(ssN)", call->token.c_str(), device_name, lst); CHECK(*tuple); return Status::OK(); } @@ -167,9 +166,40 @@ bool IsSingleNone(PyObject* obj) { } // Retrieves a Tensor from `eager_tensor` and stores it in `output_tensor`. +// Validates that `output_tensor` is backed by memory in `expected_device` +// (which is assumed to be a local device, one on which the kernel was +// executed.) +// +// It may be nice to copy the tensor to the right device instead of failing if +// it isn't already there. This is left as a future exercise. The required +// device-copying logic is implemented in Python at the moment. tensorflow::Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor, + const Device* expected_device, const Tensor** output_tensor) { - return EagerTensor_Handle(eager_tensor)->handle->Tensor(output_tensor); + auto handle = EagerTensor_Handle(eager_tensor)->handle; + Device* actual_device = nullptr; + TF_RETURN_IF_ERROR(handle->Device(&actual_device)); + TF_RETURN_IF_ERROR(handle->Tensor(output_tensor)); + // actual_device may be nullptr, which implies local CPU. + if (expected_device == actual_device) return Status::OK(); + const string& expected_device_name = expected_device->attributes().name(); + if (actual_device == nullptr) { + if (!IsCPUDevice(expected_device)) { + return errors::Internal( + "expected the py_func to return a Tensor backed by memory in ", + expected_device_name, + ", but is actually backed by local host memory. This is a bug."); + } + return Status::OK(); + } + const string& actual_device_name = actual_device->attributes().name(); + if (actual_device_name != expected_device_name) { + return errors::Internal( + "expected the py_func to return a Tensor backed by memory in ", + expected_device_name, ", but is actually in ", actual_device_name, + ". This is a bug."); + } + return Status::OK(); } // Calls the registered py function through the trampoline. 
@@ -224,7 +254,7 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) { const PyObject* item = PyList_GetItem(result, i); if (EagerTensor_CheckExact(item)) { const Tensor* tensor = nullptr; - s = ExtractTensorFromEagerTensor(item, &tensor); + s = ExtractTensorFromEagerTensor(item, call->device, &tensor); if (s.ok()) t = *tensor; } else { s = errors::FailedPrecondition( @@ -245,7 +275,7 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) { DCHECK(call->eager); if (result != Py_None) { const Tensor* t = nullptr; - s = ExtractTensorFromEagerTensor(result, &t); + s = ExtractTensorFromEagerTensor(result, call->device, &t); if (s.ok()) call->out.push_back(*t); } } else if (PyArray_Check(result)) { @@ -449,13 +479,11 @@ class PyFuncOp : public OpKernel { explicit PyFuncOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("token", &token_)); eager_ = type_string() == "EagerPyFunc"; - gpu_ = ctx->device_type().type_string() == DEVICE_GPU; } void Compute(OpKernelContext* ctx) override { PyCall call; call.token = token_; - call.gpu = gpu_; call.eager = eager_; if (call.eager) { // Eager's C API uses `Device`, whereas `OpKernelContext` stores a @@ -464,6 +492,7 @@ class PyFuncOp : public OpKernel { if (call.device == nullptr) { ctx->CtxFailureWithWarning( errors::Internal("Unrecognized device class")); + return; } } @@ -508,9 +537,6 @@ class PyFuncOp : public OpKernel { private: string token_; - // True if and only if this op has been placed on a GPU. - bool gpu_; - // True if and only if this op should execute the python function eagerly, // i.e., if and only if the eager attribute is set. bool eager_; |