diff options
author | Asim Shankar <ashankar@google.com> | 2018-06-15 16:26:35 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-06-15 16:31:01 -0700 |
commit | 4d8a66c5b29428b709f4f54b566a44902ea8173e (patch) | |
tree | 717b000e9284c2c9ae0710c32df14bc36f86dda7 /tensorflow/python/lib | |
parent | 96100f90a90bb2db905f50617cbb5e7928480667 (diff) |
[py_func]: Fix #20021
* EagerPyFunc now validates its assumption that returned tensors are backed by memory on the same device that the EagerPyFunc kernel executed on.
* Make the Python trampolining mechanism ensure that this requirement of the kernel is met.
* Allow tf.contrib.eager.py_func to execute correctly on devices other than CPU and GPU:0.
Prior to this change, tf.contrib.eager.py_func() would copy data from CPU to GPU:0 if necessary, but not the other way around. As a result, the assumptions made by the EagerPyFunc kernel implementation about the placement of returned tensors would be violated.
The test added in py_func_test.py, when executed on a machine with a GPU, will:
- Fail with a segmentation fault (dereferencing GPU memory) without the changes to py_func.cc and script_ops.py
- Fail with an error message with the change to py_func.cc but without the change to script_ops.py
- Pass with changes to py_func.cc and script_ops.py
PiperOrigin-RevId: 200792057
Diffstat (limited to 'tensorflow/python/lib')
-rw-r--r-- | tensorflow/python/lib/core/py_func.cc | 70 |
1 files changed, 48 insertions, 22 deletions
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc index 30c1a9c759..57139986af 100644 --- a/tensorflow/python/lib/core/py_func.cc +++ b/tensorflow/python/lib/core/py_func.cc @@ -55,37 +55,35 @@ struct PyCall { string token; // The device on which Tensors are stored; only used for EagerPyFunc. - Device* device; - - // True if and only if the op has been placed on a GPU. - bool gpu; + Device* device = nullptr; // True if the call is associated with an EagerPyFunc. - bool eager; + bool eager = false; // Inputs and outputs of this function invocation. std::vector<Tensor> ins; std::vector<Tensor> out; }; +bool IsCPUDevice(const Device* d) { + return d == nullptr || d->tensorflow_gpu_device_info() == nullptr; +} + // Givens the 'call', prepares the token and inputs as a python tuple // that is appropriate for calling the trampoline. Status MakeArgTuple(const PyCall* call, PyObject** tuple) { int64 n = call->ins.size(); PyObject* lst = PyList_New(n); CHECK(lst); + // TFE_TensorHandle assumes that CPU is identified by nullptr. + Device* device = IsCPUDevice(call->device) ? nullptr : call->device; for (int64 i = 0; i < n; ++i) { PyObject* arg = nullptr; const Tensor& t = call->ins[i]; if (call->eager) { - if (call->gpu) { - arg = EagerTensorFromHandle( - new TFE_TensorHandle(t, call->device, call->device)); - } else { - // TFE_TensorHandle assumes that CPU is identified by `nullptr`. - arg = EagerTensorFromHandle(new TFE_TensorHandle(t, nullptr, nullptr)); - } + arg = EagerTensorFromHandle(new TFE_TensorHandle(t, device, device)); if (arg == nullptr) { + Py_DECREF(lst); return errors::Internal("Unable to procure EagerTensor from Tensor."); } } else { @@ -97,8 +95,9 @@ Status MakeArgTuple(const PyCall* call, PyObject** tuple) { } PyList_SetItem(lst, i, arg); } - *tuple = Py_BuildValue("(sON)", call->token.c_str(), - call->gpu ? Py_True : Py_False, lst); + const char* device_name = + device == nullptr ? 
nullptr : device->attributes().name().c_str(); + *tuple = Py_BuildValue("(ssN)", call->token.c_str(), device_name, lst); CHECK(*tuple); return Status::OK(); } @@ -167,9 +166,40 @@ bool IsSingleNone(PyObject* obj) { } // Retrieves a Tensor from `eager_tensor` and stores it in `output_tensor`. +// Validates that `output_tensor` is backed by memory in `expected_device` +// (which is assumed to be a local device, one on which the kernel was +// executed.) +// +// It may be nice to copy the tensor to the right device instead of failing if +// it isn't already there. This is left as a future exercise. The required +// device-copying logic is implemented in Python at the moment. tensorflow::Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor, + const Device* expected_device, const Tensor** output_tensor) { - return EagerTensor_Handle(eager_tensor)->handle->Tensor(output_tensor); + auto handle = EagerTensor_Handle(eager_tensor)->handle; + Device* actual_device = nullptr; + TF_RETURN_IF_ERROR(handle->Device(&actual_device)); + TF_RETURN_IF_ERROR(handle->Tensor(output_tensor)); + // actual_device may be nullptr, which implies local CPU. + if (expected_device == actual_device) return Status::OK(); + const string& expected_device_name = expected_device->attributes().name(); + if (actual_device == nullptr) { + if (!IsCPUDevice(expected_device)) { + return errors::Internal( + "expected the py_func to return a Tensor backed by memory in ", + expected_device_name, + ", but is actually backed by local host memory. This is a bug."); + } + return Status::OK(); + } + const string& actual_device_name = actual_device->attributes().name(); + if (actual_device_name != expected_device_name) { + return errors::Internal( + "expected the py_func to return a Tensor backed by memory in ", + expected_device_name, ", but is actually in ", actual_device_name, + ". This is a bug."); + } + return Status::OK(); } // Calls the registered py function through the trampoline. 
@@ -224,7 +254,7 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) { const PyObject* item = PyList_GetItem(result, i); if (EagerTensor_CheckExact(item)) { const Tensor* tensor = nullptr; - s = ExtractTensorFromEagerTensor(item, &tensor); + s = ExtractTensorFromEagerTensor(item, call->device, &tensor); if (s.ok()) t = *tensor; } else { s = errors::FailedPrecondition( @@ -245,7 +275,7 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) { DCHECK(call->eager); if (result != Py_None) { const Tensor* t = nullptr; - s = ExtractTensorFromEagerTensor(result, &t); + s = ExtractTensorFromEagerTensor(result, call->device, &t); if (s.ok()) call->out.push_back(*t); } } else if (PyArray_Check(result)) { @@ -449,13 +479,11 @@ class PyFuncOp : public OpKernel { explicit PyFuncOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("token", &token_)); eager_ = type_string() == "EagerPyFunc"; - gpu_ = ctx->device_type().type_string() == DEVICE_GPU; } void Compute(OpKernelContext* ctx) override { PyCall call; call.token = token_; - call.gpu = gpu_; call.eager = eager_; if (call.eager) { // Eager's C API uses `Device`, whereas `OpKernelContext` stores a @@ -464,6 +492,7 @@ class PyFuncOp : public OpKernel { if (call.device == nullptr) { ctx->CtxFailureWithWarning( errors::Internal("Unrecognized device class")); + return; } } @@ -508,9 +537,6 @@ class PyFuncOp : public OpKernel { private: string token_; - // True if and only if this op has been placed on a GPU. - bool gpu_; - // True if and only if this op should execute the python function eagerly, // i.e., if and only if the eager attribute is set. bool eager_; |