diff options
author | 2018-07-02 15:06:53 -0700 | |
---|---|---|
committer | 2018-07-02 15:09:56 -0700 | |
commit | 92f13f24d95360806c31983877f556834d155cbb (patch) | |
tree | b0f0b8ac81bddcb0c8bcbdb22af548e059517148 /tensorflow/python/lib | |
parent | 7ab8ccb2c3e88498f8b99cca6c6ab56f2c5fabf1 (diff) |
[eager]: Fix bug in converting pandas objects to Tensors.
Specifically, fix a segmentation fault when converting objects that implement
the Python sequence protocol (i.e., __getitem__, __len__, and __iter__) but
which do not have contiguous keys.
Fixes #20347
However, there are still some discrepancies possible between
tf.convert_to_tensor(o) (or tf.constant(o)) with and without eager execution
enabled. Fixing those is left as a follow-up exercise.
Sample differences:
(1) Empty sequences that have numpy conversions defined.
import pandas as pd
import tensorflow as tf
s = pd.Series([]) # Empty series
t = tf.constant(s)
With eager execution enabled, t.dtype ends up as float32 (as
py_seq_tensor.cc considers empty lists to be float32).
With graph construction, t.dtype ends up as float64 (as
make_tensor_proto() converts 's' to a numpy array and uses its dtype).
(2) Objects that implement __getitem__, __len__, and __iter__, but are not
convertible to numpy arrays (e.g., do not implement __array__):
- With eager execution enabled, these can be converted to a tensor
- For graph construction, the conversion fails.
PiperOrigin-RevId: 203019624
Diffstat (limited to 'tensorflow/python/lib')
-rw-r--r-- | tensorflow/python/lib/core/py_seq_tensor.cc | 39 |
1 files changed, 38 insertions, 1 deletions
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc index 386be35ba2..3b4f12ae31 100644 --- a/tensorflow/python/lib/core/py_seq_tensor.cc +++ b/tensorflow/python/lib/core/py_seq_tensor.cc @@ -88,6 +88,41 @@ bool IsPyDimension(PyObject* obj) { return ret; } +// Sets *elem to a NEW reference to an element in seq on success. +// REQUIRES: PySequence_Check(seq) && PySequence_Length(seq) > 0. +Status SampleElementFromSequence(PyObject* seq, PyObject** elem) { + *elem = PySequence_GetItem(seq, 0); + if (*elem != nullptr) return Status::OK(); + // seq may implement the sequence protocol (i.e., implement __getitem__) + // but may legitimately not have a 0-th element (__getitem__(self, 0) + // raises a KeyError). For example: + // seq = pandas.Series([0, 1, 2], index=[2, 4, 6]) + // + // We don't actually care for the element at key 0, any element will do + // for inferring the element types. All elements are expected to + // have the same type, and this will be validated when converting + // to an EagerTensor. 
+ PyErr_Clear(); + Safe_PyObjectPtr iter(PyObject_GetIter(seq)); + if (PyErr_Occurred()) { + return errors::InvalidArgument("Cannot infer dtype of a ", + Py_TYPE(seq)->tp_name, + " object: ", PyExceptionFetch()); + } + *elem = PyIter_Next(iter.get()); + if (PyErr_Occurred()) { + return errors::InvalidArgument( + "Cannot infer dtype of a ", Py_TYPE(seq)->tp_name, + " object, as iter(<object>).next() failed: ", PyExceptionFetch()); + } + if (*elem == nullptr) { + return errors::InvalidArgument("Cannot infer dtype of a ", + Py_TYPE(seq)->tp_name, + " object since it is an empty sequence"); + } + return Status::OK(); +} + Status InferShapeAndType(PyObject* obj, TensorShape* shape, DataType* dtype) { std::vector<Safe_PyObjectPtr> refs_to_clean; while (true) { @@ -98,7 +133,9 @@ Status InferShapeAndType(PyObject* obj, TensorShape* shape, DataType* dtype) { auto length = PySequence_Length(obj); if (length > 0) { shape->AddDim(length); - obj = PySequence_GetItem(obj, 0); + PyObject* elem = nullptr; + TF_RETURN_IF_ERROR(SampleElementFromSequence(obj, &elem)); + obj = elem; refs_to_clean.push_back(make_safe(obj)); continue; } else if (length == 0) { |