diff options
-rw-r--r-- | tensorflow/python/estimator/inputs/pandas_io.py | 41 | ||||
-rw-r--r-- | tensorflow/python/estimator/inputs/pandas_io_test.py | 70 |
2 files changed, 107 insertions, 4 deletions
diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py index 57f8e5fd6a..616bcb410f 100644 --- a/tensorflow/python/estimator/inputs/pandas_io.py +++ b/tensorflow/python/estimator/inputs/pandas_io.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import six +import uuid import numpy as np from tensorflow.python.estimator.inputs.queues import feeding_functions @@ -35,6 +37,22 @@ except ImportError: HAS_PANDAS = False +def _get_unique_target_key(features, target_column_name): + """Returns a key that does not exist in the input DataFrame `features`. + + Args: + features: DataFrame + target_column_name: Name of the target column as a `str` + + Returns: + A unique key that can be used to insert the target into + features. + """ + if target_column_name in features: + target_column_name += '_' + str(uuid.uuid4()) + return target_column_name + + @estimator_export('estimator.inputs.pandas_input_fn') def pandas_input_fn(x, y=None, @@ -50,7 +68,7 @@ def pandas_input_fn(x, Args: x: pandas `DataFrame` object. - y: pandas `Series` object. `None` if absent. + y: pandas `Series` object or `DataFrame`. `None` if absent. batch_size: int, size of batches to return. num_epochs: int, number of epochs to iterate over data. If not `None`, read attempts that would exceed this value will raise `OutOfRangeError`. @@ -60,7 +78,8 @@ def pandas_input_fn(x, num_threads: Integer, number of threads used for reading and enqueueing. In order to have predicted and repeatable order of reading and enqueueing, such as in prediction and evaluation mode, `num_threads` should be 1. - target_column: str, name to give the target column `y`. + target_column: str, name to give the target column `y`. This parameter + is not used when `y` is a `DataFrame`. Returns: Function, that has signature of ()->(dict of `features`, `target`) @@ -79,6 +98,9 @@ def pandas_input_fn(x, '(it is recommended to set it as True for training); ' 'got {}'.format(shuffle)) + if not isinstance(target_column, six.string_types): + raise TypeError('target_column must be a string type') + x = x.copy() if y is not None: if target_column in x: @@ -88,7 +110,13 @@ def pandas_input_fn(x, if not np.array_equal(x.index, y.index): raise ValueError('Index for x and y are mismatched.\nIndex for x: %s\n' 'Index for y: %s\n' % (x.index, y.index)) - x[target_column] = y + if isinstance(y, pd.DataFrame): + y_columns = [(column, _get_unique_target_key(x, column)) + for column in list(y)] + target_column = [v for _, v in y_columns] + x[target_column] = y + else: + x[target_column] = y # TODO(mdan): These are memory copies. We probably don't need 4x slack space. # The sizes below are consistent with what I've seen elsewhere. @@ -118,7 +146,12 @@ def pandas_input_fn(x, features = features[1:] features = dict(zip(list(x.columns), features)) if y is not None: - target = features.pop(target_column) + if isinstance(target_column, list): + keys = [k for k, _ in y_columns] + values = [features.pop(column) for column in target_column] + target = {k: v for k, v in zip(keys, values)} + else: + target = features.pop(target_column) return features, target return features return input_fn diff --git a/tensorflow/python/estimator/inputs/pandas_io_test.py b/tensorflow/python/estimator/inputs/pandas_io_test.py index dcecf6dd61..6f13bc95d2 100644 --- a/tensorflow/python/estimator/inputs/pandas_io_test.py +++ b/tensorflow/python/estimator/inputs/pandas_io_test.py @@ -47,6 +47,16 @@ class PandasIoTest(test.TestCase): y = pd.Series(np.arange(-32, -28), index=index) return x, y + def makeTestDataFrameWithYAsDataFrame(self): + index = np.arange(100, 104) + a = np.arange(4) + b = np.arange(32, 36) + a_label = np.arange(10, 14) + b_label = np.arange(50, 54) + x = pd.DataFrame({'a': a, 'b': b}, index=index) + y = pd.DataFrame({'a_target': a_label, 'b_target': b_label}, index=index) + return x, y + def callInputFnOnce(self, input_fn, session): results = input_fn() coord = coordinator.Coordinator() @@ -65,6 +75,19 @@ class PandasIoTest(test.TestCase): pandas_io.pandas_input_fn( x, y_noindex, batch_size=2, shuffle=False, num_epochs=1) + def testPandasInputFn_RaisesWhenTargetColumnIsAList(self): + if not HAS_PANDAS: + return + + x, y = self.makeTestDataFrame() + + with self.assertRaisesRegexp(TypeError, + 'target_column must be a string type'): + pandas_io.pandas_input_fn(x, y, batch_size=2, + shuffle=False, + num_epochs=1, + target_column=['one', 'two']) + def testPandasInputFn_NonBoolShuffle(self): if not HAS_PANDAS: return @@ -90,6 +113,53 @@ class PandasIoTest(test.TestCase): self.assertAllEqual(features['b'], [32, 33]) self.assertAllEqual(target, [-32, -31]) + def testPandasInputFnWhenYIsDataFrame_ProducesExpectedOutput(self): + if not HAS_PANDAS: + return + with self.test_session() as session: + x, y = self.makeTestDataFrameWithYAsDataFrame() + input_fn = pandas_io.pandas_input_fn( + x, y, batch_size=2, shuffle=False, num_epochs=1) + + features, targets = self.callInputFnOnce(input_fn, session) + + self.assertAllEqual(features['a'], [0, 1]) + self.assertAllEqual(features['b'], [32, 33]) + self.assertAllEqual(targets['a_target'], [10, 11]) + self.assertAllEqual(targets['b_target'], [50, 51]) + + def testPandasInputFnYIsDataFrame_HandlesOverlappingColumns(self): + if not HAS_PANDAS: + return + with self.test_session() as session: + x, y = self.makeTestDataFrameWithYAsDataFrame() + y = y.rename(columns={'a_target': 'a', 'b_target': 'b'}) + input_fn = pandas_io.pandas_input_fn( + x, y, batch_size=2, shuffle=False, num_epochs=1) + + features, targets = self.callInputFnOnce(input_fn, session) + + self.assertAllEqual(features['a'], [0, 1]) + self.assertAllEqual(features['b'], [32, 33]) + self.assertAllEqual(targets['a'], [10, 11]) + self.assertAllEqual(targets['b'], [50, 51]) + + def testPandasInputFnYIsDataFrame_HandlesOverlappingColumnsInTargets(self): + if not HAS_PANDAS: + return + with self.test_session() as session: + x, y = self.makeTestDataFrameWithYAsDataFrame() + y = y.rename(columns={'a_target': 'a', 'b_target': 'a_n'}) + input_fn = pandas_io.pandas_input_fn( + x, y, batch_size=2, shuffle=False, num_epochs=1) + + features, targets = self.callInputFnOnce(input_fn, session) + + self.assertAllEqual(features['a'], [0, 1]) + self.assertAllEqual(features['b'], [32, 33]) + self.assertAllEqual(targets['a'], [10, 11]) + self.assertAllEqual(targets['a_n'], [50, 51]) + def testPandasInputFn_ProducesOutputsForLargeBatchAndMultipleEpochs(self): if not HAS_PANDAS: return |