author    A. Unique TensorFlower <gardener@tensorflow.org>  2017-05-10 16:03:02 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>   2017-05-11 10:19:41 -0700
commit    e09b0b6ebf6bfea5157a7e8b65b9070e56c1c5f2 (patch)
tree      f76a50aafbcab04acbde60576b80ec94b05348d3
parent    770a27161bf9e73860443f62ab539833d39d61b4 (diff)
Move crossed_column to core.
PiperOrigin-RevId: 155687697
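
Conceptually, the new column maps each example's cartesian product of feature
values through a hash into `hash_bucket_size` buckets. A minimal sketch of that
transformation in plain Python, where `toy_crossed_ids` and its CRC32 hash are
illustrative stand-ins rather than the `FingerprintCat64`/`SparseCrossOp`
machinery this commit actually uses:

```python
# Toy model of: Hash(cartesian product of features) % hash_bucket_size.
# CRC32 is a stand-in hash; the real op chains 64-bit fingerprints.
import itertools
import zlib


def toy_crossed_ids(per_feature_values, hash_bucket_size):
  """Returns hashed bucket ids for one example's crossed values."""
  ids = []
  for combo in itertools.product(*per_feature_values):
    fingerprint = zlib.crc32('_X_'.join(combo).encode('utf-8'))
    ids.append(fingerprint % hash_bucket_size)
  return ids


# Mirrors the docstring example in the diff: example 0 crosses "a" with
# "d"; example 1 crosses each of "b", "c" with "e".
print(toy_crossed_ids([['a'], ['d']], 10))       # 1 crossed id
print(toy_crossed_ids([['b', 'c'], ['e']], 10))  # 2 crossed ids
```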
-rw-r--r--  tensorflow/python/feature_column/feature_column.py       187
-rw-r--r--  tensorflow/python/feature_column/feature_column_test.py  256
2 files changed, 443 insertions(+), 0 deletions(-)
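
One detail worth noting before the diff: a crossed column's `name` is
order-independent because leaf keys are flattened and sorted before being
joined with `'_X_'`. A sketch of that rule with a hypothetical `crossed_name`
helper (not part of the change itself):

```python
# Hypothetical helper illustrating the naming rule in the `name`
# property below: flatten leaf names, sort them, join with '_X_'.
def crossed_name(leaf_feature_names):
  return '_X_'.join(sorted(leaf_feature_names))


# Matches test_name: bucketized column 'a_bucketized' crossed with 'c'
# and a nested ('d1', 'd2') cross, in any order, yields the same name.
print(crossed_name(['c', 'a_bucketized', 'd2', 'd1']))
# a_bucketized_X_c_X_d1_X_d2
```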
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 8ce3878d8c..391f3123ac 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -607,6 +607,18 @@ def bucketized_column(source_column, boundaries):
dense_tensor = make_input_layer(features, columns)
```
+ `bucketized_column` can also be crossed with another categorical column using
+ `crossed_column`:
+ ```python
+ price = numeric_column('price')
+  # bucketized_column converts a numerical feature to a categorical one.
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  # 'keywords' is a string feature.
+  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50000)
+ all_feature_columns = [price_x_keywords, ...]
+ linear_prediction = make_linear_model(features, all_feature_columns)
+ ```
+
Args:
source_column: A one-dimensional dense column which is generated with
`numeric_column`.
@@ -1036,6 +1048,107 @@ def weighted_categorical_column(
dtype=dtype)
+def crossed_column(keys, hash_bucket_size, hash_key=None):
+ """Returns a column for performing crosses of categorical features.
+
+ Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
+ the transformation can be thought of as:
+ Hash(cartesian product of features) % `hash_bucket_size`
+
+ For example, if the input features are:
+  * SparseTensor referred to by the first key: shape = [2, 2]
+ [0, 0]: "a"
+ [1, 0]: "b"
+ [1, 1]: "c"
+
+  * SparseTensor referred to by the second key: shape = [2, 1]
+ [0, 0]: "d"
+ [1, 0]: "e"
+
+  then the crossed feature will look like:
+ shape = [2, 2]
+ [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
+ [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
+ [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
+
+ Here is an example to create a linear model with crosses of string features:
+ ```python
+  keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50000)
+ all_feature_columns = [keywords_x_doc_terms, ...]
+ linear_prediction = make_linear_model(features, all_feature_columns)
+ ```
+
+ You could also use vocabulary lookup before crossing:
+ ```python
+ keywords = categorical_column_with_vocabulary_file(
+      'keywords', '/path/to/vocabulary/file', vocabulary_size=1000)
+  keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50000)
+ all_feature_columns = [keywords_x_doc_terms, ...]
+ linear_prediction = make_linear_model(features, all_feature_columns)
+ ```
+
+  If an input feature is of numeric type, you can use
+  `categorical_column_with_identity` or `bucketized_column`, as in the example:
+ ```python
+ # vertical_id is an integer categorical feature.
+  vertical_id = categorical_column_with_identity('vertical_id', 10000)
+ price = numeric_column('price')
+  # bucketized_column converts a numerical feature to a categorical one.
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
+ all_feature_columns = [vertical_id_x_price, ...]
+ linear_prediction = make_linear_model(features, all_feature_columns)
+ ```
+
+  To use a crossed column in a DNN model, you need to wrap it in an embedding
+  column, as in this example:
+ ```python
+  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
+ vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
+ dense_tensor = make_input_layer(features, [vertical_id_x_price_embedded, ...])
+ ```
+
+ Args:
+ keys: An iterable identifying the features to be crossed. Each element can
+ be either:
+ * string: Will use the corresponding feature which must be of string type.
+ * `_CategoricalColumn`: Will use the transformed tensor produced by this
+ column. Does not support hashed categorical column.
+ hash_bucket_size: An int > 1. The number of buckets.
+    hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+      function to combine cross fingerprints in `SparseCrossOp` (optional).
+
+ Returns:
+ A `_CrossedColumn`.
+
+ Raises:
+ ValueError: If `len(keys) < 2`.
+ ValueError: If any of the keys is neither a string nor `_CategoricalColumn`.
+ ValueError: If any of the keys is `_HashedCategoricalColumn`.
+    ValueError: If `hash_bucket_size < 2`.
+ """
+  if not hash_bucket_size or hash_bucket_size < 2:
+ raise ValueError('hash_bucket_size must be > 1. '
+ 'hash_bucket_size: {}'.format(hash_bucket_size))
+ if not keys or len(keys) < 2:
+ raise ValueError(
+ 'keys must be a list with length > 1. Given: {}'.format(keys))
+ for key in keys:
+ if (not isinstance(key, six.string_types) and
+ not isinstance(key, _CategoricalColumn)):
+ raise ValueError(
+          'Unsupported key type. All keys must be either a string or a '
+          'categorical column other than _HashedCategoricalColumn. '
+ 'Given: {}'.format(key))
+ if isinstance(key, _HashedCategoricalColumn):
+ raise ValueError(
+ '_HashedCategoricalColumn is not supported. Instead, use the feature '
+ 'name as a string. Given: {}'.format(key))
+ return _CrossedColumn(
+ keys=tuple(keys), hash_bucket_size=hash_bucket_size,
+ hash_key=hash_key)
+
+
class _FeatureColumn(object):
"""Represents a feature column abstraction.
@@ -1969,6 +2082,80 @@ class _WeightedCategoricalColumn(
return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
+class _CrossedColumn(
+ _CategoricalColumn,
+ collections.namedtuple('_CrossedColumn',
+ ['keys', 'hash_bucket_size', 'hash_key'])):
+ """See `crossed_column`."""
+
+ @property
+ def name(self):
+ feature_names = []
+ for key in _collect_leaf_level_keys(self):
+ if isinstance(key, _FeatureColumn):
+ feature_names.append(key.name)
+ else: # key must be a string
+ feature_names.append(key)
+ return '_X_'.join(sorted(feature_names))
+
+ @property
+ def _parse_example_config(self):
+ config = {}
+ for key in self.keys:
+ if isinstance(key, _FeatureColumn):
+ config.update(key._parse_example_config) # pylint: disable=protected-access
+ else: # key must be a string
+ config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
+ return config
+
+ def _transform_feature(self, inputs):
+ feature_tensors = []
+ for key in _collect_leaf_level_keys(self):
+ if isinstance(key, six.string_types):
+ feature_tensors.append(inputs.get(key))
+ elif isinstance(key, _CategoricalColumn):
+ ids_and_weights = key._get_sparse_tensors(inputs) # pylint: disable=protected-access
+ if ids_and_weights.weight_tensor is not None:
+ raise ValueError(
+ 'crossed_column does not support weight_tensor, but the given '
+ 'column populates weight_tensor. '
+ 'Given column: {}'.format(key.name))
+ feature_tensors.append(ids_and_weights.id_tensor)
+ else:
+ raise ValueError('Unsupported column type. Given: {}'.format(key))
+ return sparse_ops._sparse_cross_hashed( # pylint: disable=protected-access
+ inputs=feature_tensors,
+ num_buckets=self.hash_bucket_size,
+ hash_key=self.hash_key)
+
+ @property
+ def _num_buckets(self):
+ """Returns number of buckets in this sparse feature."""
+ return self.hash_bucket_size
+
+ def _get_sparse_tensors(self, inputs, weight_collections=None,
+ trainable=None):
+ return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+
+def _collect_leaf_level_keys(cross):
+ """Collects base keys by expanding all nested crosses.
+
+ Args:
+ cross: A `_CrossedColumn`.
+
+ Returns:
+ A list of strings or `_CategoricalColumn` instances.
+ """
+ leaf_level_keys = []
+ for k in cross.keys:
+ if isinstance(k, _CrossedColumn):
+ leaf_level_keys.extend(_collect_leaf_level_keys(k))
+ else:
+ leaf_level_keys.append(k)
+ return leaf_level_keys
+
+
# TODO(zakaria): Move this to embedding_ops and make it public.
def _safe_embedding_lookup_sparse(embedding_weights,
sparse_ids,
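
The `_collect_leaf_level_keys` helper above is what lets nested crosses behave
like a single flat cross, e.g. `crossed_column([crossed_column(['d1', 'd2'],
...), 'c'], ...)` crosses three leaf keys. A standalone sketch of the same
recursion, with a plain namedtuple standing in for `_CrossedColumn`
(illustrative only):

```python
import collections

# Stand-in for _CrossedColumn; only the `keys` attribute matters here.
Cross = collections.namedtuple('Cross', ['keys'])


def collect_leaf_level_keys(cross):
  """Expands nested crosses into a flat list of base keys."""
  leaf_level_keys = []
  for k in cross.keys:
    if isinstance(k, Cross):
      leaf_level_keys.extend(collect_leaf_level_keys(k))  # recurse
    else:
      leaf_level_keys.append(k)  # a string (or column) leaf
  return leaf_level_keys


inner = Cross(keys=('d1', 'd2'))
outer = Cross(keys=(inner, 'c'))
print(collect_leaf_level_keys(outer))  # ['d1', 'd2', 'c']
```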
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 388e8a2d01..b09c01d266 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -735,6 +735,262 @@ class HashedCategoricalColumnTest(test.TestCase):
self.assertAllClose(((4.,), (6.,)), predictions.eval())
+class CrossedColumnTest(test.TestCase):
+
+ def test_keys_empty(self):
+ with self.assertRaisesRegexp(
+ ValueError, 'keys must be a list with length > 1'):
+ fc.crossed_column([], 10)
+
+ def test_keys_length_one(self):
+ with self.assertRaisesRegexp(
+ ValueError, 'keys must be a list with length > 1'):
+ fc.crossed_column(['a'], 10)
+
+ def test_key_type_unsupported(self):
+ with self.assertRaisesRegexp(ValueError, 'Unsupported key type'):
+ fc.crossed_column(['a', fc.numeric_column('c')], 10)
+
+ with self.assertRaisesRegexp(
+ ValueError, '_HashedCategoricalColumn is not supported'):
+ fc.crossed_column(
+ ['a', fc.categorical_column_with_hash_bucket('c', 10)], 10)
+
+ def test_hash_bucket_size_negative(self):
+ with self.assertRaisesRegexp(
+ ValueError, 'hash_bucket_size must be > 1'):
+ fc.crossed_column(['a', 'c'], -1)
+
+ def test_hash_bucket_size_zero(self):
+ with self.assertRaisesRegexp(
+ ValueError, 'hash_bucket_size must be > 1'):
+ fc.crossed_column(['a', 'c'], 0)
+
+ def test_hash_bucket_size_none(self):
+ with self.assertRaisesRegexp(
+ ValueError, 'hash_bucket_size must be > 1'):
+ fc.crossed_column(['a', 'c'], None)
+
+ def test_name(self):
+ a = fc.numeric_column('a', dtype=dtypes.int32)
+ b = fc.bucketized_column(a, boundaries=[0, 1])
+ crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+
+ crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+ self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
+
+ def test_name_ordered_alphabetically(self):
+ """Tests that the name does not depend on the order of given columns."""
+ a = fc.numeric_column('a', dtype=dtypes.int32)
+ b = fc.bucketized_column(a, boundaries=[0, 1])
+ crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+
+ crossed2 = fc.crossed_column([crossed1, 'c', b], 10)
+ self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
+
+ def test_name_leaf_keys_ordered_alphabetically(self):
+    """Tests that leaf keys, including nested ones, are sorted in the name."""
+ a = fc.numeric_column('a', dtype=dtypes.int32)
+ b = fc.bucketized_column(a, boundaries=[0, 1])
+ crossed1 = fc.crossed_column(['d2', 'c'], 10)
+
+ crossed2 = fc.crossed_column([crossed1, 'd1', b], 10)
+ self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
+
+ def test_parse_config(self):
+ a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
+ b = fc.bucketized_column(a, boundaries=[0, 1])
+ crossed = fc.crossed_column([b, 'c'], 10)
+ self.assertEqual({
+ 'a': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32),
+ 'c': parsing_ops.VarLenFeature(dtypes.string),
+ }, crossed._parse_example_config)
+
+ def test_num_buckets(self):
+ a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
+ b = fc.bucketized_column(a, boundaries=[0, 1])
+ crossed = fc.crossed_column([b, 'c'], 15)
+ self.assertEqual(15, crossed._num_buckets)
+
+ def test_deep_copy(self):
+ a = fc.numeric_column('a', dtype=dtypes.int32)
+ b = fc.bucketized_column(a, boundaries=[0, 1])
+ crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+ crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+ crossed2_copy = copy.deepcopy(crossed2)
+    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2_copy.name)
+ self.assertEqual(15, crossed2_copy.hash_bucket_size)
+ self.assertEqual(5, crossed2_copy.hash_key)
+
+ def test_parse_example(self):
+ price = fc.numeric_column('price', shape=[2])
+ bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+ price_cross_wire = fc.crossed_column([bucketized_price, 'wire'], 10)
+ data = example_pb2.Example(features=feature_pb2.Features(
+ feature={
+ 'price':
+ feature_pb2.Feature(float_list=feature_pb2.FloatList(
+ value=[20., 110.])),
+ 'wire':
+ feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+ value=[b'omar', b'stringer'])),
+ }))
+ features = parsing_ops.parse_example(
+ serialized=[data.SerializeToString()],
+ features=price_cross_wire._parse_example_config)
+ self.assertIn('price', features)
+ self.assertIn('wire', features)
+ with self.test_session():
+ self.assertAllEqual([[20., 110.]], features['price'].eval())
+ wire_sparse = features['wire']
+ self.assertAllEqual([[0, 0], [0, 1]], wire_sparse.indices.eval())
+ # Use byte constants to pass the open-source test.
+ self.assertAllEqual([b'omar', b'stringer'], wire_sparse.values.eval())
+ self.assertAllEqual([1, 2], wire_sparse.dense_shape.eval())
+
+ def test_get_sparse_tensors(self):
+ a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+ b = fc.bucketized_column(a, boundaries=(0, 1))
+ crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+ crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+ with ops.Graph().as_default():
+ builder = fc._LazyBuilder({
+ 'a': constant_op.constant(((-1., .5), (.5, 1.))),
+ 'c': sparse_tensor.SparseTensor(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=['cA', 'cB', 'cC'],
+ dense_shape=(2, 2)),
+ 'd1': sparse_tensor.SparseTensor(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=['d1A', 'd1B', 'd1C'],
+ dense_shape=(2, 2)),
+ 'd2': sparse_tensor.SparseTensor(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=['d2A', 'd2B', 'd2C'],
+ dense_shape=(2, 2)),
+ })
+ id_weight_pair = crossed2._get_sparse_tensors(builder)
+ with _initialized_session():
+ id_tensor_eval = id_weight_pair.id_tensor.eval()
+ self.assertAllEqual(
+ ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
+ (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13),
+ (1, 14), (1, 15)),
+ id_tensor_eval.indices)
+ # Check exact hashed output. If hashing changes this test will break.
+ # All values are within [0, hash_bucket_size).
+ expected_values = (
+ 6, 14, 0, 13, 8, 8, 10, 12, 2, 0, 1, 9, 8, 12, 2, 0, 10, 11)
+ self.assertAllEqual(expected_values, id_tensor_eval.values)
+ self.assertAllEqual((2, 16), id_tensor_eval.dense_shape)
+
+ def test_get_sparse_tensors_simple(self):
+ """Same as test_get_sparse_tensors, but with simpler values."""
+ a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+ b = fc.bucketized_column(a, boundaries=(0, 1))
+ crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+ with ops.Graph().as_default():
+ builder = fc._LazyBuilder({
+ 'a': constant_op.constant(((-1., .5), (.5, 1.))),
+ 'c': sparse_tensor.SparseTensor(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=['cA', 'cB', 'cC'],
+ dense_shape=(2, 2)),
+ })
+ id_weight_pair = crossed._get_sparse_tensors(builder)
+ with _initialized_session():
+ id_tensor_eval = id_weight_pair.id_tensor.eval()
+ self.assertAllEqual(
+ ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3)),
+ id_tensor_eval.indices)
+ # Check exact hashed output. If hashing changes this test will break.
+ # All values are within [0, hash_bucket_size).
+ expected_values = (1, 0, 1, 3, 4, 2)
+ self.assertAllEqual(expected_values, id_tensor_eval.values)
+ self.assertAllEqual((2, 4), id_tensor_eval.dense_shape)
+
+ def test_make_linear_model(self):
+ """Tests make_linear_model.
+
+    Uses data from test_get_sparse_tensors_simple.
+ """
+ a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+ b = fc.bucketized_column(a, boundaries=(0, 1))
+ crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+ with ops.Graph().as_default():
+ predictions = fc.make_linear_model({
+ 'a': constant_op.constant(((-1., .5), (.5, 1.))),
+ 'c': sparse_tensor.SparseTensor(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=['cA', 'cB', 'cC'],
+ dense_shape=(2, 2)),
+ }, (crossed,))
+ bias = get_linear_model_bias()
+ crossed_var = get_linear_model_column_var(crossed)
+ with _initialized_session() as sess:
+ self.assertAllClose((0.,), bias.eval())
+ self.assertAllClose(
+ ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval())
+ self.assertAllClose(((0.,), (0.,)), predictions.eval())
+ sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
+ # Expected ids after cross = (1, 0, 1, 3, 4, 2)
+ self.assertAllClose(((3.,), (14.,)), predictions.eval())
+ sess.run(bias.assign((.1,)))
+ self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+
+ def test_make_linear_model_with_weights(self):
+ class _TestColumnWithWeights(fc._CategoricalColumn):
+ """Produces sparse IDs and sparse weights."""
+
+ @property
+ def name(self):
+ return 'test_column'
+
+ @property
+ def _parse_example_config(self):
+ return {
+ self.name: parsing_ops.VarLenFeature(dtypes.int32),
+ '{}_weights'.format(self.name): parsing_ops.VarLenFeature(
+ dtypes.float32),
+ }
+
+ @property
+ def _num_buckets(self):
+ return 5
+
+ def _transform_feature(self, inputs):
+ return (inputs.get(self.name),
+ inputs.get('{}_weights'.format(self.name)))
+
+ def _get_sparse_tensors(self, inputs, weight_collections=None,
+ trainable=None):
+ """Populates both id_tensor and weight_tensor."""
+ ids_and_weights = inputs.get(self)
+ return fc._CategoricalColumn.IdWeightPair(
+ id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
+
+ t = _TestColumnWithWeights()
+ crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+ with ops.Graph().as_default():
+ with self.assertRaisesRegexp(
+ ValueError,
+ 'crossed_column does not support weight_tensor.*{}'.format(t.name)):
+ fc.make_linear_model({
+ t.name: sparse_tensor.SparseTensor(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=[0, 1, 2],
+ dense_shape=(2, 2)),
+ '{}_weights'.format(t.name): sparse_tensor.SparseTensor(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=[1., 10., 2.],
+ dense_shape=(2, 2)),
+ 'c': sparse_tensor.SparseTensor(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=['cA', 'cB', 'cC'],
+ dense_shape=(2, 2)),
+ }, (crossed,))
+
+
def get_linear_model_bias():
with variable_scope.variable_scope('make_linear_model', reuse=True):
return variable_scope.get_variable('bias_weights')
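
The exact-value assertions in `test_get_sparse_tensors*` deliberately pin down
the current hash implementation; the hash-independent invariants are weaker. A
sketch of those invariants with a hypothetical `check_crossed_output` helper
that is not part of the test file:

```python
def check_crossed_output(values, hash_bucket_size, crosses_per_example):
  """Checks hash-independent properties of a crossed feature's output."""
  # Every crossed id is a bucket index in [0, hash_bucket_size).
  assert all(0 <= v < hash_bucket_size for v in values)
  # Each example contributes one value per element of the cartesian
  # product of its input values, so the totals must match.
  assert len(values) == sum(crosses_per_example)


# From test_get_sparse_tensors_simple: the bucketized column yields 2
# values per example; 'c' yields 1 value for example 0 and 2 for
# example 1, giving 2*1 + 2*2 = 6 crossed values in total.
check_crossed_output(
    values=(1, 0, 1, 3, 4, 2), hash_bucket_size=5,
    crosses_per_example=[2 * 1, 2 * 2])
```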