diff options
author | 2017-05-10 16:03:02 -0700 | |
---|---|---|
committer | 2017-05-11 10:19:41 -0700 | |
commit | e09b0b6ebf6bfea5157a7e8b65b9070e56c1c5f2 (patch) | |
tree | f76a50aafbcab04acbde60576b80ec94b05348d3 | |
parent | 770a27161bf9e73860443f62ab539833d39d61b4 (diff) |
Move crossed_column to core.
PiperOrigin-RevId: 155687697
-rw-r--r-- | tensorflow/python/feature_column/feature_column.py | 187 | ||||
-rw-r--r-- | tensorflow/python/feature_column/feature_column_test.py | 256 |
2 files changed, 443 insertions, 0 deletions
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 8ce3878d8c..391f3123ac 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -607,6 +607,18 @@ def bucketized_column(source_column, boundaries): dense_tensor = make_input_layer(features, columns) ``` + `bucketized_column` can also be crossed with another categorical column using + `crossed_column`: + ```python + price = numeric_column('price') + # bucketized_column converts numerical feature to a categorical one. + bucketized_price = bucketized_column(price, boundaries=[...]) + # 'keywords' is a string feature. + price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50K) + all_feature_columns = [price_x_keywords, ...] + linear_prediction = make_linear_model(features, all_feature_columns) + ``` + Args: source_column: A one-dimensional dense column which is generated with `numeric_column`. @@ -1036,6 +1048,107 @@ def weighted_categorical_column( dtype=dtype) +def crossed_column(keys, hash_bucket_size, hash_key=None): + """Returns a column for performing crosses of categorical features. + + Crossed features will be hashed according to `hash_bucket_size`. 
Conceptually, + the transformation can be thought of as: + Hash(cartesian product of features) % `hash_bucket_size` + + For example, if the input features are: + * SparseTensor referred by first key: shape = [2, 2] + [0, 0]: "a" + [1, 0]: "b" + [1, 1]: "c" + + * SparseTensor referred by second key: shape = [2, 1] + [0, 0]: "d" + [1, 0]: "e" + + then crossed feature will look like: + shape = [2, 2] + [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size + [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size + [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size + + Here is an example to create a linear model with crosses of string features: + ```python + keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50K) + all_feature_columns = [keywords_x_doc_terms, ...] + linear_prediction = make_linear_model(features, all_feature_columns) + ``` + + You could also use vocabulary lookup before crossing: + ```python + keywords = categorical_column_with_vocabulary_file( + 'keywords', '/path/to/vocabulary/file', vocabulary_size=1K) + keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50K) + all_feature_columns = [keywords_x_doc_terms, ...] + linear_prediction = make_linear_model(features, all_feature_columns) + ``` + + If an input feature is of numeric type, you can use + `categorical_column_with_identity`, or `bucketized_column`, as in the example: + ```python + # vertical_id is an integer categorical feature. + vertical_id = categorical_column_with_identity('vertical_id', 10K) + price = numeric_column('price') + # bucketized_column converts numerical feature to a categorical one. + bucketized_price = bucketized_column(price, boundaries=[...]) + vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K) + all_feature_columns = [vertical_id_x_price, ...] 
+ linear_prediction = make_linear_model(features, all_feature_columns) + ``` + + To use crossed column in DNN model, you need to add it in an embedding column + as in this example: + ```python + vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K) + vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10) + dense_tensor = make_input_layer(features, [vertical_id_x_price_embedded, ...]) + ``` + + Args: + keys: An iterable identifying the features to be crossed. Each element can + be either: + * string: Will use the corresponding feature which must be of string type. + * `_CategoricalColumn`: Will use the transformed tensor produced by this + column. Does not support hashed categorical column. + hash_bucket_size: An int > 1. The number of buckets. + hash_key: Specify the hash_key that will be used by the `FingerprintCat64` + function to combine the crosses fingerprints on SparseCrossOp (optional). + + Returns: + A `_CrossedColumn`. + + Raises: + ValueError: If `len(keys) < 2`. + ValueError: If any of the keys is neither a string nor `_CategoricalColumn`. + ValueError: If any of the keys is `_HashedCategoricalColumn`. + ValueError: If `hash_bucket_size < 1`. + """ + if not hash_bucket_size or hash_bucket_size < 1: + raise ValueError('hash_bucket_size must be > 1. ' + 'hash_bucket_size: {}'.format(hash_bucket_size)) + if not keys or len(keys) < 2: + raise ValueError( + 'keys must be a list with length > 1. Given: {}'.format(keys)) + for key in keys: + if (not isinstance(key, six.string_types) and + not isinstance(key, _CategoricalColumn)): + raise ValueError( + 'Unsupported key type. All keys must be either string, or ' + 'categorical column except _HashedCategoricalColumn. ' + 'Given: {}'.format(key)) + if isinstance(key, _HashedCategoricalColumn): + raise ValueError( + '_HashedCategoricalColumn is not supported. Instead, use the feature ' + 'name as a string. 
Given: {}'.format(key)) + return _CrossedColumn( + keys=tuple(keys), hash_bucket_size=hash_bucket_size, + hash_key=hash_key) + + class _FeatureColumn(object): """Represents a feature column abstraction. @@ -1969,6 +2082,80 @@ class _WeightedCategoricalColumn( return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1]) +class _CrossedColumn( + _CategoricalColumn, + collections.namedtuple('_CrossedColumn', + ['keys', 'hash_bucket_size', 'hash_key'])): + """See `crossed_column`.""" + + @property + def name(self): + feature_names = [] + for key in _collect_leaf_level_keys(self): + if isinstance(key, _FeatureColumn): + feature_names.append(key.name) + else: # key must be a string + feature_names.append(key) + return '_X_'.join(sorted(feature_names)) + + @property + def _parse_example_config(self): + config = {} + for key in self.keys: + if isinstance(key, _FeatureColumn): + config.update(key._parse_example_config) # pylint: disable=protected-access + else: # key must be a string + config.update({key: parsing_ops.VarLenFeature(dtypes.string)}) + return config + + def _transform_feature(self, inputs): + feature_tensors = [] + for key in _collect_leaf_level_keys(self): + if isinstance(key, six.string_types): + feature_tensors.append(inputs.get(key)) + elif isinstance(key, _CategoricalColumn): + ids_and_weights = key._get_sparse_tensors(inputs) # pylint: disable=protected-access + if ids_and_weights.weight_tensor is not None: + raise ValueError( + 'crossed_column does not support weight_tensor, but the given ' + 'column populates weight_tensor. ' + 'Given column: {}'.format(key.name)) + feature_tensors.append(ids_and_weights.id_tensor) + else: + raise ValueError('Unsupported column type. 
Given: {}'.format(key)) + return sparse_ops._sparse_cross_hashed( # pylint: disable=protected-access + inputs=feature_tensors, + num_buckets=self.hash_bucket_size, + hash_key=self.hash_key) + + @property + def _num_buckets(self): + """Returns number of buckets in this sparse feature.""" + return self.hash_bucket_size + + def _get_sparse_tensors(self, inputs, weight_collections=None, + trainable=None): + return _CategoricalColumn.IdWeightPair(inputs.get(self), None) + + +def _collect_leaf_level_keys(cross): + """Collects base keys by expanding all nested crosses. + + Args: + cross: A `_CrossedColumn`. + + Returns: + A list of strings or `_CategoricalColumn` instances. + """ + leaf_level_keys = [] + for k in cross.keys: + if isinstance(k, _CrossedColumn): + leaf_level_keys.extend(_collect_leaf_level_keys(k)) + else: + leaf_level_keys.append(k) + return leaf_level_keys + + # TODO(zakaria): Move this to embedding_ops and make it public. def _safe_embedding_lookup_sparse(embedding_weights, sparse_ids, diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py index 388e8a2d01..b09c01d266 100644 --- a/tensorflow/python/feature_column/feature_column_test.py +++ b/tensorflow/python/feature_column/feature_column_test.py @@ -735,6 +735,262 @@ class HashedCategoricalColumnTest(test.TestCase): self.assertAllClose(((4.,), (6.,)), predictions.eval()) +class CrossedColumnTest(test.TestCase): + + def test_keys_empty(self): + with self.assertRaisesRegexp( + ValueError, 'keys must be a list with length > 1'): + fc.crossed_column([], 10) + + def test_keys_length_one(self): + with self.assertRaisesRegexp( + ValueError, 'keys must be a list with length > 1'): + fc.crossed_column(['a'], 10) + + def test_key_type_unsupported(self): + with self.assertRaisesRegexp(ValueError, 'Unsupported key type'): + fc.crossed_column(['a', fc.numeric_column('c')], 10) + + with self.assertRaisesRegexp( + ValueError, 
'_HashedCategoricalColumn is not supported'): + fc.crossed_column( + ['a', fc.categorical_column_with_hash_bucket('c', 10)], 10) + + def test_hash_bucket_size_negative(self): + with self.assertRaisesRegexp( + ValueError, 'hash_bucket_size must be > 1'): + fc.crossed_column(['a', 'c'], -1) + + def test_hash_bucket_size_zero(self): + with self.assertRaisesRegexp( + ValueError, 'hash_bucket_size must be > 1'): + fc.crossed_column(['a', 'c'], 0) + + def test_hash_bucket_size_none(self): + with self.assertRaisesRegexp( + ValueError, 'hash_bucket_size must be > 1'): + fc.crossed_column(['a', 'c'], None) + + def test_name(self): + a = fc.numeric_column('a', dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed1 = fc.crossed_column(['d1', 'd2'], 10) + + crossed2 = fc.crossed_column([b, 'c', crossed1], 10) + self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name) + + def test_name_ordered_alphabetically(self): + """Tests that the name does not depend on the order of given columns.""" + a = fc.numeric_column('a', dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed1 = fc.crossed_column(['d1', 'd2'], 10) + + crossed2 = fc.crossed_column([crossed1, 'c', b], 10) + self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name) + + def test_name_leaf_keys_ordered_alphabetically(self): + """Tests that the name does not depend on the order of given columns.""" + a = fc.numeric_column('a', dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed1 = fc.crossed_column(['d2', 'c'], 10) + + crossed2 = fc.crossed_column([crossed1, 'd1', b], 10) + self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name) + + def test_parse_config(self): + a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed = fc.crossed_column([b, 'c'], 10) + self.assertEqual({ + 'a': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32), + 'c': 
parsing_ops.VarLenFeature(dtypes.string), + }, crossed._parse_example_config) + + def test_num_buckets(self): + a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed = fc.crossed_column([b, 'c'], 15) + self.assertEqual(15, crossed._num_buckets) + + def test_deep_copy(self): + a = fc.numeric_column('a', dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed1 = fc.crossed_column(['d1', 'd2'], 10) + crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5) + crossed2_copy = copy.deepcopy(crossed2) + self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2_copy.name,) + self.assertEqual(15, crossed2_copy.hash_bucket_size) + self.assertEqual(5, crossed2_copy.hash_key) + + def test_parse_example(self): + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 50]) + price_cross_wire = fc.crossed_column([bucketized_price, 'wire'], 10) + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'price': + feature_pb2.Feature(float_list=feature_pb2.FloatList( + value=[20., 110.])), + 'wire': + feature_pb2.Feature(bytes_list=feature_pb2.BytesList( + value=[b'omar', b'stringer'])), + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=price_cross_wire._parse_example_config) + self.assertIn('price', features) + self.assertIn('wire', features) + with self.test_session(): + self.assertAllEqual([[20., 110.]], features['price'].eval()) + wire_sparse = features['wire'] + self.assertAllEqual([[0, 0], [0, 1]], wire_sparse.indices.eval()) + # Use byte constants to pass the open-source test. 
+ self.assertAllEqual([b'omar', b'stringer'], wire_sparse.values.eval()) + self.assertAllEqual([1, 2], wire_sparse.dense_shape.eval()) + + def test_get_sparse_tensors(self): + a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,)) + b = fc.bucketized_column(a, boundaries=(0, 1)) + crossed1 = fc.crossed_column(['d1', 'd2'], 10) + crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5) + with ops.Graph().as_default(): + builder = fc._LazyBuilder({ + 'a': constant_op.constant(((-1., .5), (.5, 1.))), + 'c': sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['cA', 'cB', 'cC'], + dense_shape=(2, 2)), + 'd1': sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['d1A', 'd1B', 'd1C'], + dense_shape=(2, 2)), + 'd2': sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['d2A', 'd2B', 'd2C'], + dense_shape=(2, 2)), + }) + id_weight_pair = crossed2._get_sparse_tensors(builder) + with _initialized_session(): + id_tensor_eval = id_weight_pair.id_tensor.eval() + self.assertAllEqual( + ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), + (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), + (1, 14), (1, 15)), + id_tensor_eval.indices) + # Check exact hashed output. If hashing changes this test will break. + # All values are within [0, hash_bucket_size). 
+ expected_values = ( + 6, 14, 0, 13, 8, 8, 10, 12, 2, 0, 1, 9, 8, 12, 2, 0, 10, 11) + self.assertAllEqual(expected_values, id_tensor_eval.values) + self.assertAllEqual((2, 16), id_tensor_eval.dense_shape) + + def test_get_sparse_tensors_simple(self): + """Same as test_get_sparse_tensors, but with simpler values.""" + a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,)) + b = fc.bucketized_column(a, boundaries=(0, 1)) + crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5) + with ops.Graph().as_default(): + builder = fc._LazyBuilder({ + 'a': constant_op.constant(((-1., .5), (.5, 1.))), + 'c': sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['cA', 'cB', 'cC'], + dense_shape=(2, 2)), + }) + id_weight_pair = crossed._get_sparse_tensors(builder) + with _initialized_session(): + id_tensor_eval = id_weight_pair.id_tensor.eval() + self.assertAllEqual( + ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3)), + id_tensor_eval.indices) + # Check exact hashed output. If hashing changes this test will break. + # All values are within [0, hash_bucket_size). + expected_values = (1, 0, 1, 3, 4, 2) + self.assertAllEqual(expected_values, id_tensor_eval.values) + self.assertAllEqual((2, 4), id_tensor_eval.dense_shape) + + def test_make_linear_model(self): + """Tests make_linear_model. + + Uses data from test_get_sparse_tensors_simple. 
+ """ + a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,)) + b = fc.bucketized_column(a, boundaries=(0, 1)) + crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5) + with ops.Graph().as_default(): + predictions = fc.make_linear_model({ + 'a': constant_op.constant(((-1., .5), (.5, 1.))), + 'c': sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['cA', 'cB', 'cC'], + dense_shape=(2, 2)), + }, (crossed,)) + bias = get_linear_model_bias() + crossed_var = get_linear_model_column_var(crossed) + with _initialized_session() as sess: + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose( + ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,)))) + # Expected ids after cross = (1, 0, 1, 3, 4, 2) + self.assertAllClose(((3.,), (14.,)), predictions.eval()) + sess.run(bias.assign((.1,))) + self.assertAllClose(((3.1,), (14.1,)), predictions.eval()) + + def test_make_linear_model_with_weights(self): + class _TestColumnWithWeights(fc._CategoricalColumn): + """Produces sparse IDs and sparse weights.""" + + @property + def name(self): + return 'test_column' + + @property + def _parse_example_config(self): + return { + self.name: parsing_ops.VarLenFeature(dtypes.int32), + '{}_weights'.format(self.name): parsing_ops.VarLenFeature( + dtypes.float32), + } + + @property + def _num_buckets(self): + return 5 + + def _transform_feature(self, inputs): + return (inputs.get(self.name), + inputs.get('{}_weights'.format(self.name))) + + def _get_sparse_tensors(self, inputs, weight_collections=None, + trainable=None): + """Populates both id_tensor and weight_tensor.""" + ids_and_weights = inputs.get(self) + return fc._CategoricalColumn.IdWeightPair( + id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1]) + + t = _TestColumnWithWeights() + crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5) + 
with ops.Graph().as_default(): + with self.assertRaisesRegexp( + ValueError, + 'crossed_column does not support weight_tensor.*{}'.format(t.name)): + fc.make_linear_model({ + t.name: sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=[0, 1, 2], + dense_shape=(2, 2)), + '{}_weights'.format(t.name): sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=[1., 10., 2.], + dense_shape=(2, 2)), + 'c': sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['cA', 'cB', 'cC'], + dense_shape=(2, 2)), + }, (crossed,)) + + def get_linear_model_bias(): with variable_scope.variable_scope('make_linear_model', reuse=True): return variable_scope.get_variable('bias_weights') |