Diffstat (limited to 'tensorflow/python/feature_column/feature_column_v2_test.py')
-rw-r--r-- | tensorflow/python/feature_column/feature_column_v2_test.py | 6583
1 file changed, 6583 insertions, 0 deletions
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py new file mode 100644 index 0000000000..80a9d5d40e --- /dev/null +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -0,0 +1,6583 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for feature_column.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy + +import numpy as np + +from tensorflow.core.example import example_pb2 +from tensorflow.core.example import feature_pb2 +from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.protobuf import rewriter_config_pb2 +from tensorflow.python.client import session +from tensorflow.python.eager import backprop +from tensorflow.python.eager import context +from tensorflow.python.estimator.inputs import numpy_io +from tensorflow.python.feature_column import feature_column as fc_old +from tensorflow.python.feature_column import feature_column_v2 as fc +from tensorflow.python.feature_column.feature_column_v2 import FeatureColumn +from tensorflow.python.feature_column.feature_column_v2 import FeatureTransformationCache +from tensorflow.python.feature_column.feature_column_v2 import InputLayer +from tensorflow.python.feature_column.feature_column_v2 import StateManager +from tensorflow.python.feature_column.feature_column_v2 import _LinearModel +from tensorflow.python.feature_column.feature_column_v2 import _transform_features +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import lookup_ops +from tensorflow.python.ops import parsing_ops +from tensorflow.python.ops import partitioned_variables +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables as variables_lib +from tensorflow.python.platform import test +from tensorflow.python.training import coordinator +from tensorflow.python.training import queue_runner_impl + + +def _initialized_session(config=None): + sess = session.Session(config=config) + sess.run(variables_lib.global_variables_initializer()) + sess.run(lookup_ops.tables_initializer()) + return sess + + +class LazyColumnTest(test.TestCase): + + def test_transformations_called_once(self): + + class TransformCounter(FeatureColumn): + + def __init__(self): + self.num_transform = 0 + + @property + def name(self): + return 'TransformCounter' + + def transform_feature(self, transformation_cache, state_manager): + self.num_transform += 1 # Count transform calls. 
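+      # FeatureTransformationCache memoizes per-column outputs, so this
+      # increment is expected to run only once per column (asserted below).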
+ return transformation_cache.get('a', state_manager) + + @property + def parse_example_spec(self): + pass + + transformation_cache = FeatureTransformationCache( + features={'a': [[2], [3.]]}) + column = TransformCounter() + self.assertEqual(0, column.num_transform) + transformation_cache.get(column, None) + self.assertEqual(1, column.num_transform) + transformation_cache.get(column, None) + self.assertEqual(1, column.num_transform) + + def test_returns_transform_output(self): + + class Transformer(FeatureColumn): + + @property + def name(self): + return 'Transformer' + + def transform_feature(self, transformation_cache, state_manager): + return 'Output' + + @property + def parse_example_spec(self): + pass + + transformation_cache = FeatureTransformationCache( + features={'a': [[2], [3.]]}) + column = Transformer() + self.assertEqual('Output', transformation_cache.get(column, None)) + self.assertEqual('Output', transformation_cache.get(column, None)) + + def test_does_not_pollute_given_features_dict(self): + + class Transformer(FeatureColumn): + + @property + def name(self): + return 'Transformer' + + def transform_feature(self, transformation_cache, state_manager): + return 'Output' + + @property + def parse_example_spec(self): + pass + + features = {'a': [[2], [3.]]} + transformation_cache = FeatureTransformationCache(features=features) + transformation_cache.get(Transformer(), None) + self.assertEqual(['a'], list(features.keys())) + + def test_error_if_feature_is_not_found(self): + transformation_cache = FeatureTransformationCache( + features={'a': [[2], [3.]]}) + with self.assertRaisesRegexp(ValueError, + 'bbb is not in features dictionary'): + transformation_cache.get('bbb', None) + with self.assertRaisesRegexp(ValueError, + 'bbb is not in features dictionary'): + transformation_cache.get(u'bbb', None) + + def test_not_supported_feature_column(self): + + class NotAProperColumn(FeatureColumn): + + @property + def name(self): + return 'NotAProperColumn' + + def transform_feature(self, transformation_cache, state_manager): + # It should return not None. + pass + + @property + def parse_example_spec(self): + pass + + transformation_cache = FeatureTransformationCache( + features={'a': [[2], [3.]]}) + with self.assertRaisesRegexp(ValueError, + 'NotAProperColumn is not supported'): + transformation_cache.get(NotAProperColumn(), None) + + def test_key_should_be_string_or_feature_colum(self): + + class NotAFeatureColumn(object): + pass + + transformation_cache = FeatureTransformationCache( + features={'a': [[2], [3.]]}) + with self.assertRaisesRegexp( + TypeError, '"key" must be either a "str" or "FeatureColumn".'): + transformation_cache.get(NotAFeatureColumn(), None) + + +class NumericColumnTest(test.TestCase): + + def test_defaults(self): + a = fc.numeric_column('aaa') + self.assertEqual('aaa', a.key) + self.assertEqual('aaa', a.name) + self.assertEqual((1,), a.shape) + self.assertIsNone(a.default_value) + self.assertEqual(dtypes.float32, a.dtype) + self.assertIsNone(a.normalizer_fn) + + def test_key_should_be_string(self): + with self.assertRaisesRegexp(ValueError, 'key must be a string.'): + fc.numeric_column(key=('aaa',)) + + def test_shape_saved_as_tuple(self): + a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]]) + self.assertEqual((1, 2), a.shape) + + def test_default_value_saved_as_tuple(self): + a = fc.numeric_column('aaa', default_value=4.) 
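+    # A scalar default_value is stored as a tuple, presumably to keep the
+    # column definition immutable and hashable.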
+ self.assertEqual((4.,), a.default_value) + a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]]) + self.assertEqual(((3., 2.),), a.default_value) + + def test_shape_and_default_value_compatibility(self): + fc.numeric_column('aaa', shape=[2], default_value=[1, 2.]) + with self.assertRaisesRegexp(ValueError, 'The shape of default_value'): + fc.numeric_column('aaa', shape=[2], default_value=[1, 2, 3.]) + fc.numeric_column( + 'aaa', shape=[3, 2], default_value=[[2, 3], [1, 2], [2, 3.]]) + with self.assertRaisesRegexp(ValueError, 'The shape of default_value'): + fc.numeric_column( + 'aaa', shape=[3, 1], default_value=[[2, 3], [1, 2], [2, 3.]]) + with self.assertRaisesRegexp(ValueError, 'The shape of default_value'): + fc.numeric_column( + 'aaa', shape=[3, 3], default_value=[[2, 3], [1, 2], [2, 3.]]) + + def test_default_value_type_check(self): + fc.numeric_column( + 'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.float32) + fc.numeric_column( + 'aaa', shape=[2], default_value=[1, 2], dtype=dtypes.int32) + with self.assertRaisesRegexp(TypeError, 'must be compatible with dtype'): + fc.numeric_column( + 'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.int32) + with self.assertRaisesRegexp(TypeError, + 'default_value must be compatible with dtype'): + fc.numeric_column('aaa', default_value=['string']) + + def test_shape_must_be_positive_integer(self): + with self.assertRaisesRegexp(TypeError, 'shape dimensions must be integer'): + fc.numeric_column( + 'aaa', shape=[ + 1.0, + ]) + + with self.assertRaisesRegexp(ValueError, + 'shape dimensions must be greater than 0'): + fc.numeric_column( + 'aaa', shape=[ + 0, + ]) + + def test_dtype_is_convertible_to_float(self): + with self.assertRaisesRegexp(ValueError, + 'dtype must be convertible to float'): + fc.numeric_column('aaa', dtype=dtypes.string) + + def test_scalar_default_value_fills_the_shape(self): + a = fc.numeric_column('aaa', shape=[2, 3], default_value=2.) + self.assertEqual(((2., 2., 2.), (2., 2., 2.)), a.default_value) + + def test_parse_spec(self): + a = fc.numeric_column('aaa', shape=[2, 3], dtype=dtypes.int32) + self.assertEqual({ + 'aaa': parsing_ops.FixedLenFeature((2, 3), dtype=dtypes.int32) + }, a.parse_example_spec) + + def test_parse_example_no_default_value(self): + price = fc.numeric_column('price', shape=[2]) + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'price': + feature_pb2.Feature(float_list=feature_pb2.FloatList( + value=[20., 110.])) + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=fc.make_parse_example_spec([price])) + self.assertIn('price', features) + with self.test_session(): + self.assertAllEqual([[20., 110.]], features['price'].eval()) + + def test_parse_example_with_default_value(self): + price = fc.numeric_column('price', shape=[2], default_value=11.) 
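+    # The second serialized Example below omits 'price', so parsing should
+    # fall back to the default and yield [11., 11.] for that row.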
+ data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'price': + feature_pb2.Feature(float_list=feature_pb2.FloatList( + value=[20., 110.])) + })) + no_data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'something_else': + feature_pb2.Feature(float_list=feature_pb2.FloatList( + value=[20., 110.])) + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString(), + no_data.SerializeToString()], + features=fc.make_parse_example_spec([price])) + self.assertIn('price', features) + with self.test_session(): + self.assertAllEqual([[20., 110.], [11., 11.]], features['price'].eval()) + + def test_normalizer_fn_must_be_callable(self): + with self.assertRaisesRegexp(TypeError, 'must be a callable'): + fc.numeric_column('price', normalizer_fn='NotACallable') + + def test_normalizer_fn_transform_feature(self): + + def _increment_two(input_tensor): + return input_tensor + 2. + + price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two) + output = _transform_features({'price': [[1., 2.], [5., 6.]]}, [price], None) + with self.test_session(): + self.assertAllEqual([[3., 4.], [7., 8.]], output[price].eval()) + + def test_get_dense_tensor(self): + + def _increment_two(input_tensor): + return input_tensor + 2. + + price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two) + transformation_cache = FeatureTransformationCache({ + 'price': [[1., 2.], [5., 6.]] + }) + self.assertEqual( + transformation_cache.get(price, None), + price.get_dense_tensor(transformation_cache, None)) + + def test_sparse_tensor_not_supported(self): + price = fc.numeric_column('price') + transformation_cache = FeatureTransformationCache({ + 'price': + sparse_tensor.SparseTensor( + indices=[[0, 0]], values=[0.3], dense_shape=[1, 1]) + }) + with self.assertRaisesRegexp(ValueError, 'must be a Tensor'): + price.transform_feature(transformation_cache, None) + + def test_deep_copy(self): + a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3., 2.]]) + a_copy = copy.deepcopy(a) + self.assertEqual(a_copy.name, 'aaa') + self.assertEqual(a_copy.shape, (1, 2)) + self.assertEqual(a_copy.default_value, ((3., 2.),)) + + def test_numpy_default_value(self): + a = fc.numeric_column( + 'aaa', shape=[1, 2], default_value=np.array([[3., 2.]])) + self.assertEqual(a.default_value, ((3., 2.),)) + + def test_linear_model(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default(): + features = {'price': [[1.], [5.]]} + predictions = fc.linear_model(features, [price]) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + self.assertAllClose([[0.]], price_var.eval()) + self.assertAllClose([[0.], [0.]], predictions.eval()) + sess.run(price_var.assign([[10.]])) + self.assertAllClose([[10.], [50.]], predictions.eval()) + + def test_keras_linear_model(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default(): + features = {'price': [[1.], [5.]]} + predictions = get_keras_linear_model_predictions(features, [price]) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + self.assertAllClose([[0.]], price_var.eval()) + self.assertAllClose([[0.], [0.]], predictions.eval()) + sess.run(price_var.assign([[10.]])) + self.assertAllClose([[10.], [50.]], predictions.eval()) + + +class 
BucketizedColumnTest(test.TestCase): + + def test_invalid_source_column_type(self): + a = fc.categorical_column_with_hash_bucket('aaa', hash_bucket_size=10) + with self.assertRaisesRegexp( + ValueError, + 'source_column must be a column generated with numeric_column'): + fc.bucketized_column(a, boundaries=[0, 1]) + + def test_invalid_source_column_shape(self): + a = fc.numeric_column('aaa', shape=[2, 3]) + with self.assertRaisesRegexp( + ValueError, 'source_column must be one-dimensional column'): + fc.bucketized_column(a, boundaries=[0, 1]) + + def test_invalid_boundaries(self): + a = fc.numeric_column('aaa') + with self.assertRaisesRegexp( + ValueError, 'boundaries must be a sorted list'): + fc.bucketized_column(a, boundaries=None) + with self.assertRaisesRegexp( + ValueError, 'boundaries must be a sorted list'): + fc.bucketized_column(a, boundaries=1.) + with self.assertRaisesRegexp( + ValueError, 'boundaries must be a sorted list'): + fc.bucketized_column(a, boundaries=[1, 0]) + with self.assertRaisesRegexp( + ValueError, 'boundaries must be a sorted list'): + fc.bucketized_column(a, boundaries=[1, 1]) + + def test_name(self): + a = fc.numeric_column('aaa', dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + self.assertEqual('aaa_bucketized', b.name) + + def test_parse_spec(self): + a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + self.assertEqual({ + 'aaa': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32) + }, b.parse_example_spec) + + def test_variable_shape(self): + a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + # Column 'aaa` has shape [2] times three buckets -> variable_shape=[2, 3]. + self.assertAllEqual((2, 3), b.variable_shape) + + def test_num_buckets(self): + a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + # Column 'aaa` has shape [2] times three buckets -> num_buckets=6. 
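+    # boundaries=[0, 1] define three buckets: (-inf, 0), [0, 1), and [1, inf).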
+ self.assertEqual(6, b.num_buckets) + + def test_parse_example(self): + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 50]) + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'price': + feature_pb2.Feature(float_list=feature_pb2.FloatList( + value=[20., 110.])) + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=fc.make_parse_example_spec([bucketized_price])) + self.assertIn('price', features) + with self.test_session(): + self.assertAllEqual([[20., 110.]], features['price'].eval()) + + def test_transform_feature(self): + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + transformed_tensor = _transform_features({ + 'price': [[-1., 1.], [5., 6.]] + }, [bucketized_price], None) + with _initialized_session(): + self.assertAllEqual([[0, 1], [3, 4]], + transformed_tensor[bucketized_price].eval()) + + def test_get_dense_tensor_one_input_value(self): + """Tests _get_dense_tensor() for input with shape=[1].""" + price = fc.numeric_column('price', shape=[1]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + transformation_cache = FeatureTransformationCache({ + 'price': [[-1.], [1.], [5.], [6.]] + }) + with _initialized_session(): + bucketized_price_tensor = bucketized_price.get_dense_tensor( + transformation_cache, None) + self.assertAllClose( + # One-hot tensor. + [[[1., 0., 0., 0., 0.]], + [[0., 1., 0., 0., 0.]], + [[0., 0., 0., 1., 0.]], + [[0., 0., 0., 0., 1.]]], + bucketized_price_tensor.eval()) + + def test_get_dense_tensor_two_input_values(self): + """Tests _get_dense_tensor() for input with shape=[2].""" + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + transformation_cache = FeatureTransformationCache({ + 'price': [[-1., 1.], [5., 6.]] + }) + with _initialized_session(): + bucketized_price_tensor = bucketized_price.get_dense_tensor( + transformation_cache, None) + self.assertAllClose( + # One-hot tensor. 
+ [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]], + [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]], + bucketized_price_tensor.eval()) + + def test_get_sparse_tensors_one_input_value(self): + """Tests _get_sparse_tensors() for input with shape=[1].""" + price = fc.numeric_column('price', shape=[1]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + transformation_cache = FeatureTransformationCache({ + 'price': [[-1.], [1.], [5.], [6.]] + }) + with _initialized_session() as sess: + id_weight_pair = bucketized_price.get_sparse_tensors( + transformation_cache, None) + self.assertIsNone(id_weight_pair.weight_tensor) + id_tensor_value = sess.run(id_weight_pair.id_tensor) + self.assertAllEqual( + [[0, 0], [1, 0], [2, 0], [3, 0]], id_tensor_value.indices) + self.assertAllEqual([0, 1, 3, 4], id_tensor_value.values) + self.assertAllEqual([4, 1], id_tensor_value.dense_shape) + + def test_get_sparse_tensors_two_input_values(self): + """Tests _get_sparse_tensors() for input with shape=[2].""" + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + transformation_cache = FeatureTransformationCache({ + 'price': [[-1., 1.], [5., 6.]] + }) + with _initialized_session() as sess: + id_weight_pair = bucketized_price.get_sparse_tensors( + transformation_cache, None) + self.assertIsNone(id_weight_pair.weight_tensor) + id_tensor_value = sess.run(id_weight_pair.id_tensor) + self.assertAllEqual( + [[0, 0], [0, 1], [1, 0], [1, 1]], id_tensor_value.indices) + # Values 0-4 correspond to the first column of the input price. + # Values 5-9 correspond to the second column of the input price. + self.assertAllEqual([0, 6, 3, 9], id_tensor_value.values) + self.assertAllEqual([2, 2], id_tensor_value.dense_shape) + + def test_sparse_tensor_input_not_supported(self): + price = fc.numeric_column('price') + bucketized_price = fc.bucketized_column(price, boundaries=[0, 1]) + transformation_cache = FeatureTransformationCache({ + 'price': + sparse_tensor.SparseTensor( + indices=[[0, 0]], values=[0.3], dense_shape=[1, 1]) + }) + with self.assertRaisesRegexp(ValueError, 'must be a Tensor'): + bucketized_price.transform_feature(transformation_cache, None) + + def test_deep_copy(self): + a = fc.numeric_column('aaa', shape=[2]) + a_bucketized = fc.bucketized_column(a, boundaries=[0, 1]) + a_bucketized_copy = copy.deepcopy(a_bucketized) + self.assertEqual(a_bucketized_copy.name, 'aaa_bucketized') + self.assertAllEqual(a_bucketized_copy.variable_shape, (2, 3)) + self.assertEqual(a_bucketized_copy.boundaries, (0, 1)) + + def test_linear_model_one_input_value(self): + """Tests linear_model() for input with shape=[1].""" + price = fc_old.numeric_column('price', shape=[1]) + bucketized_price = fc_old.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + features = {'price': [[-1.], [1.], [5.], [6.]]} + predictions = fc.linear_model(features, [bucketized_price]) + bias = get_linear_model_bias() + bucketized_price_var = get_linear_model_column_var(bucketized_price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + # One weight variable per bucket, all initialized to zero. + self.assertAllClose( + [[0.], [0.], [0.], [0.], [0.]], bucketized_price_var.eval()) + self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval()) + sess.run(bucketized_price_var.assign( + [[10.], [20.], [30.], [40.], [50.]])) + # price -1. 
is in the 0th bucket, whose weight is 10. + # price 1. is in the 1st bucket, whose weight is 20. + # price 5. is in the 3rd bucket, whose weight is 40. + # price 6. is in the 4th bucket, whose weight is 50. + self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval()) + sess.run(bias.assign([1.])) + self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval()) + + def test_linear_model_two_input_values(self): + """Tests linear_model() for input with shape=[2].""" + price = fc_old.numeric_column('price', shape=[2]) + bucketized_price = fc_old.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + features = {'price': [[-1., 1.], [5., 6.]]} + predictions = fc.linear_model(features, [bucketized_price]) + bias = get_linear_model_bias() + bucketized_price_var = get_linear_model_column_var(bucketized_price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + # One weight per bucket per input column, all initialized to zero. + self.assertAllClose( + [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]], + bucketized_price_var.eval()) + self.assertAllClose([[0.], [0.]], predictions.eval()) + sess.run(bucketized_price_var.assign( + [[10.], [20.], [30.], [40.], [50.], + [60.], [70.], [80.], [90.], [100.]])) + # 1st example: + # price -1. is in the 0th bucket, whose weight is 10. + # price 1. is in the 6th bucket, whose weight is 70. + # 2nd example: + # price 5. is in the 3rd bucket, whose weight is 40. + # price 6. is in the 9th bucket, whose weight is 100. + self.assertAllClose([[80.], [140.]], predictions.eval()) + sess.run(bias.assign([1.])) + self.assertAllClose([[81.], [141.]], predictions.eval()) + + def test_keras_linear_model_one_input_value(self): + """Tests _LinearModel for input with shape=[1].""" + price = fc_old.numeric_column('price', shape=[1]) + bucketized_price = fc_old.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + features = {'price': [[-1.], [1.], [5.], [6.]]} + predictions = get_keras_linear_model_predictions(features, + [bucketized_price]) + bias = get_linear_model_bias() + bucketized_price_var = get_linear_model_column_var(bucketized_price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + # One weight variable per bucket, all initialized to zero. + self.assertAllClose([[0.], [0.], [0.], [0.], [0.]], + bucketized_price_var.eval()) + self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval()) + sess.run( + bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]])) + # price -1. is in the 0th bucket, whose weight is 10. + # price 1. is in the 1st bucket, whose weight is 20. + # price 5. is in the 3rd bucket, whose weight is 40. + # price 6. is in the 4th bucket, whose weight is 50. 
+ self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval()) + sess.run(bias.assign([1.])) + self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval()) + + def test_keras_linear_model_two_input_values(self): + """Tests _LinearModel for input with shape=[2].""" + price = fc_old.numeric_column('price', shape=[2]) + bucketized_price = fc_old.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + features = {'price': [[-1., 1.], [5., 6.]]} + predictions = get_keras_linear_model_predictions(features, + [bucketized_price]) + bias = get_linear_model_bias() + bucketized_price_var = get_linear_model_column_var(bucketized_price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + # One weight per bucket per input column, all initialized to zero. + self.assertAllClose( + [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]], + bucketized_price_var.eval()) + self.assertAllClose([[0.], [0.]], predictions.eval()) + sess.run( + bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.], + [60.], [70.], [80.], [90.], [100.]])) + # 1st example: + # price -1. is in the 0th bucket, whose weight is 10. + # price 1. is in the 6th bucket, whose weight is 70. + # 2nd example: + # price 5. is in the 3rd bucket, whose weight is 40. + # price 6. is in the 9th bucket, whose weight is 100. + self.assertAllClose([[80.], [140.]], predictions.eval()) + sess.run(bias.assign([1.])) + self.assertAllClose([[81.], [141.]], predictions.eval()) + + +class HashedCategoricalColumnTest(test.TestCase): + + def test_defaults(self): + a = fc.categorical_column_with_hash_bucket('aaa', 10) + self.assertEqual('aaa', a.name) + self.assertEqual('aaa', a.key) + self.assertEqual(10, a.hash_bucket_size) + self.assertEqual(dtypes.string, a.dtype) + + def test_key_should_be_string(self): + with self.assertRaisesRegexp(ValueError, 'key must be a string.'): + fc.categorical_column_with_hash_bucket(('key',), 10) + + def test_bucket_size_should_be_given(self): + with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be set.'): + fc.categorical_column_with_hash_bucket('aaa', None) + + def test_bucket_size_should_be_positive(self): + with self.assertRaisesRegexp(ValueError, + 'hash_bucket_size must be at least 1'): + fc.categorical_column_with_hash_bucket('aaa', 0) + + def test_dtype_should_be_string_or_integer(self): + fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.string) + fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32) + with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'): + fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32) + + def test_deep_copy(self): + original = fc.categorical_column_with_hash_bucket('aaa', 10) + for column in (original, copy.deepcopy(original)): + self.assertEqual('aaa', column.name) + self.assertEqual(10, column.hash_bucket_size) + self.assertEqual(10, column.num_buckets) + self.assertEqual(dtypes.string, column.dtype) + + def test_parse_spec_string(self): + a = fc.categorical_column_with_hash_bucket('aaa', 10) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.string) + }, a.parse_example_spec) + + def test_parse_spec_int(self): + a = fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int32) + }, a.parse_example_spec) + + def test_parse_example(self): + a = fc.categorical_column_with_hash_bucket('aaa', 10) + data = 
example_pb2.Example(features=feature_pb2.Features( + feature={ + 'aaa': + feature_pb2.Feature(bytes_list=feature_pb2.BytesList( + value=[b'omar', b'stringer'])) + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=fc.make_parse_example_spec([a])) + self.assertIn('aaa', features) + with self.test_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [0, 1]], + values=np.array([b'omar', b'stringer'], dtype=np.object_), + dense_shape=[1, 2]), + features['aaa'].eval()) + + def test_strings_should_be_hashed(self): + hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10) + wire_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + outputs = _transform_features({'wire': wire_tensor}, [hashed_sparse], None) + output = outputs[hashed_sparse] + # Check exact hashed output. If hashing changes this test will break. + expected_values = [6, 4, 1] + with self.test_session(): + self.assertEqual(dtypes.int64, output.values.dtype) + self.assertAllEqual(expected_values, output.values.eval()) + self.assertAllEqual(wire_tensor.indices.eval(), output.indices.eval()) + self.assertAllEqual(wire_tensor.dense_shape.eval(), + output.dense_shape.eval()) + + def test_tensor_dtype_should_be_string_or_integer(self): + string_fc = fc.categorical_column_with_hash_bucket( + 'a_string', 10, dtype=dtypes.string) + int_fc = fc.categorical_column_with_hash_bucket( + 'a_int', 10, dtype=dtypes.int32) + float_fc = fc.categorical_column_with_hash_bucket( + 'a_float', 10, dtype=dtypes.string) + int_tensor = sparse_tensor.SparseTensor( + values=[101], + indices=[[0, 0]], + dense_shape=[1, 1]) + string_tensor = sparse_tensor.SparseTensor( + values=['101'], + indices=[[0, 0]], + dense_shape=[1, 1]) + float_tensor = sparse_tensor.SparseTensor( + values=[101.], + indices=[[0, 0]], + dense_shape=[1, 1]) + transformation_cache = FeatureTransformationCache({ + 'a_int': int_tensor, + 'a_string': string_tensor, + 'a_float': float_tensor + }) + transformation_cache.get(string_fc, None) + transformation_cache.get(int_fc, None) + with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'): + transformation_cache.get(float_fc, None) + + def test_dtype_should_match_with_tensor(self): + hashed_sparse = fc.categorical_column_with_hash_bucket( + 'wire', 10, dtype=dtypes.int64) + wire_tensor = sparse_tensor.SparseTensor( + values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) + transformation_cache = FeatureTransformationCache({'wire': wire_tensor}) + with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'): + transformation_cache.get(hashed_sparse, None) + + def test_ints_should_be_hashed(self): + hashed_sparse = fc.categorical_column_with_hash_bucket( + 'wire', 10, dtype=dtypes.int64) + wire_tensor = sparse_tensor.SparseTensor( + values=[101, 201, 301], + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + transformation_cache = FeatureTransformationCache({'wire': wire_tensor}) + output = transformation_cache.get(hashed_sparse, None) + # Check exact hashed output. If hashing changes this test will break. 
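+    # Integer inputs are presumably converted with as_string() and hashed via
+    # string_to_hash_bucket_fast(values, hash_bucket_size), which is why the
+    # expected ids below are fixed constants.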
+ expected_values = [3, 7, 5] + with self.test_session(): + self.assertAllEqual(expected_values, output.values.eval()) + + def test_int32_64_is_compatible(self): + hashed_sparse = fc.categorical_column_with_hash_bucket( + 'wire', 10, dtype=dtypes.int64) + wire_tensor = sparse_tensor.SparseTensor( + values=constant_op.constant([101, 201, 301], dtype=dtypes.int32), + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + transformation_cache = FeatureTransformationCache({'wire': wire_tensor}) + output = transformation_cache.get(hashed_sparse, None) + # Check exact hashed output. If hashing changes this test will break. + expected_values = [3, 7, 5] + with self.test_session(): + self.assertAllEqual(expected_values, output.values.eval()) + + def test_get_sparse_tensors(self): + hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10) + transformation_cache = FeatureTransformationCache({ + 'wire': + sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + }) + id_weight_pair = hashed_sparse.get_sparse_tensors(transformation_cache, + None) + self.assertIsNone(id_weight_pair.weight_tensor) + self.assertEqual( + transformation_cache.get(hashed_sparse, None), id_weight_pair.id_tensor) + + def DISABLED_test_get_sparse_tensors_weight_collections(self): + column = fc.categorical_column_with_hash_bucket('aaa', 10) + inputs = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + column._get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), + weight_collections=('my_weights',)) + + self.assertItemsEqual( + [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertItemsEqual([], ops.get_collection('my_weights')) + + def test_get_sparse_tensors_dense_input(self): + hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10) + transformation_cache = FeatureTransformationCache({ + 'wire': (('omar', ''), ('stringer', 'marlo')) + }) + id_weight_pair = hashed_sparse.get_sparse_tensors(transformation_cache, + None) + self.assertIsNone(id_weight_pair.weight_tensor) + self.assertEqual( + transformation_cache.get(hashed_sparse, None), id_weight_pair.id_tensor) + + def test_linear_model(self): + wire_column = fc_old.categorical_column_with_hash_bucket('wire', 4) + self.assertEqual(4, wire_column._num_buckets) + with ops.Graph().as_default(): + predictions = fc.linear_model({ + wire_column.name: sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + }, (wire_column,)) + bias = get_linear_model_bias() + wire_var = get_linear_model_column_var(wire_column) + with _initialized_session(): + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval() + # 'marlo' -> 3: wire_var[3] = 4 + # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6 + self.assertAllClose(((4.,), (6.,)), predictions.eval()) + + def test_keras_linear_model(self): + wire_column = fc_old.categorical_column_with_hash_bucket('wire', 4) + self.assertEqual(4, wire_column._num_buckets) + with ops.Graph().as_default(): + predictions = get_keras_linear_model_predictions({ + wire_column.name: + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 
2)) + }, (wire_column,)) + bias = get_linear_model_bias() + wire_var = get_linear_model_column_var(wire_column) + with _initialized_session(): + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval() + # 'marlo' -> 3: wire_var[3] = 4 + # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6 + self.assertAllClose(((4.,), (6.,)), predictions.eval()) + + +class CrossedColumnTest(test.TestCase): + + def test_keys_empty(self): + with self.assertRaisesRegexp( + ValueError, 'keys must be a list with length > 1'): + fc.crossed_column([], 10) + + def test_keys_length_one(self): + with self.assertRaisesRegexp( + ValueError, 'keys must be a list with length > 1'): + fc.crossed_column(['a'], 10) + + def test_key_type_unsupported(self): + with self.assertRaisesRegexp(ValueError, 'Unsupported key type'): + fc.crossed_column(['a', fc.numeric_column('c')], 10) + + with self.assertRaisesRegexp( + ValueError, 'categorical_column_with_hash_bucket is not supported'): + fc.crossed_column( + ['a', fc.categorical_column_with_hash_bucket('c', 10)], 10) + + def test_hash_bucket_size_negative(self): + with self.assertRaisesRegexp( + ValueError, 'hash_bucket_size must be > 1'): + fc.crossed_column(['a', 'c'], -1) + + def test_hash_bucket_size_zero(self): + with self.assertRaisesRegexp( + ValueError, 'hash_bucket_size must be > 1'): + fc.crossed_column(['a', 'c'], 0) + + def test_hash_bucket_size_none(self): + with self.assertRaisesRegexp( + ValueError, 'hash_bucket_size must be > 1'): + fc.crossed_column(['a', 'c'], None) + + def test_name(self): + a = fc.numeric_column('a', dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed1 = fc.crossed_column(['d1', 'd2'], 10) + + crossed2 = fc.crossed_column([b, 'c', crossed1], 10) + self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name) + + def test_name_ordered_alphabetically(self): + """Tests that the name does not depend on the order of given columns.""" + a = fc.numeric_column('a', dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed1 = fc.crossed_column(['d1', 'd2'], 10) + + crossed2 = fc.crossed_column([crossed1, 'c', b], 10) + self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name) + + def test_name_leaf_keys_ordered_alphabetically(self): + """Tests that the name does not depend on the order of given columns.""" + a = fc.numeric_column('a', dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed1 = fc.crossed_column(['d2', 'c'], 10) + + crossed2 = fc.crossed_column([crossed1, 'd1', b], 10) + self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name) + + def test_parse_spec(self): + a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed = fc.crossed_column([b, 'c'], 10) + self.assertEqual({ + 'a': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32), + 'c': parsing_ops.VarLenFeature(dtypes.string), + }, crossed.parse_example_spec) + + def test_num_buckets(self): + a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed = fc.crossed_column([b, 'c'], 15) + self.assertEqual(15, crossed.num_buckets) + + def test_deep_copy(self): + a = fc.numeric_column('a', dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + crossed1 = fc.crossed_column(['d1', 'd2'], 10) + crossed2 = 
fc.crossed_column([b, 'c', crossed1], 15, hash_key=5) + crossed2_copy = copy.deepcopy(crossed2) + self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2_copy.name,) + self.assertEqual(15, crossed2_copy.hash_bucket_size) + self.assertEqual(5, crossed2_copy.hash_key) + + def test_parse_example(self): + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 50]) + price_cross_wire = fc.crossed_column([bucketized_price, 'wire'], 10) + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'price': + feature_pb2.Feature(float_list=feature_pb2.FloatList( + value=[20., 110.])), + 'wire': + feature_pb2.Feature(bytes_list=feature_pb2.BytesList( + value=[b'omar', b'stringer'])), + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=fc.make_parse_example_spec([price_cross_wire])) + self.assertIn('price', features) + self.assertIn('wire', features) + with self.test_session(): + self.assertAllEqual([[20., 110.]], features['price'].eval()) + wire_sparse = features['wire'] + self.assertAllEqual([[0, 0], [0, 1]], wire_sparse.indices.eval()) + # Use byte constants to pass the open-source test. + self.assertAllEqual([b'omar', b'stringer'], wire_sparse.values.eval()) + self.assertAllEqual([1, 2], wire_sparse.dense_shape.eval()) + + def test_transform_feature(self): + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 50]) + hash_bucket_size = 10 + price_cross_wire = fc.crossed_column( + [bucketized_price, 'wire'], hash_bucket_size) + features = { + 'price': constant_op.constant([[1., 2.], [5., 6.]]), + 'wire': sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]), + } + outputs = _transform_features(features, [price_cross_wire], None) + output = outputs[price_cross_wire] + with self.test_session() as sess: + output_val = sess.run(output) + self.assertAllEqual( + [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]], output_val.indices) + for val in output_val.values: + self.assertIn(val, list(range(hash_bucket_size))) + self.assertAllEqual([2, 4], output_val.dense_shape) + + def test_get_sparse_tensors(self): + a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,)) + b = fc.bucketized_column(a, boundaries=(0, 1)) + crossed1 = fc.crossed_column(['d1', 'd2'], 10) + crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5) + with ops.Graph().as_default(): + transformation_cache = FeatureTransformationCache({ + 'a': + constant_op.constant(((-1., .5), (.5, 1.))), + 'c': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['cA', 'cB', 'cC'], + dense_shape=(2, 2)), + 'd1': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['d1A', 'd1B', 'd1C'], + dense_shape=(2, 2)), + 'd2': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['d2A', 'd2B', 'd2C'], + dense_shape=(2, 2)), + }) + id_weight_pair = crossed2.get_sparse_tensors(transformation_cache, None) + with _initialized_session(): + id_tensor_eval = id_weight_pair.id_tensor.eval() + self.assertAllEqual( + ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), + (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), + (1, 14), (1, 15)), + id_tensor_eval.indices) + # Check exact hashed output. If hashing changes this test will break. + # All values are within [0, hash_bucket_size). 
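+      # The cross is presumably computed with sparse_cross_hashed
+      # (FingerprintCat64) seeded by hash_key=5, so the ids below are stable
+      # for that key and bucket count.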
+ expected_values = ( + 6, 14, 0, 13, 8, 8, 10, 12, 2, 0, 1, 9, 8, 12, 2, 0, 10, 11) + self.assertAllEqual(expected_values, id_tensor_eval.values) + self.assertAllEqual((2, 16), id_tensor_eval.dense_shape) + + def test_get_sparse_tensors_simple(self): + """Same as test_get_sparse_tensors, but with simpler values.""" + a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,)) + b = fc.bucketized_column(a, boundaries=(0, 1)) + crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5) + with ops.Graph().as_default(): + transformation_cache = FeatureTransformationCache({ + 'a': + constant_op.constant(((-1., .5), (.5, 1.))), + 'c': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['cA', 'cB', 'cC'], + dense_shape=(2, 2)), + }) + id_weight_pair = crossed.get_sparse_tensors(transformation_cache, None) + with _initialized_session(): + id_tensor_eval = id_weight_pair.id_tensor.eval() + self.assertAllEqual( + ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3)), + id_tensor_eval.indices) + # Check exact hashed output. If hashing changes this test will break. + # All values are within [0, hash_bucket_size). + expected_values = (1, 0, 1, 3, 4, 2) + self.assertAllEqual(expected_values, id_tensor_eval.values) + self.assertAllEqual((2, 4), id_tensor_eval.dense_shape) + + def test_linear_model(self): + """Tests linear_model. + + Uses data from test_get_sparse_tesnsors_simple. + """ + a = fc_old.numeric_column('a', dtype=dtypes.int32, shape=(2,)) + b = fc_old.bucketized_column(a, boundaries=(0, 1)) + crossed = fc_old.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5) + with ops.Graph().as_default(): + predictions = fc.linear_model({ + 'a': constant_op.constant(((-1., .5), (.5, 1.))), + 'c': sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['cA', 'cB', 'cC'], + dense_shape=(2, 2)), + }, (crossed,)) + bias = get_linear_model_bias() + crossed_var = get_linear_model_column_var(crossed) + with _initialized_session() as sess: + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose( + ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,)))) + # Expected ids after cross = (1, 0, 1, 3, 4, 2) + self.assertAllClose(((3.,), (14.,)), predictions.eval()) + sess.run(bias.assign((.1,))) + self.assertAllClose(((3.1,), (14.1,)), predictions.eval()) + + def test_linear_model_with_weights(self): + + class _TestColumnWithWeights(fc_old._CategoricalColumn): + """Produces sparse IDs and sparse weights.""" + + @property + def name(self): + return 'test_column' + + @property + def _parse_example_spec(self): + return { + self.name: parsing_ops.VarLenFeature(dtypes.int32), + '{}_weights'.format(self.name): parsing_ops.VarLenFeature( + dtypes.float32), + } + + @property + def _num_buckets(self): + return 5 + + def _transform_feature(self, inputs): + return (inputs.get(self.name), + inputs.get('{}_weights'.format(self.name))) + + def _get_sparse_tensors(self, inputs, weight_collections=None, + trainable=None): + """Populates both id_tensor and weight_tensor.""" + ids_and_weights = inputs.get(self) + return fc_old._CategoricalColumn.IdWeightPair( + id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1]) + + t = _TestColumnWithWeights() + crossed = fc_old.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5) + with ops.Graph().as_default(): + with self.assertRaisesRegexp( + ValueError, + 'crossed_column does not support 
weight_tensor.*{}'.format(t.name)): + fc.linear_model({ + t.name: sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=[0, 1, 2], + dense_shape=(2, 2)), + '{}_weights'.format(t.name): sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=[1., 10., 2.], + dense_shape=(2, 2)), + 'c': sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['cA', 'cB', 'cC'], + dense_shape=(2, 2)), + }, (crossed,)) + + def test_keras_linear_model(self): + """Tests _LinearModel. + + Uses data from test_get_sparse_tesnsors_simple. + """ + a = fc_old.numeric_column('a', dtype=dtypes.int32, shape=(2,)) + b = fc_old.bucketized_column(a, boundaries=(0, 1)) + crossed = fc_old.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5) + with ops.Graph().as_default(): + predictions = get_keras_linear_model_predictions({ + 'a': + constant_op.constant(((-1., .5), (.5, 1.))), + 'c': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['cA', 'cB', 'cC'], + dense_shape=(2, 2)), + }, (crossed,)) + bias = get_linear_model_bias() + crossed_var = get_linear_model_column_var(crossed) + with _initialized_session() as sess: + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)), + crossed_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,)))) + # Expected ids after cross = (1, 0, 1, 3, 4, 2) + self.assertAllClose(((3.,), (14.,)), predictions.eval()) + sess.run(bias.assign((.1,))) + self.assertAllClose(((3.1,), (14.1,)), predictions.eval()) + + def test_keras_linear_model_with_weights(self): + + class _TestColumnWithWeights(fc_old._CategoricalColumn): + """Produces sparse IDs and sparse weights.""" + + @property + def name(self): + return 'test_column' + + @property + def _parse_example_spec(self): + return { + self.name: + parsing_ops.VarLenFeature(dtypes.int32), + '{}_weights'.format(self.name): + parsing_ops.VarLenFeature(dtypes.float32), + } + + @property + def _num_buckets(self): + return 5 + + def _transform_feature(self, inputs): + return (inputs.get(self.name), + inputs.get('{}_weights'.format(self.name))) + + def _get_sparse_tensors(self, + inputs, + weight_collections=None, + trainable=None): + """Populates both id_tensor and weight_tensor.""" + ids_and_weights = inputs.get(self) + return fc_old._CategoricalColumn.IdWeightPair( + id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1]) + + t = _TestColumnWithWeights() + crossed = fc_old.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5) + with ops.Graph().as_default(): + with self.assertRaisesRegexp( + ValueError, + 'crossed_column does not support weight_tensor.*{}'.format(t.name)): + get_keras_linear_model_predictions({ + t.name: + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=[0, 1, 2], + dense_shape=(2, 2)), + '{}_weights'.format(t.name): + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=[1., 10., 2.], + dense_shape=(2, 2)), + 'c': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=['cA', 'cB', 'cC'], + dense_shape=(2, 2)), + }, (crossed,)) + + +def get_linear_model_bias(name='linear_model'): + with variable_scope.variable_scope(name, reuse=True): + return variable_scope.get_variable('bias_weights') + + +def get_linear_model_column_var(column, name='linear_model'): + return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES, + name + '/' + column.name)[0] + + +def 
get_keras_linear_model_predictions(features, + feature_columns, + units=1, + sparse_combiner='sum', + weight_collections=None, + trainable=True, + cols_to_vars=None): + keras_linear_model = _LinearModel( + feature_columns, + units, + sparse_combiner, + weight_collections, + trainable, + name='linear_model') + retval = keras_linear_model(features) # pylint: disable=not-callable + if cols_to_vars is not None: + cols_to_vars.update(keras_linear_model.cols_to_vars()) + return retval + + +class LinearModelTest(test.TestCase): + + def test_raises_if_empty_feature_columns(self): + with self.assertRaisesRegexp(ValueError, + 'feature_columns must not be empty'): + fc.linear_model(features={}, feature_columns=[]) + + def test_should_be_feature_column(self): + with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'): + fc.linear_model(features={'a': [[0]]}, feature_columns='NotSupported') + + def test_should_be_dense_or_categorical_column(self): + + class NotSupportedColumn(fc_old._FeatureColumn): + + @property + def name(self): + return 'NotSupportedColumn' + + def _transform_feature(self, cache): + pass + + @property + def _parse_example_spec(self): + pass + + with self.assertRaisesRegexp( + ValueError, 'must be either a _DenseColumn or _CategoricalColumn'): + fc.linear_model( + features={'a': [[0]]}, feature_columns=[NotSupportedColumn()]) + + def test_does_not_support_dict_columns(self): + with self.assertRaisesRegexp( + ValueError, 'Expected feature_columns to be iterable, found dict.'): + fc.linear_model( + features={'a': [[0]]}, + feature_columns={'a': fc_old.numeric_column('a')}) + + def test_raises_if_duplicate_name(self): + with self.assertRaisesRegexp( + ValueError, 'Duplicate feature column name found for columns'): + fc.linear_model( + features={'a': [[0]]}, + feature_columns=[ + fc_old.numeric_column('a'), + fc_old.numeric_column('a') + ]) + + def test_dense_bias(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default(): + features = {'price': [[1.], [5.]]} + predictions = fc.linear_model(features, [price]) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + sess.run(price_var.assign([[10.]])) + sess.run(bias.assign([5.])) + self.assertAllClose([[15.], [55.]], predictions.eval()) + + def test_sparse_bias(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default(): + wire_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = {'wire_cast': wire_tensor} + predictions = fc.linear_model(features, [wire_cast]) + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval()) + sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(bias.assign([5.])) + self.assertAllClose([[1005.], [10015.]], predictions.eval()) + + def test_dense_and_sparse_bias(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + price = fc_old.numeric_column('price') + with ops.Graph().as_default(): + wire_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = {'wire_cast': 
wire_tensor, 'price': [[1.], [5.]]} + predictions = fc.linear_model(features, [wire_cast, price]) + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(bias.assign([5.])) + sess.run(price_var.assign([[10.]])) + self.assertAllClose([[1015.], [10065.]], predictions.eval()) + + def test_dense_and_sparse_column(self): + """When the column is both dense and sparse, uses sparse tensors.""" + + class _DenseAndSparseColumn(fc_old._DenseColumn, fc_old._CategoricalColumn): + + @property + def name(self): + return 'dense_and_sparse_column' + + @property + def _parse_example_spec(self): + return {self.name: parsing_ops.VarLenFeature(self.dtype)} + + def _transform_feature(self, inputs): + return inputs.get(self.name) + + @property + def _variable_shape(self): + raise ValueError('Should not use this method.') + + def _get_dense_tensor(self, inputs, weight_collections=None, + trainable=None): + raise ValueError('Should not use this method.') + + @property + def _num_buckets(self): + return 4 + + def _get_sparse_tensors(self, inputs, weight_collections=None, + trainable=None): + sp_tensor = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 0], [1, 1]], + values=[2, 0, 3], + dense_shape=[2, 2]) + return fc_old._CategoricalColumn.IdWeightPair(sp_tensor, None) + + dense_and_sparse_column = _DenseAndSparseColumn() + with ops.Graph().as_default(): + sp_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = {dense_and_sparse_column.name: sp_tensor} + predictions = fc.linear_model(features, [dense_and_sparse_column]) + bias = get_linear_model_bias() + dense_and_sparse_column_var = get_linear_model_column_var( + dense_and_sparse_column) + with _initialized_session() as sess: + sess.run(dense_and_sparse_column_var.assign( + [[10.], [100.], [1000.], [10000.]])) + sess.run(bias.assign([5.])) + self.assertAllClose([[1005.], [10015.]], predictions.eval()) + + def test_dense_multi_output(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default(): + features = {'price': [[1.], [5.]]} + predictions = fc.linear_model(features, [price], units=3) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + self.assertAllClose(np.zeros((3,)), bias.eval()) + self.assertAllClose(np.zeros((1, 3)), price_var.eval()) + sess.run(price_var.assign([[10., 100., 1000.]])) + sess.run(bias.assign([5., 6., 7.])) + self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]], + predictions.eval()) + + def test_sparse_multi_output(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default(): + wire_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = {'wire_cast': wire_tensor} + predictions = fc.linear_model(features, [wire_cast], units=3) + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + with _initialized_session() as sess: + self.assertAllClose(np.zeros((3,)), bias.eval()) + self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval()) + sess.run( + wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [ + 1000., 1100., 1200. 
+ ], [10000., 11000., 12000.]])) + sess.run(bias.assign([5., 6., 7.])) + self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]], + predictions.eval()) + + def test_dense_multi_dimension(self): + price = fc_old.numeric_column('price', shape=2) + with ops.Graph().as_default(): + features = {'price': [[1., 2.], [5., 6.]]} + predictions = fc.linear_model(features, [price]) + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + self.assertAllClose([[0.], [0.]], price_var.eval()) + sess.run(price_var.assign([[10.], [100.]])) + self.assertAllClose([[210.], [650.]], predictions.eval()) + + def test_sparse_multi_rank(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default(): + wire_tensor = array_ops.sparse_placeholder(dtypes.string) + wire_value = sparse_tensor.SparseTensorValue( + values=['omar', 'stringer', 'marlo', 'omar'], # hashed = [2, 0, 3, 2] + indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]], + dense_shape=[2, 2, 2]) + features = {'wire_cast': wire_tensor} + predictions = fc.linear_model(features, [wire_cast]) + wire_cast_var = get_linear_model_column_var(wire_cast) + with _initialized_session() as sess: + self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval()) + self.assertAllClose( + np.zeros((2, 1)), + predictions.eval(feed_dict={wire_tensor: wire_value})) + sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) + self.assertAllClose( + [[1010.], [11000.]], + predictions.eval(feed_dict={wire_tensor: wire_value})) + + def test_sparse_combiner(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default(): + wire_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = {'wire_cast': wire_tensor} + predictions = fc.linear_model( + features, [wire_cast], sparse_combiner='mean') + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + with _initialized_session() as sess: + sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(bias.assign([5.])) + self.assertAllClose([[1005.], [5010.]], predictions.eval()) + + def test_sparse_combiner_with_negative_weights(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + wire_cast_weights = fc_old.weighted_categorical_column(wire_cast, 'weights') + + with ops.Graph().as_default(): + wire_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = { + 'wire_cast': wire_tensor, + 'weights': constant_op.constant([[1., 1., -1.0]]) + } + predictions = fc.linear_model( + features, [wire_cast_weights], sparse_combiner='sum') + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + with _initialized_session() as sess: + sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(bias.assign([5.])) + self.assertAllClose([[1005.], [-9985.]], predictions.eval()) + + def test_dense_multi_dimension_multi_output(self): + price = fc_old.numeric_column('price', shape=2) + with ops.Graph().as_default(): + features = {'price': [[1., 2.], [5., 6.]]} + predictions = fc.linear_model(features, [price], units=3) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + 
self.assertAllClose(np.zeros((3,)), bias.eval()) + self.assertAllClose(np.zeros((2, 3)), price_var.eval()) + sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]])) + sess.run(bias.assign([2., 3., 4.])) + self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]], + predictions.eval()) + + def test_raises_if_shape_mismatch(self): + price = fc_old.numeric_column('price', shape=2) + with ops.Graph().as_default(): + features = {'price': [[1.], [5.]]} + with self.assertRaisesRegexp( + Exception, + r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'): + fc.linear_model(features, [price]) + + def test_dense_reshaping(self): + price = fc_old.numeric_column('price', shape=[1, 2]) + with ops.Graph().as_default(): + features = {'price': [[[1., 2.]], [[5., 6.]]]} + predictions = fc.linear_model(features, [price]) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + self.assertAllClose([[0.], [0.]], price_var.eval()) + self.assertAllClose([[0.], [0.]], predictions.eval()) + sess.run(price_var.assign([[10.], [100.]])) + self.assertAllClose([[210.], [650.]], predictions.eval()) + + def test_dense_multi_column(self): + price1 = fc_old.numeric_column('price1', shape=2) + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = { + 'price1': [[1., 2.], [5., 6.]], + 'price2': [[3.], [4.]] + } + predictions = fc.linear_model(features, [price1, price2]) + bias = get_linear_model_bias() + price1_var = get_linear_model_column_var(price1) + price2_var = get_linear_model_column_var(price2) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + self.assertAllClose([[0.], [0.]], price1_var.eval()) + self.assertAllClose([[0.]], price2_var.eval()) + self.assertAllClose([[0.], [0.]], predictions.eval()) + sess.run(price1_var.assign([[10.], [100.]])) + sess.run(price2_var.assign([[1000.]])) + sess.run(bias.assign([7.])) + self.assertAllClose([[3217.], [4657.]], predictions.eval()) + + def test_fills_cols_to_vars(self): + price1 = fc_old.numeric_column('price1', shape=2) + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} + cols_to_vars = {} + fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars) + bias = get_linear_model_bias() + price1_var = get_linear_model_column_var(price1) + price2_var = get_linear_model_column_var(price2) + self.assertAllEqual(cols_to_vars['bias'], [bias]) + self.assertAllEqual(cols_to_vars[price1], [price1_var]) + self.assertAllEqual(cols_to_vars[price2], [price2_var]) + + def test_fills_cols_to_vars_partitioned_variables(self): + price1 = fc_old.numeric_column('price1', shape=2) + price2 = fc_old.numeric_column('price2', shape=3) + with ops.Graph().as_default(): + features = { + 'price1': [[1., 2.], [6., 7.]], + 'price2': [[3., 4., 5.], [8., 9., 10.]] + } + cols_to_vars = {} + with variable_scope.variable_scope( + 'linear', + partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)): + fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars) + with _initialized_session(): + self.assertEqual([0.], cols_to_vars['bias'][0].eval()) + # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables. 
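+        # (fixed_size_partitioner(2, axis=0) splits the first dimension into
+        # near-equal shards, larger shards first when the size is odd.)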
+ self.assertAllEqual([[0.]], cols_to_vars[price1][0].eval()) + self.assertAllEqual([[0.]], cols_to_vars[price1][1].eval()) + # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and + # a [1, 1] Variable. + self.assertAllEqual([[0.], [0.]], cols_to_vars[price2][0].eval()) + self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval()) + + def test_dense_collection(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default() as g: + features = {'price': [[1.], [5.]]} + fc.linear_model(features, [price], weight_collections=['my-vars']) + my_vars = g.get_collection('my-vars') + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + self.assertIn(bias, my_vars) + self.assertIn(price_var, my_vars) + + def test_sparse_collection(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default() as g: + wire_tensor = sparse_tensor.SparseTensor( + values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) + features = {'wire_cast': wire_tensor} + fc.linear_model( + features, [wire_cast], weight_collections=['my-vars']) + my_vars = g.get_collection('my-vars') + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + self.assertIn(bias, my_vars) + self.assertIn(wire_cast_var, my_vars) + + def test_dense_trainable_default(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default() as g: + features = {'price': [[1.], [5.]]} + fc.linear_model(features, [price]) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertIn(bias, trainable_vars) + self.assertIn(price_var, trainable_vars) + + def test_sparse_trainable_default(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default() as g: + wire_tensor = sparse_tensor.SparseTensor( + values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) + features = {'wire_cast': wire_tensor} + fc.linear_model(features, [wire_cast]) + trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + self.assertIn(bias, trainable_vars) + self.assertIn(wire_cast_var, trainable_vars) + + def test_dense_trainable_false(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default() as g: + features = {'price': [[1.], [5.]]} + fc.linear_model(features, [price], trainable=False) + trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertEqual([], trainable_vars) + + def test_sparse_trainable_false(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default() as g: + wire_tensor = sparse_tensor.SparseTensor( + values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) + features = {'wire_cast': wire_tensor} + fc.linear_model(features, [wire_cast], trainable=False) + trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertEqual([], trainable_vars) + + def test_column_order(self): + price_a = fc_old.numeric_column('price_a') + price_b = fc_old.numeric_column('price_b') + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default() as g: + features = { + 'price_a': [[1.]], + 'price_b': [[3.]], + 'wire_cast': + sparse_tensor.SparseTensor( + values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) + } + fc.linear_model( + 
features, [price_a, wire_cast, price_b], + weight_collections=['my-vars']) + my_vars = g.get_collection('my-vars') + self.assertIn('price_a', my_vars[0].name) + self.assertIn('price_b', my_vars[1].name) + self.assertIn('wire_cast', my_vars[2].name) + + with ops.Graph().as_default() as g: + features = { + 'price_a': [[1.]], + 'price_b': [[3.]], + 'wire_cast': + sparse_tensor.SparseTensor( + values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) + } + fc.linear_model( + features, [wire_cast, price_b, price_a], + weight_collections=['my-vars']) + my_vars = g.get_collection('my-vars') + self.assertIn('price_a', my_vars[0].name) + self.assertIn('price_b', my_vars[1].name) + self.assertIn('wire_cast', my_vars[2].name) + + def test_static_batch_size_mismatch(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = { + 'price1': [[1.], [5.], [7.]], # batchsize = 3 + 'price2': [[3.], [4.]] # batchsize = 2 + } + with self.assertRaisesRegexp( + ValueError, + 'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string + fc.linear_model(features, [price1, price2]) + + def test_subset_of_static_batch_size_mismatch(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + price3 = fc_old.numeric_column('price3') + with ops.Graph().as_default(): + features = { + 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 3 + 'price2': [[3.], [4.]], # batchsize = 2 + 'price3': [[3.], [4.], [5.]] # batchsize = 3 + } + with self.assertRaisesRegexp( + ValueError, + 'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string + fc.linear_model(features, [price1, price2, price3]) + + def test_runtime_batch_size_mismatch(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = { + 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 3 + 'price2': [[3.], [4.]] # batchsize = 2 + } + predictions = fc.linear_model(features, [price1, price2]) + with _initialized_session() as sess: + with self.assertRaisesRegexp(errors.OpError, + 'must have the same size and shape'): + sess.run( + predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]}) + + def test_runtime_batch_size_matches(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = { + 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 + 'price2': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 + } + predictions = fc.linear_model(features, [price1, price2]) + with _initialized_session() as sess: + sess.run( + predictions, + feed_dict={ + features['price1']: [[1.], [5.]], + features['price2']: [[1.], [5.]], + }) + + def test_with_numpy_input_fn(self): + price = fc_old.numeric_column('price') + price_buckets = fc_old.bucketized_column( + price, boundaries=[ + 0., + 10., + 100., + ]) + body_style = fc_old.categorical_column_with_vocabulary_list( + 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) + + input_fn = numpy_io.numpy_input_fn( + x={ + 'price': np.array([-1., 2., 13., 104.]), + 'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']), + }, + batch_size=2, + shuffle=False) + features = input_fn() + net = fc.linear_model(features, [price_buckets, body_style]) + # self.assertEqual(1 + 3 + 5, net.shape[1]) + with 
_initialized_session() as sess: + coord = coordinator.Coordinator() + threads = queue_runner_impl.start_queue_runners(sess, coord=coord) + + bias = get_linear_model_bias() + price_buckets_var = get_linear_model_column_var(price_buckets) + body_style_var = get_linear_model_column_var(body_style) + + sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]])) + sess.run(bias.assign([5.])) + + self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net)) + + coord.request_stop() + coord.join(threads) + + def test_with_1d_sparse_tensor(self): + price = fc_old.numeric_column('price') + price_buckets = fc_old.bucketized_column( + price, boundaries=[ + 0., + 10., + 100., + ]) + body_style = fc_old.categorical_column_with_vocabulary_list( + 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) + + # Provides 1-dim tensor and dense tensor. + features = { + 'price': constant_op.constant([-1., 12.,]), + 'body-style': sparse_tensor.SparseTensor( + indices=((0,), (1,)), + values=('sedan', 'hardtop'), + dense_shape=(2,)), + } + self.assertEqual(1, features['price'].shape.ndims) + self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0]) + + net = fc.linear_model(features, [price_buckets, body_style]) + with _initialized_session() as sess: + bias = get_linear_model_bias() + price_buckets_var = get_linear_model_column_var(price_buckets) + body_style_var = get_linear_model_column_var(body_style) + + sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]])) + sess.run(bias.assign([5.])) + + self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net)) + + def test_with_1d_unknown_shape_sparse_tensor(self): + price = fc_old.numeric_column('price') + price_buckets = fc_old.bucketized_column( + price, boundaries=[ + 0., + 10., + 100., + ]) + body_style = fc_old.categorical_column_with_vocabulary_list( + 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) + country = fc_old.categorical_column_with_vocabulary_list( + 'country', vocabulary_list=['US', 'JP', 'CA']) + + # Provides 1-dim tensor and dense tensor. 
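+    # None of these placeholders carries a static shape, so linear_model
+    # must resolve rank and batch size from the values fed at run time.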
+ features = { + 'price': array_ops.placeholder(dtypes.float32), + 'body-style': array_ops.sparse_placeholder(dtypes.string), + 'country': array_ops.placeholder(dtypes.string), + } + self.assertIsNone(features['price'].shape.ndims) + self.assertIsNone(features['body-style'].get_shape().ndims) + + price_data = np.array([-1., 12.]) + body_style_data = sparse_tensor.SparseTensorValue( + indices=((0,), (1,)), + values=('sedan', 'hardtop'), + dense_shape=(2,)) + country_data = np.array(['US', 'CA']) + + net = fc.linear_model(features, [price_buckets, body_style, country]) + bias = get_linear_model_bias() + price_buckets_var = get_linear_model_column_var(price_buckets) + body_style_var = get_linear_model_column_var(body_style) + with _initialized_session() as sess: + sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]])) + sess.run(bias.assign([5.])) + + self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], + sess.run( + net, + feed_dict={ + features['price']: price_data, + features['body-style']: body_style_data, + features['country']: country_data + })) + + def test_with_rank_0_feature(self): + price = fc_old.numeric_column('price') + features = { + 'price': constant_op.constant(0), + } + self.assertEqual(0, features['price'].shape.ndims) + + # Static rank 0 should fail + with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'): + fc.linear_model(features, [price]) + + # Dynamic rank 0 should fail + features = { + 'price': array_ops.placeholder(dtypes.float32), + } + net = fc.linear_model(features, [price]) + self.assertEqual(1, net.shape[1]) + with _initialized_session() as sess: + with self.assertRaisesOpError('Feature .* cannot have rank 0'): + sess.run(net, feed_dict={features['price']: np.array(1)}) + + def test_multiple_linear_models(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default(): + features1 = {'price': [[1.], [5.]]} + features2 = {'price': [[2.], [10.]]} + predictions1 = fc.linear_model(features1, [price]) + predictions2 = fc.linear_model(features2, [price]) + bias1 = get_linear_model_bias(name='linear_model') + bias2 = get_linear_model_bias(name='linear_model_1') + price_var1 = get_linear_model_column_var(price, name='linear_model') + price_var2 = get_linear_model_column_var(price, name='linear_model_1') + with _initialized_session() as sess: + self.assertAllClose([0.], bias1.eval()) + sess.run(price_var1.assign([[10.]])) + sess.run(bias1.assign([5.])) + self.assertAllClose([[15.], [55.]], predictions1.eval()) + self.assertAllClose([0.], bias2.eval()) + sess.run(price_var2.assign([[10.]])) + sess.run(bias2.assign([5.])) + self.assertAllClose([[25.], [105.]], predictions2.eval()) + + +class _LinearModelTest(test.TestCase): + + def test_raises_if_empty_feature_columns(self): + with self.assertRaisesRegexp(ValueError, + 'feature_columns must not be empty'): + get_keras_linear_model_predictions(features={}, feature_columns=[]) + + def test_should_be_feature_column(self): + with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'): + get_keras_linear_model_predictions( + features={'a': [[0]]}, feature_columns='NotSupported') + + def test_should_be_dense_or_categorical_column(self): + + class NotSupportedColumn(fc_old._FeatureColumn): + + @property + def name(self): + return 'NotSupportedColumn' + + def _transform_feature(self, cache): + pass + + @property + def _parse_example_spec(self): + pass + + with self.assertRaisesRegexp( + ValueError, 'must 
be either a _DenseColumn or _CategoricalColumn'): + get_keras_linear_model_predictions( + features={'a': [[0]]}, feature_columns=[NotSupportedColumn()]) + + def test_does_not_support_dict_columns(self): + with self.assertRaisesRegexp( + ValueError, 'Expected feature_columns to be iterable, found dict.'): + fc.linear_model( + features={'a': [[0]]}, + feature_columns={'a': fc_old.numeric_column('a')}) + + def test_raises_if_duplicate_name(self): + with self.assertRaisesRegexp( + ValueError, 'Duplicate feature column name found for columns'): + get_keras_linear_model_predictions( + features={'a': [[0]]}, + feature_columns=[ + fc_old.numeric_column('a'), + fc_old.numeric_column('a') + ]) + + def test_dense_bias(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default(): + features = {'price': [[1.], [5.]]} + predictions = get_keras_linear_model_predictions(features, [price]) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + sess.run(price_var.assign([[10.]])) + sess.run(bias.assign([5.])) + self.assertAllClose([[15.], [55.]], predictions.eval()) + + def test_sparse_bias(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default(): + wire_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = {'wire_cast': wire_tensor} + predictions = get_keras_linear_model_predictions(features, [wire_cast]) + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval()) + sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(bias.assign([5.])) + self.assertAllClose([[1005.], [10015.]], predictions.eval()) + + def test_dense_and_sparse_bias(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + price = fc_old.numeric_column('price') + with ops.Graph().as_default(): + wire_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]} + predictions = get_keras_linear_model_predictions(features, + [wire_cast, price]) + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(bias.assign([5.])) + sess.run(price_var.assign([[10.]])) + self.assertAllClose([[1015.], [10065.]], predictions.eval()) + + def test_dense_and_sparse_column(self): + """When the column is both dense and sparse, uses sparse tensors.""" + + class _DenseAndSparseColumn(fc_old._DenseColumn, fc_old._CategoricalColumn): + + @property + def name(self): + return 'dense_and_sparse_column' + + @property + def _parse_example_spec(self): + return {self.name: parsing_ops.VarLenFeature(self.dtype)} + + def _transform_feature(self, inputs): + return inputs.get(self.name) + + @property + def _variable_shape(self): + raise ValueError('Should not use this method.') + + def _get_dense_tensor(self, + inputs, + weight_collections=None, + trainable=None): + raise ValueError('Should not use 
this method.') + + @property + def _num_buckets(self): + return 4 + + def _get_sparse_tensors(self, + inputs, + weight_collections=None, + trainable=None): + sp_tensor = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 0], [1, 1]], + values=[2, 0, 3], + dense_shape=[2, 2]) + return fc_old._CategoricalColumn.IdWeightPair(sp_tensor, None) + + dense_and_sparse_column = _DenseAndSparseColumn() + with ops.Graph().as_default(): + sp_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = {dense_and_sparse_column.name: sp_tensor} + predictions = get_keras_linear_model_predictions( + features, [dense_and_sparse_column]) + bias = get_linear_model_bias() + dense_and_sparse_column_var = get_linear_model_column_var( + dense_and_sparse_column) + with _initialized_session() as sess: + sess.run( + dense_and_sparse_column_var.assign([[10.], [100.], [1000.], + [10000.]])) + sess.run(bias.assign([5.])) + self.assertAllClose([[1005.], [10015.]], predictions.eval()) + + def test_dense_multi_output(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default(): + features = {'price': [[1.], [5.]]} + predictions = get_keras_linear_model_predictions( + features, [price], units=3) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + self.assertAllClose(np.zeros((3,)), bias.eval()) + self.assertAllClose(np.zeros((1, 3)), price_var.eval()) + sess.run(price_var.assign([[10., 100., 1000.]])) + sess.run(bias.assign([5., 6., 7.])) + self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]], + predictions.eval()) + + def test_sparse_multi_output(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default(): + wire_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = {'wire_cast': wire_tensor} + predictions = get_keras_linear_model_predictions( + features, [wire_cast], units=3) + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + with _initialized_session() as sess: + self.assertAllClose(np.zeros((3,)), bias.eval()) + self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval()) + sess.run( + wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], + [1000., 1100., + 1200.], [10000., 11000., 12000.]])) + sess.run(bias.assign([5., 6., 7.])) + self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]], + predictions.eval()) + + def test_dense_multi_dimension(self): + price = fc_old.numeric_column('price', shape=2) + with ops.Graph().as_default(): + features = {'price': [[1., 2.], [5., 6.]]} + predictions = get_keras_linear_model_predictions(features, [price]) + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + self.assertAllClose([[0.], [0.]], price_var.eval()) + sess.run(price_var.assign([[10.], [100.]])) + self.assertAllClose([[210.], [650.]], predictions.eval()) + + def test_sparse_multi_rank(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default(): + wire_tensor = array_ops.sparse_placeholder(dtypes.string) + wire_value = sparse_tensor.SparseTensorValue( + values=['omar', 'stringer', 'marlo', 'omar'], # hashed = [2, 0, 3, 2] + indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]], + dense_shape=[2, 2, 2]) + features = {'wire_cast': 
wire_tensor} + predictions = get_keras_linear_model_predictions(features, [wire_cast]) + wire_cast_var = get_linear_model_column_var(wire_cast) + with _initialized_session() as sess: + self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval()) + self.assertAllClose( + np.zeros((2, 1)), + predictions.eval(feed_dict={wire_tensor: wire_value})) + sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) + self.assertAllClose( + [[1010.], [11000.]], + predictions.eval(feed_dict={wire_tensor: wire_value})) + + def test_sparse_combiner(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default(): + wire_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = {'wire_cast': wire_tensor} + predictions = get_keras_linear_model_predictions( + features, [wire_cast], sparse_combiner='mean') + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + with _initialized_session() as sess: + sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(bias.assign([5.])) + self.assertAllClose([[1005.], [5010.]], predictions.eval()) + + def test_dense_multi_dimension_multi_output(self): + price = fc_old.numeric_column('price', shape=2) + with ops.Graph().as_default(): + features = {'price': [[1., 2.], [5., 6.]]} + predictions = get_keras_linear_model_predictions( + features, [price], units=3) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + self.assertAllClose(np.zeros((3,)), bias.eval()) + self.assertAllClose(np.zeros((2, 3)), price_var.eval()) + sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]])) + sess.run(bias.assign([2., 3., 4.])) + self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]], + predictions.eval()) + + def test_raises_if_shape_mismatch(self): + price = fc_old.numeric_column('price', shape=2) + with ops.Graph().as_default(): + features = {'price': [[1.], [5.]]} + with self.assertRaisesRegexp( + Exception, + r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'): + get_keras_linear_model_predictions(features, [price]) + + def test_dense_reshaping(self): + price = fc_old.numeric_column('price', shape=[1, 2]) + with ops.Graph().as_default(): + features = {'price': [[[1., 2.]], [[5., 6.]]]} + predictions = get_keras_linear_model_predictions(features, [price]) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + self.assertAllClose([[0.], [0.]], price_var.eval()) + self.assertAllClose([[0.], [0.]], predictions.eval()) + sess.run(price_var.assign([[10.], [100.]])) + self.assertAllClose([[210.], [650.]], predictions.eval()) + + def test_dense_multi_column(self): + price1 = fc_old.numeric_column('price1', shape=2) + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} + predictions = get_keras_linear_model_predictions(features, + [price1, price2]) + bias = get_linear_model_bias() + price1_var = get_linear_model_column_var(price1) + price2_var = get_linear_model_column_var(price2) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + self.assertAllClose([[0.], [0.]], price1_var.eval()) + self.assertAllClose([[0.]], price2_var.eval()) + 
self.assertAllClose([[0.], [0.]], predictions.eval()) + sess.run(price1_var.assign([[10.], [100.]])) + sess.run(price2_var.assign([[1000.]])) + sess.run(bias.assign([7.])) + self.assertAllClose([[3217.], [4657.]], predictions.eval()) + + def test_fills_cols_to_vars(self): + price1 = fc_old.numeric_column('price1', shape=2) + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} + cols_to_vars = {} + get_keras_linear_model_predictions( + features, [price1, price2], cols_to_vars=cols_to_vars) + bias = get_linear_model_bias() + price1_var = get_linear_model_column_var(price1) + price2_var = get_linear_model_column_var(price2) + self.assertAllEqual(cols_to_vars['bias'], [bias]) + self.assertAllEqual(cols_to_vars[price1], [price1_var]) + self.assertAllEqual(cols_to_vars[price2], [price2_var]) + + def test_fills_cols_to_vars_partitioned_variables(self): + price1 = fc_old.numeric_column('price1', shape=2) + price2 = fc_old.numeric_column('price2', shape=3) + with ops.Graph().as_default(): + features = { + 'price1': [[1., 2.], [6., 7.]], + 'price2': [[3., 4., 5.], [8., 9., 10.]] + } + cols_to_vars = {} + with variable_scope.variable_scope( + 'linear', + partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)): + get_keras_linear_model_predictions( + features, [price1, price2], cols_to_vars=cols_to_vars) + with _initialized_session(): + self.assertEqual([0.], cols_to_vars['bias'][0].eval()) + # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables. + self.assertAllEqual([[0.]], cols_to_vars[price1][0].eval()) + self.assertAllEqual([[0.]], cols_to_vars[price1][1].eval()) + # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and + # a [1, 1] Variable. 
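+        # (cols_to_vars maps each column to the list of partition shards
+        # created for it, in shard order.)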
+ self.assertAllEqual([[0.], [0.]], cols_to_vars[price2][0].eval()) + self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval()) + + def test_dense_collection(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default() as g: + features = {'price': [[1.], [5.]]} + get_keras_linear_model_predictions( + features, [price], weight_collections=['my-vars']) + my_vars = g.get_collection('my-vars') + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + self.assertIn(bias, my_vars) + self.assertIn(price_var, my_vars) + + def test_sparse_collection(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default() as g: + wire_tensor = sparse_tensor.SparseTensor( + values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) + features = {'wire_cast': wire_tensor} + get_keras_linear_model_predictions( + features, [wire_cast], weight_collections=['my-vars']) + my_vars = g.get_collection('my-vars') + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + self.assertIn(bias, my_vars) + self.assertIn(wire_cast_var, my_vars) + + def test_dense_trainable_default(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default() as g: + features = {'price': [[1.], [5.]]} + get_keras_linear_model_predictions(features, [price]) + bias = get_linear_model_bias() + price_var = get_linear_model_column_var(price) + trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertIn(bias, trainable_vars) + self.assertIn(price_var, trainable_vars) + + def test_sparse_trainable_default(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default() as g: + wire_tensor = sparse_tensor.SparseTensor( + values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) + features = {'wire_cast': wire_tensor} + get_keras_linear_model_predictions(features, [wire_cast]) + trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + bias = get_linear_model_bias() + wire_cast_var = get_linear_model_column_var(wire_cast) + self.assertIn(bias, trainable_vars) + self.assertIn(wire_cast_var, trainable_vars) + + def test_dense_trainable_false(self): + price = fc_old.numeric_column('price') + with ops.Graph().as_default() as g: + features = {'price': [[1.], [5.]]} + get_keras_linear_model_predictions(features, [price], trainable=False) + trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertEqual([], trainable_vars) + + def test_sparse_trainable_false(self): + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default() as g: + wire_tensor = sparse_tensor.SparseTensor( + values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) + features = {'wire_cast': wire_tensor} + get_keras_linear_model_predictions(features, [wire_cast], trainable=False) + trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertEqual([], trainable_vars) + + def test_column_order(self): + price_a = fc_old.numeric_column('price_a') + price_b = fc_old.numeric_column('price_b') + wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4) + with ops.Graph().as_default() as g: + features = { + 'price_a': [[1.]], + 'price_b': [[3.]], + 'wire_cast': + sparse_tensor.SparseTensor( + values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) + } + get_keras_linear_model_predictions( + features, [price_a, wire_cast, price_b], + weight_collections=['my-vars']) + my_vars = 
g.get_collection('my-vars') + self.assertIn('price_a', my_vars[0].name) + self.assertIn('price_b', my_vars[1].name) + self.assertIn('wire_cast', my_vars[2].name) + + with ops.Graph().as_default() as g: + features = { + 'price_a': [[1.]], + 'price_b': [[3.]], + 'wire_cast': + sparse_tensor.SparseTensor( + values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) + } + get_keras_linear_model_predictions( + features, [wire_cast, price_b, price_a], + weight_collections=['my-vars']) + my_vars = g.get_collection('my-vars') + self.assertIn('price_a', my_vars[0].name) + self.assertIn('price_b', my_vars[1].name) + self.assertIn('wire_cast', my_vars[2].name) + + def test_static_batch_size_mismatch(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = { + 'price1': [[1.], [5.], [7.]], # batchsize = 3 + 'price2': [[3.], [4.]] # batchsize = 2 + } + with self.assertRaisesRegexp( + ValueError, + 'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string + get_keras_linear_model_predictions(features, [price1, price2]) + + def test_subset_of_static_batch_size_mismatch(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + price3 = fc_old.numeric_column('price3') + with ops.Graph().as_default(): + features = { + 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 3 + 'price2': [[3.], [4.]], # batchsize = 2 + 'price3': [[3.], [4.], [5.]] # batchsize = 3 + } + with self.assertRaisesRegexp( + ValueError, + 'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string + get_keras_linear_model_predictions(features, [price1, price2, price3]) + + def test_runtime_batch_size_mismatch(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = { + 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 3 + 'price2': [[3.], [4.]] # batchsize = 2 + } + predictions = get_keras_linear_model_predictions(features, + [price1, price2]) + with _initialized_session() as sess: + with self.assertRaisesRegexp(errors.OpError, + 'must have the same size and shape'): + sess.run( + predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]}) + + def test_runtime_batch_size_matches(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = { + 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 + 'price2': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 + } + predictions = get_keras_linear_model_predictions(features, + [price1, price2]) + with _initialized_session() as sess: + sess.run( + predictions, + feed_dict={ + features['price1']: [[1.], [5.]], + features['price2']: [[1.], [5.]], + }) + + def test_with_numpy_input_fn(self): + price = fc_old.numeric_column('price') + price_buckets = fc_old.bucketized_column( + price, boundaries=[ + 0., + 10., + 100., + ]) + body_style = fc_old.categorical_column_with_vocabulary_list( + 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) + + input_fn = numpy_io.numpy_input_fn( + x={ + 'price': np.array([-1., 2., 13., 104.]), + 'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']), + }, + batch_size=2, + shuffle=False) + features = input_fn() + net = get_keras_linear_model_predictions(features, + [price_buckets, body_style]) + # 
self.assertEqual(1 + 3 + 5, net.shape[1]) + with _initialized_session() as sess: + coord = coordinator.Coordinator() + threads = queue_runner_impl.start_queue_runners(sess, coord=coord) + + bias = get_linear_model_bias() + price_buckets_var = get_linear_model_column_var(price_buckets) + body_style_var = get_linear_model_column_var(body_style) + + sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]])) + sess.run(bias.assign([5.])) + + self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net)) + + coord.request_stop() + coord.join(threads) + + def test_with_1d_sparse_tensor(self): + price = fc_old.numeric_column('price') + price_buckets = fc_old.bucketized_column( + price, boundaries=[ + 0., + 10., + 100., + ]) + body_style = fc_old.categorical_column_with_vocabulary_list( + 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) + + # Provides 1-dim tensor and dense tensor. + features = { + 'price': + constant_op.constant([ + -1., + 12., + ]), + 'body-style': + sparse_tensor.SparseTensor( + indices=((0,), (1,)), + values=('sedan', 'hardtop'), + dense_shape=(2,)), + } + self.assertEqual(1, features['price'].shape.ndims) + self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0]) + + net = get_keras_linear_model_predictions(features, + [price_buckets, body_style]) + with _initialized_session() as sess: + bias = get_linear_model_bias() + price_buckets_var = get_linear_model_column_var(price_buckets) + body_style_var = get_linear_model_column_var(body_style) + + sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]])) + sess.run(bias.assign([5.])) + + self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net)) + + def test_with_1d_unknown_shape_sparse_tensor(self): + price = fc_old.numeric_column('price') + price_buckets = fc_old.bucketized_column( + price, boundaries=[ + 0., + 10., + 100., + ]) + body_style = fc_old.categorical_column_with_vocabulary_list( + 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) + country = fc_old.categorical_column_with_vocabulary_list( + 'country', vocabulary_list=['US', 'JP', 'CA']) + + # Provides 1-dim tensor and dense tensor. 
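+    # Static shapes are unknown here; everything is resolved only when the
+    # feed_dict at the end of this test supplies concrete values.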
+ features = { + 'price': array_ops.placeholder(dtypes.float32), + 'body-style': array_ops.sparse_placeholder(dtypes.string), + 'country': array_ops.placeholder(dtypes.string), + } + self.assertIsNone(features['price'].shape.ndims) + self.assertIsNone(features['body-style'].get_shape().ndims) + + price_data = np.array([-1., 12.]) + body_style_data = sparse_tensor.SparseTensorValue( + indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,)) + country_data = np.array(['US', 'CA']) + + net = get_keras_linear_model_predictions( + features, [price_buckets, body_style, country]) + bias = get_linear_model_bias() + price_buckets_var = get_linear_model_column_var(price_buckets) + body_style_var = get_linear_model_column_var(body_style) + with _initialized_session() as sess: + sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]])) + sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]])) + sess.run(bias.assign([5.])) + + self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], + sess.run( + net, + feed_dict={ + features['price']: price_data, + features['body-style']: body_style_data, + features['country']: country_data + })) + + def test_with_rank_0_feature(self): + price = fc_old.numeric_column('price') + features = { + 'price': constant_op.constant(0), + } + self.assertEqual(0, features['price'].shape.ndims) + + # Static rank 0 should fail + with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'): + get_keras_linear_model_predictions(features, [price]) + + # Dynamic rank 0 should fail + features = { + 'price': array_ops.placeholder(dtypes.float32), + } + net = get_keras_linear_model_predictions(features, [price]) + self.assertEqual(1, net.shape[1]) + with _initialized_session() as sess: + with self.assertRaisesOpError('Feature .* cannot have rank 0'): + sess.run(net, feed_dict={features['price']: np.array(1)}) + + +class InputLayerTest(test.TestCase): + + @test_util.run_in_graph_and_eager_modes() + def test_retrieving_input(self): + features = {'a': [0.]} + input_layer = InputLayer(fc_old.numeric_column('a')) + inputs = self.evaluate(input_layer(features)) + self.assertAllClose([[0.]], inputs) + + def test_reuses_variables(self): + with context.eager_mode(): + sparse_input = sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (2, 0)), + values=(0, 1, 2), + dense_shape=(3, 3)) + + # Create feature columns (categorical and embedding). + categorical_column = fc_old.categorical_column_with_identity( + key='a', num_buckets=3) + embedding_dimension = 2 + def _embedding_column_initializer(shape, dtype, partition_info): + del shape # unused + del dtype # unused + del partition_info # unused + embedding_values = ( + (1, 0), # id 0 + (0, 1), # id 1 + (1, 1)) # id 2 + return embedding_values + + embedding_column = fc_old.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_embedding_column_initializer) + + input_layer = InputLayer([embedding_column]) + features = {'a': sparse_input} + + inputs = input_layer(features) + variables = input_layer.variables + + # Sanity check: test that the inputs are correct. + self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs) + + # Check that only one variable was created. 
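+      # (The one variable is the embedding_weights created by
+      # embedding_column: shape [3, 2] for 3 buckets x dimension 2.)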
+      self.assertEqual(1, len(variables))
+
+      # Check that invoking input_layer on the same features does not create
+      # additional variables
+      _ = input_layer(features)
+      self.assertEqual(1, len(variables))
+      self.assertEqual(variables[0], input_layer.variables[0])
+
+  def test_feature_column_input_layer_gradient(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc_old.categorical_column_with_identity(
+          key='a', num_buckets=3)
+      embedding_dimension = 2
+
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+
+      embedding_column = fc_old.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      def scale_matrix():
+        matrix = input_layer(features)
+        return 2 * matrix
+
+      # Sanity check: Verify that scale_matrix returns the correct output.
+      self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())
+
+      # Check that the returned gradient is correct.
+      grad_function = backprop.implicit_grad(scale_matrix)
+      grads_and_vars = grad_function()
+      indexed_slice = grads_and_vars[0][0]
+      gradient = grads_and_vars[0][0].values
+
+      self.assertAllEqual([0, 1, 2], indexed_slice.indices)
+      self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
+
+
+class FunctionalInputLayerTest(test.TestCase):
+
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc.input_layer(features={}, feature_columns=[])
+
+  def test_should_be_dense_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _DenseColumn'):
+      fc.input_layer(
+          features={'a': [[0]]},
+          feature_columns=[
+              fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
+          ])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.input_layer(
+          features={'a': [[0]]},
+          feature_columns={'a': fc_old.numeric_column('a')})
+
+  def test_bare_column(self):
+    with ops.Graph().as_default():
+      features = {'a': [0.]}
+      net = fc.input_layer(features, fc_old.numeric_column('a'))
+      with _initialized_session():
+        self.assertAllClose([[0.]], net.eval())
+
+  def test_column_generator(self):
+    with ops.Graph().as_default():
+      features = {'a': [0.], 'b': [1.]}
+      columns = (fc_old.numeric_column(key) for key in features)
+      net = fc.input_layer(features, columns)
+      with _initialized_session():
+        self.assertAllClose([[0., 1.]], net.eval())
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc.input_layer(
+          features={'a': [[0]]},
+          feature_columns=[
+              fc_old.numeric_column('a'),
+              fc_old.numeric_column('a')
+          ])
+
+  def test_one_column(self):
+    price = fc_old.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      net = fc.input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1.], [5.]], net.eval())
+
+  def test_multi_dimension(self):
+    price = fc_old.numeric_column('price', shape=2)
+    with
ops.Graph().as_default(): + features = {'price': [[1., 2.], [5., 6.]]} + net = fc.input_layer(features, [price]) + with _initialized_session(): + self.assertAllClose([[1., 2.], [5., 6.]], net.eval()) + + def test_raises_if_shape_mismatch(self): + price = fc_old.numeric_column('price', shape=2) + with ops.Graph().as_default(): + features = {'price': [[1.], [5.]]} + with self.assertRaisesRegexp( + Exception, + r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'): + fc.input_layer(features, [price]) + + def test_reshaping(self): + price = fc_old.numeric_column('price', shape=[1, 2]) + with ops.Graph().as_default(): + features = {'price': [[[1., 2.]], [[5., 6.]]]} + net = fc.input_layer(features, [price]) + with _initialized_session(): + self.assertAllClose([[1., 2.], [5., 6.]], net.eval()) + + def test_multi_column(self): + price1 = fc_old.numeric_column('price1', shape=2) + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = { + 'price1': [[1., 2.], [5., 6.]], + 'price2': [[3.], [4.]] + } + net = fc.input_layer(features, [price1, price2]) + with _initialized_session(): + self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval()) + + def test_fills_cols_to_vars(self): + # Provide three _DenseColumn's to input_layer: a _NumericColumn, a + # _BucketizedColumn, and an _EmbeddingColumn. Only the _EmbeddingColumn + # creates a Variable. + price1 = fc_old.numeric_column('price1') + dense_feature = fc_old.numeric_column('dense_feature') + dense_feature_bucketized = fc_old.bucketized_column( + dense_feature, boundaries=[0.]) + some_sparse_column = fc_old.categorical_column_with_hash_bucket( + 'sparse_feature', hash_bucket_size=5) + some_embedding_column = fc_old.embedding_column( + some_sparse_column, dimension=10) + with ops.Graph().as_default(): + features = { + 'price1': [[3.], [4.]], + 'dense_feature': [[-1.], [4.]], + 'sparse_feature': [['a'], ['x']], + } + cols_to_vars = {} + all_cols = [price1, dense_feature_bucketized, some_embedding_column] + fc.input_layer(features, all_cols, cols_to_vars=cols_to_vars) + self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertEqual(0, len(cols_to_vars[price1])) + self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) + self.assertEqual(1, len(cols_to_vars[some_embedding_column])) + self.assertIsInstance(cols_to_vars[some_embedding_column][0], + variables_lib.Variable) + self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10]) + + def test_fills_cols_to_vars_partitioned_variables(self): + price1 = fc_old.numeric_column('price1') + dense_feature = fc_old.numeric_column('dense_feature') + dense_feature_bucketized = fc_old.bucketized_column( + dense_feature, boundaries=[0.]) + some_sparse_column = fc_old.categorical_column_with_hash_bucket( + 'sparse_feature', hash_bucket_size=5) + some_embedding_column = fc_old.embedding_column( + some_sparse_column, dimension=10) + with ops.Graph().as_default(): + features = { + 'price1': [[3.], [4.]], + 'dense_feature': [[-1.], [4.]], + 'sparse_feature': [['a'], ['x']], + } + cols_to_vars = {} + all_cols = [price1, dense_feature_bucketized, some_embedding_column] + with variable_scope.variable_scope( + 'input_from_feature_columns', + partitioner=partitioned_variables.fixed_size_partitioner(3, axis=0)): + fc.input_layer(features, all_cols, cols_to_vars=cols_to_vars) + self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertEqual(0, len(cols_to_vars[price1])) + self.assertEqual(0, 
len(cols_to_vars[dense_feature_bucketized])) + self.assertEqual(3, len(cols_to_vars[some_embedding_column])) + self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [2, 10]) + self.assertAllEqual(cols_to_vars[some_embedding_column][1].shape, [2, 10]) + self.assertAllEqual(cols_to_vars[some_embedding_column][2].shape, [1, 10]) + + def test_column_order(self): + price_a = fc_old.numeric_column('price_a') + price_b = fc_old.numeric_column('price_b') + with ops.Graph().as_default(): + features = { + 'price_a': [[1.]], + 'price_b': [[3.]], + } + net1 = fc.input_layer(features, [price_a, price_b]) + net2 = fc.input_layer(features, [price_b, price_a]) + with _initialized_session(): + self.assertAllClose([[1., 3.]], net1.eval()) + self.assertAllClose([[1., 3.]], net2.eval()) + + def test_fails_for_categorical_column(self): + animal = fc_old.categorical_column_with_identity('animal', num_buckets=4) + with ops.Graph().as_default(): + features = { + 'animal': + sparse_tensor.SparseTensor( + indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) + } + with self.assertRaisesRegexp(Exception, 'must be a _DenseColumn'): + fc.input_layer(features, [animal]) + + def test_static_batch_size_mismatch(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = { + 'price1': [[1.], [5.], [7.]], # batchsize = 3 + 'price2': [[3.], [4.]] # batchsize = 2 + } + with self.assertRaisesRegexp( + ValueError, + 'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string + fc.input_layer(features, [price1, price2]) + + def test_subset_of_static_batch_size_mismatch(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + price3 = fc_old.numeric_column('price3') + with ops.Graph().as_default(): + features = { + 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 3 + 'price2': [[3.], [4.]], # batchsize = 2 + 'price3': [[3.], [4.], [5.]] # batchsize = 3 + } + with self.assertRaisesRegexp( + ValueError, + 'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string + fc.input_layer(features, [price1, price2, price3]) + + def test_runtime_batch_size_mismatch(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = { + 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 3 + 'price2': [[3.], [4.]] # batchsize = 2 + } + net = fc.input_layer(features, [price1, price2]) + with _initialized_session() as sess: + with self.assertRaisesRegexp(errors.OpError, + 'Dimensions of inputs should match'): + sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]}) + + def test_runtime_batch_size_matches(self): + price1 = fc_old.numeric_column('price1') + price2 = fc_old.numeric_column('price2') + with ops.Graph().as_default(): + features = { + 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 + 'price2': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 + } + net = fc.input_layer(features, [price1, price2]) + with _initialized_session() as sess: + sess.run( + net, + feed_dict={ + features['price1']: [[1.], [5.]], + features['price2']: [[1.], [5.]], + }) + + def test_multiple_layers_with_same_embedding_column(self): + some_sparse_column = fc_old.categorical_column_with_hash_bucket( + 'sparse_feature', hash_bucket_size=5) + some_embedding_column = 
fc_old.embedding_column( + some_sparse_column, dimension=10) + + with ops.Graph().as_default(): + features = { + 'sparse_feature': [['a'], ['x']], + } + all_cols = [some_embedding_column] + fc.input_layer(features, all_cols) + fc.input_layer(features, all_cols) + # Make sure that 2 variables get created in this case. + self.assertEqual(2, len( + ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) + expected_var_names = [ + 'input_layer/sparse_feature_embedding/embedding_weights:0', + 'input_layer_1/sparse_feature_embedding/embedding_weights:0' + ] + self.assertItemsEqual( + expected_var_names, + [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) + + def test_multiple_layers_with_same_shared_embedding_column(self): + categorical_column_a = fc_old.categorical_column_with_identity( + key='aaa', num_buckets=3) + categorical_column_b = fc_old.categorical_column_with_identity( + key='bbb', num_buckets=3) + embedding_dimension = 2 + embedding_column_b, embedding_column_a = fc_old.shared_embedding_columns( + [categorical_column_b, categorical_column_a], + dimension=embedding_dimension) + + with ops.Graph().as_default(): + features = { + 'aaa': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2)), + 'bbb': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 2, 1), + dense_shape=(2, 2)), + } + all_cols = [embedding_column_a, embedding_column_b] + fc.input_layer(features, all_cols) + fc.input_layer(features, all_cols) + # Make sure that only 1 variable gets created in this case. + self.assertEqual(1, len( + ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) + self.assertItemsEqual( + ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'], + [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) + + def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self): + categorical_column_a = fc_old.categorical_column_with_identity( + key='aaa', num_buckets=3) + categorical_column_b = fc_old.categorical_column_with_identity( + key='bbb', num_buckets=3) + embedding_dimension = 2 + embedding_column_b, embedding_column_a = fc_old.shared_embedding_columns( + [categorical_column_b, categorical_column_a], + dimension=embedding_dimension) + all_cols = [embedding_column_a, embedding_column_b] + + with ops.Graph().as_default(): + features = { + 'aaa': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2)), + 'bbb': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 2, 1), + dense_shape=(2, 2)), + } + fc.input_layer(features, all_cols) + # Make sure that only 1 variable gets created in this case. + self.assertEqual(1, len( + ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) + + with ops.Graph().as_default(): + features1 = { + 'aaa': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2)), + 'bbb': + sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 2, 1), + dense_shape=(2, 2)), + } + + fc.input_layer(features1, all_cols) + # Make sure that only 1 variable gets created in this case. 
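+      # A fresh Graph starts with an empty GLOBAL_VARIABLES collection, so
+      # the shared embedding is created once per graph, not once per process.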
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      self.assertItemsEqual(
+          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
+  def test_with_numpy_input_fn(self):
+    embedding_values = (
+        (1., 2., 3., 4., 5.),  # id 0
+        (6., 7., 8., 9., 10.),  # id 1
+        (11., 12., 13., 14., 15.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return embedding_values
+
+    # price has 1 dimension in input_layer
+    price = fc_old.numeric_column('price')
+    body_style = fc_old.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    # one_hot_body_style has 3 dims in input_layer.
+    one_hot_body_style = fc_old.indicator_column(body_style)
+    # embedded_body_style has 5 dims in input_layer.
+    embedded_body_style = fc_old.embedding_column(
+        body_style, dimension=5, initializer=_initializer)
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'price': np.array([11., 12., 13., 14.]),
+            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
+        },
+        batch_size=2,
+        shuffle=False)
+    features = input_fn()
+    net = fc.input_layer(features,
+                         [price, one_hot_body_style, embedded_body_style])
+    self.assertEqual(1 + 3 + 5, net.shape[1])
+    with _initialized_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+
+      # Each row is formed by concatenating `embedded_body_style`,
+      # `one_hot_body_style`, and `price` in order.
+      self.assertAllEqual(
+          [[11., 12., 13., 14., 15., 0., 0., 1., 11.],
+           [1., 2., 3., 4., 5., 1., 0., 0., 12.]],
+          sess.run(net))
+
+      coord.request_stop()
+      coord.join(threads)
+
+  def test_with_1d_sparse_tensor(self):
+    embedding_values = (
+        (1., 2., 3., 4., 5.),  # id 0
+        (6., 7., 8., 9., 10.),  # id 1
+        (11., 12., 13., 14., 15.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return embedding_values
+
+    # price has 1 dimension in input_layer
+    price = fc_old.numeric_column('price')
+
+    # one_hot_body_style has 3 dims in input_layer.
+    body_style = fc_old.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    one_hot_body_style = fc_old.indicator_column(body_style)
+
+    # embedded_country has 5 dims in input_layer.
+    country = fc_old.categorical_column_with_vocabulary_list(
+        'country', vocabulary_list=['US', 'JP', 'CA'])
+    embedded_country = fc_old.embedding_column(
+        country, dimension=5, initializer=_initializer)
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price': constant_op.constant([11., 12.,]),
+        'body-style': sparse_tensor.SparseTensor(
+            indices=((0,), (1,)),
+            values=('sedan', 'hardtop'),
+            dense_shape=(2,)),
+        # This is a dense tensor for the categorical_column.
+        'country': constant_op.constant(['CA', 'US']),
+    }
+    self.assertEqual(1, features['price'].shape.ndims)
+    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
+    self.assertEqual(1, features['country'].shape.ndims)
+
+    net = fc.input_layer(features,
+                         [price, one_hot_body_style, embedded_country])
+    self.assertEqual(1 + 3 + 5, net.shape[1])
+    with _initialized_session() as sess:
+
+      # Each row is formed by concatenating `one_hot_body_style`,
+      # `embedded_country`, and `price` in order.
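+      # Row 0: 'sedan' -> one-hot (0., 0., 1.); 'CA' -> id 2 -> embedding
+      # (11., 12., 13., 14., 15.); price 11.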
+      self.assertAllEqual(
+          [[0., 0., 1., 11., 12., 13., 14., 15., 11.],
+           [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
+          sess.run(net))
+
+  def test_with_1d_unknown_shape_sparse_tensor(self):
+    embedding_values = (
+        (1., 2.),  # id 0
+        (6., 7.),  # id 1
+        (11., 12.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return embedding_values
+
+    # price has 1 dimension in input_layer
+    price = fc_old.numeric_column('price')
+
+    # one_hot_body_style has 3 dims in input_layer.
+    body_style = fc_old.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    one_hot_body_style = fc_old.indicator_column(body_style)
+
+    # embedded_country has 2 dims in input_layer.
+    country = fc_old.categorical_column_with_vocabulary_list(
+        'country', vocabulary_list=['US', 'JP', 'CA'])
+    embedded_country = fc_old.embedding_column(
+        country, dimension=2, initializer=_initializer)
+
+    # Provides unknown-shape placeholders, including a dense tensor for the
+    # categorical column.
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+        'body-style': array_ops.sparse_placeholder(dtypes.string),
+        # This is a dense tensor for the categorical_column.
+        'country': array_ops.placeholder(dtypes.string),
+    }
+    self.assertIsNone(features['price'].shape.ndims)
+    self.assertIsNone(features['body-style'].get_shape().ndims)
+    self.assertIsNone(features['country'].shape.ndims)
+
+    price_data = np.array([11., 12.])
+    body_style_data = sparse_tensor.SparseTensorValue(
+        indices=((0,), (1,)),
+        values=('sedan', 'hardtop'),
+        dense_shape=(2,))
+    country_data = np.array([['US'], ['CA']])
+
+    net = fc.input_layer(features,
+                         [price, one_hot_body_style, embedded_country])
+    self.assertEqual(1 + 3 + 2, net.shape[1])
+    with _initialized_session() as sess:
+
+      # Each row is formed by concatenating `one_hot_body_style`,
+      # `embedded_country`, and `price` in order.
+      self.assertAllEqual(
+          [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]],
+          sess.run(
+              net,
+              feed_dict={
+                  features['price']: price_data,
+                  features['body-style']: body_style_data,
+                  features['country']: country_data
+              }))
+
+  def test_with_rank_0_feature(self):
+    # price has 1 dimension in input_layer
+    price = fc_old.numeric_column('price')
+    features = {
+        'price': constant_op.constant(0),
+    }
+    self.assertEqual(0, features['price'].shape.ndims)
+
+    # Static rank 0 should fail
+    with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
+      fc.input_layer(features, [price])
+
+    # Dynamic rank 0 should fail
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+    }
+    net = fc.input_layer(features, [price])
+    self.assertEqual(1, net.shape[1])
+    with _initialized_session() as sess:
+      with self.assertRaisesOpError('Feature .* cannot have rank 0'):
+        sess.run(net, feed_dict={features['price']: np.array(1)})
+
+
+class MakeParseExampleSpecTest(test.TestCase):
+
+  class _TestFeatureColumn(FeatureColumn,
+                           collections.namedtuple('_TestFeatureColumn',
+                                                  ('parse_spec',))):
+
+    @property
+    def name(self):
+      return '_TestFeatureColumn'
+
+    def transform_feature(self, transformation_cache, state_manager):
+      pass
+
+    @property
+    def parse_example_spec(self):
+      return self.parse_spec
+
+  def test_no_feature_columns(self):
+    actual = fc.make_parse_example_spec([])
+    self.assertDictEqual({}, actual)
+
+  def test_invalid_type(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
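+    # Sketch of the expected behavior (for illustration): with only valid
+    # columns, `make_parse_example_spec` merges each column's
+    # `parse_example_spec` dict into one, e.g.
+    #   fc.make_parse_example_spec(
+    #       (self._TestFeatureColumn({key1: parse_spec1}),))
+    #   == {key1: parse_spec1}
+    # Mixing in anything that is not a FeatureColumn raises instead: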
+    with self.assertRaisesRegexp(
+        ValueError,
+        'All feature_columns must be FeatureColumn instances.*invalid_column'):
+      fc.make_parse_example_spec(
+          (self._TestFeatureColumn({key1: parse_spec1}), 'invalid_column'))
+
+  def test_one_feature_column(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    actual = fc.make_parse_example_spec(
+        (self._TestFeatureColumn({key1: parse_spec1}),))
+    self.assertDictEqual({key1: parse_spec1}, actual)
+
+  def test_two_feature_columns(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    key2 = 'key2'
+    parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
+    actual = fc.make_parse_example_spec(
+        (self._TestFeatureColumn({key1: parse_spec1}),
+         self._TestFeatureColumn({key2: parse_spec2})))
+    self.assertDictEqual({key1: parse_spec1, key2: parse_spec2}, actual)
+
+  def test_equal_keys_different_parse_spec(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'feature_columns contain different parse_spec for key key1'):
+      fc.make_parse_example_spec(
+          (self._TestFeatureColumn({key1: parse_spec1}),
+           self._TestFeatureColumn({key1: parse_spec2})))
+
+  def test_equal_keys_equal_parse_spec(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    actual = fc.make_parse_example_spec(
+        (self._TestFeatureColumn({key1: parse_spec1}),
+         self._TestFeatureColumn({key1: parse_spec1})))
+    self.assertDictEqual({key1: parse_spec1}, actual)
+
+  def test_multiple_features_dict(self):
+    """parse_spec for one column is a dict with length > 1."""
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+ key2 = 'key2' + parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string) + key3 = 'key3' + parse_spec3 = parsing_ops.VarLenFeature(dtype=dtypes.int32) + actual = fc.make_parse_example_spec( + (self._TestFeatureColumn({key1: parse_spec1}), + self._TestFeatureColumn({key2: parse_spec2, key3: parse_spec3}))) + self.assertDictEqual( + {key1: parse_spec1, key2: parse_spec2, key3: parse_spec3}, actual) + + +def _assert_sparse_tensor_value(test_case, expected, actual): + test_case.assertEqual(np.int64, np.array(actual.indices).dtype) + test_case.assertAllEqual(expected.indices, actual.indices) + + test_case.assertEqual( + np.array(expected.values).dtype, np.array(actual.values).dtype) + test_case.assertAllEqual(expected.values, actual.values) + + test_case.assertEqual(np.int64, np.array(actual.dense_shape).dtype) + test_case.assertAllEqual(expected.dense_shape, actual.dense_shape) + + +class VocabularyFileCategoricalColumnTest(test.TestCase): + + def setUp(self): + super(VocabularyFileCategoricalColumnTest, self).setUp() + + # Contains ints, Golden State Warriors jersey numbers: 30, 35, 11, 23, 22 + self._warriors_vocabulary_file_name = test.test_src_dir_path( + 'python/feature_column/testdata/warriors_vocabulary.txt') + self._warriors_vocabulary_size = 5 + + # Contains strings, character names from 'The Wire': omar, stringer, marlo + self._wire_vocabulary_file_name = test.test_src_dir_path( + 'python/feature_column/testdata/wire_vocabulary.txt') + self._wire_vocabulary_size = 3 + + def test_defaults(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file='path_to_file', vocabulary_size=3) + self.assertEqual('aaa', column.name) + self.assertEqual('aaa', column.key) + self.assertEqual(3, column.num_buckets) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.string) + }, column.parse_example_spec) + + def test_key_should_be_string(self): + with self.assertRaisesRegexp(ValueError, 'key must be a string.'): + fc.categorical_column_with_vocabulary_file( + key=('aaa',), vocabulary_file='path_to_file', vocabulary_size=3) + + def test_all_constructor_args(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file='path_to_file', vocabulary_size=3, + num_oov_buckets=4, dtype=dtypes.int32) + self.assertEqual(7, column.num_buckets) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int32) + }, column.parse_example_spec) + + def test_deep_copy(self): + original = fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file='path_to_file', vocabulary_size=3, + num_oov_buckets=4, dtype=dtypes.int32) + for column in (original, copy.deepcopy(original)): + self.assertEqual('aaa', column.name) + self.assertEqual(7, column.num_buckets) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int32) + }, column.parse_example_spec) + + def test_vocabulary_file_none(self): + with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'): + fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file=None, vocabulary_size=3) + + def test_vocabulary_file_empty_string(self): + with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'): + fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file='', vocabulary_size=3) + + def test_invalid_vocabulary_file(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file='file_does_not_exist', vocabulary_size=10) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + 
values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + column.get_sparse_tensors(FeatureTransformationCache({'aaa': inputs}), None) + with self.assertRaisesRegexp(errors.OpError, 'file_does_not_exist'): + with self.test_session(): + lookup_ops.tables_initializer().run() + + def test_invalid_vocabulary_size(self): + with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'): + fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=-1) + with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'): + fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=0) + + def test_too_large_vocabulary_size(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=self._wire_vocabulary_size + 1) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + column.get_sparse_tensors(FeatureTransformationCache({'aaa': inputs}), None) + with self.assertRaisesRegexp(errors.OpError, 'Invalid vocab_size'): + with self.test_session(): + lookup_ops.tables_initializer().run() + + def test_invalid_num_oov_buckets(self): + with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'): + fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file='path', vocabulary_size=3, + num_oov_buckets=-1) + + def test_invalid_dtype(self): + with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'): + fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file='path', vocabulary_size=3, + dtype=dtypes.float64) + + def test_invalid_buckets_and_default_value(self): + with self.assertRaisesRegexp( + ValueError, 'both num_oov_buckets and default_value'): + fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=self._wire_vocabulary_size, + num_oov_buckets=100, + default_value=2) + + def test_invalid_input_dtype_int32(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=self._wire_vocabulary_size, + dtype=dtypes.string) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(12, 24, 36), + dense_shape=(2, 2)) + with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'): + column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + + def test_invalid_input_dtype_string(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._warriors_vocabulary_file_name, + vocabulary_size=self._warriors_vocabulary_size, + dtype=dtypes.int32) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('omar', 'stringer', 'marlo'), + dense_shape=(2, 2)) + with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'): + column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + + def test_parse_example(self): + a = fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file='path_to_file', vocabulary_size=3) + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'aaa': + feature_pb2.Feature(bytes_list=feature_pb2.BytesList( + value=[b'omar', b'stringer'])) + })) + features = parsing_ops.parse_example( + 
serialized=[data.SerializeToString()], + features=fc.make_parse_example_spec([a])) + self.assertIn('aaa', features) + with self.test_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [0, 1]], + values=np.array([b'omar', b'stringer'], dtype=np.object_), + dense_shape=[1, 2]), + features['aaa'].eval()) + + def test_get_sparse_tensors(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=self._wire_vocabulary_size) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((2, -1, 0), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_get_sparse_tensors_none_vocabulary_size(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', vocabulary_file=self._wire_vocabulary_file_name) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value(self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array( + (2, -1, 0), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_transform_feature(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=self._wire_vocabulary_size) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + id_tensor = _transform_features({'aaa': inputs}, [column], None)[column] + with _initialized_session(): + _assert_sparse_tensor_value(self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array( + (2, -1, 0), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_tensor.eval()) + + def DISABLED_test_get_sparse_tensors_weight_collections(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=self._wire_vocabulary_size) + inputs = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), + weight_collections=('my_weights',)) + + self.assertItemsEqual( + [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertItemsEqual([], ops.get_collection('my_weights')) + + def test_get_sparse_tensors_dense_input(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=self._wire_vocabulary_size) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': (('marlo', ''), ('skywalker', 'omar')) + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + 
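+      # The dense input (('marlo', ''), ('skywalker', 'omar')) is converted to
+      # a SparseTensor in which the empty string marks a missing entry, which
+      # is why the expected indices below are ((0, 0), (1, 0), (1, 1)).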
_assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=(2, 2)),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_default_value_in_vocabulary(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        default_value=2)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column.get_sparse_tensors(
+        FeatureTransformationCache({
+            'aaa': inputs
+        }), None)
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 2, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_with_oov_buckets(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=100)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1), (1, 2)),
+        values=('marlo', 'skywalker', 'omar', 'heisenberg'),
+        dense_shape=(2, 3))
+    id_weight_pair = column.get_sparse_tensors(
+        FeatureTransformationCache({
+            'aaa': inputs
+        }), None)
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 33, 0, 62), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_small_vocabulary_size(self):
+    # 'marlo' is the last entry in our vocabulary file, so by setting
+    # `vocabulary_size` to one less than the number of entries in the file, we
+    # take 'marlo' out of the vocabulary.
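+    # Resulting lookups (sketch): 'omar' -> 0 and 'stringer' -> 1 stay in the
+    # vocabulary, while 'marlo' (truncated away) and 'skywalker' (never in the
+    # file) both map to the out-of-vocabulary id -1, matching the expected
+    # values (-1, -1, 0) asserted below.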
+ column = fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=self._wire_vocabulary_size - 1) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((-1, -1, 0), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_get_sparse_tensors_int32(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._warriors_vocabulary_file_name, + vocabulary_size=self._warriors_vocabulary_size, + dtype=dtypes.int32) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1), (2, 2)), + values=(11, 100, 30, 22), + dense_shape=(3, 3)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((2, -1, 0, 4), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_get_sparse_tensors_int32_dense_input(self): + default_value = -100 + column = fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._warriors_vocabulary_file_name, + vocabulary_size=self._warriors_vocabulary_size, + dtype=dtypes.int32, + default_value=default_value) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': ((11, -1, -1), (100, 30, -1), (-1, -1, 22)) + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1), (2, 2)), + values=np.array((2, default_value, 0, 4), dtype=np.int64), + dense_shape=(3, 3)), + id_weight_pair.id_tensor.eval()) + + def test_get_sparse_tensors_int32_with_oov_buckets(self): + column = fc.categorical_column_with_vocabulary_file( + key='aaa', + vocabulary_file=self._warriors_vocabulary_file_name, + vocabulary_size=self._warriors_vocabulary_size, + dtype=dtypes.int32, + num_oov_buckets=100) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1), (2, 2)), + values=(11, 100, 30, 22), + dense_shape=(3, 3)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((2, 60, 0, 4), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_linear_model(self): + wire_column = fc_old.categorical_column_with_vocabulary_file( + key='wire', + vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=self._wire_vocabulary_size, + num_oov_buckets=1) + self.assertEqual(4, wire_column._num_buckets) + with ops.Graph().as_default(): + predictions = fc.linear_model({ + wire_column.name: sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 
'omar'), + dense_shape=(2, 2)) + }, (wire_column,)) + bias = get_linear_model_bias() + wire_var = get_linear_model_column_var(wire_column) + with _initialized_session(): + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval() + # 'marlo' -> 2: wire_var[2] = 3 + # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5 + self.assertAllClose(((3.,), (5.,)), predictions.eval()) + + def test_keras_linear_model(self): + wire_column = fc_old.categorical_column_with_vocabulary_file( + key='wire', + vocabulary_file=self._wire_vocabulary_file_name, + vocabulary_size=self._wire_vocabulary_size, + num_oov_buckets=1) + self.assertEqual(4, wire_column._num_buckets) + with ops.Graph().as_default(): + predictions = get_keras_linear_model_predictions({ + wire_column.name: + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + }, (wire_column,)) + bias = get_linear_model_bias() + wire_var = get_linear_model_column_var(wire_column) + with _initialized_session(): + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval() + # 'marlo' -> 2: wire_var[2] = 3 + # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5 + self.assertAllClose(((3.,), (5.,)), predictions.eval()) + + +class VocabularyListCategoricalColumnTest(test.TestCase): + + def test_defaults_string(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=('omar', 'stringer', 'marlo')) + self.assertEqual('aaa', column.name) + self.assertEqual('aaa', column.key) + self.assertEqual(3, column.num_buckets) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.string) + }, column.parse_example_spec) + + def test_key_should_be_string(self): + with self.assertRaisesRegexp(ValueError, 'key must be a string.'): + fc.categorical_column_with_vocabulary_list( + key=('aaa',), vocabulary_list=('omar', 'stringer', 'marlo')) + + def test_defaults_int(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=(12, 24, 36)) + self.assertEqual('aaa', column.name) + self.assertEqual('aaa', column.key) + self.assertEqual(3, column.num_buckets) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int64) + }, column.parse_example_spec) + + def test_all_constructor_args(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32, + default_value=-99) + self.assertEqual(3, column.num_buckets) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int32) + }, column.parse_example_spec) + + def test_deep_copy(self): + original = fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32) + for column in (original, copy.deepcopy(original)): + self.assertEqual('aaa', column.name) + self.assertEqual(3, column.num_buckets) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int32) + }, column.parse_example_spec) + + def test_invalid_dtype(self): + with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'): + fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'), + 
dtype=dtypes.float32) + + def test_invalid_mapping_dtype(self): + with self.assertRaisesRegexp( + ValueError, r'vocabulary dtype must be string or integer'): + fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=(12., 24., 36.)) + + def test_mismatched_int_dtype(self): + with self.assertRaisesRegexp( + ValueError, r'dtype.*and vocabulary dtype.*do not match'): + fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'), + dtype=dtypes.int32) + + def test_mismatched_string_dtype(self): + with self.assertRaisesRegexp( + ValueError, r'dtype.*and vocabulary dtype.*do not match'): + fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.string) + + def test_none_mapping(self): + with self.assertRaisesRegexp( + ValueError, r'vocabulary_list.*must be non-empty'): + fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=None) + + def test_empty_mapping(self): + with self.assertRaisesRegexp( + ValueError, r'vocabulary_list.*must be non-empty'): + fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=tuple([])) + + def test_duplicate_mapping(self): + with self.assertRaisesRegexp(ValueError, 'Duplicate keys'): + fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=(12, 24, 12)) + + def test_invalid_num_oov_buckets(self): + with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'): + fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=(12, 24, 36), + num_oov_buckets=-1) + + def test_invalid_buckets_and_default_value(self): + with self.assertRaisesRegexp( + ValueError, 'both num_oov_buckets and default_value'): + fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=(12, 24, 36), + num_oov_buckets=100, + default_value=2) + + def test_invalid_input_dtype_int32(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=('omar', 'stringer', 'marlo')) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(12, 24, 36), + dense_shape=(2, 2)) + with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'): + column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + + def test_invalid_input_dtype_string(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=(12, 24, 36)) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('omar', 'stringer', 'marlo'), + dense_shape=(2, 2)) + with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'): + column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + + def test_parse_example_string(self): + a = fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=('omar', 'stringer', 'marlo')) + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'aaa': + feature_pb2.Feature(bytes_list=feature_pb2.BytesList( + value=[b'omar', b'stringer'])) + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=fc.make_parse_example_spec([a])) + self.assertIn('aaa', features) + with self.test_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [0, 1]], + values=np.array([b'omar', b'stringer'], dtype=np.object_), + dense_shape=[1, 2]), + features['aaa'].eval()) + + def test_parse_example_int(self): + a = 
fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=(11, 21, 31)) + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'aaa': + feature_pb2.Feature(int64_list=feature_pb2.Int64List( + value=[11, 21])) + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=fc.make_parse_example_spec([a])) + self.assertIn('aaa', features) + with self.test_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [0, 1]], + values=[11, 21], + dense_shape=[1, 2]), + features['aaa'].eval()) + + def test_get_sparse_tensors(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=('omar', 'stringer', 'marlo')) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((2, -1, 0), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_transform_feature(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=('omar', 'stringer', 'marlo')) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + id_tensor = _transform_features({'aaa': inputs}, [column], None)[column] + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((2, -1, 0), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_tensor.eval()) + + def DISABLED_test_get_sparse_tensors_weight_collections(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=('omar', 'stringer', 'marlo')) + inputs = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), + weight_collections=('my_weights',)) + + self.assertItemsEqual( + [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertItemsEqual([], ops.get_collection('my_weights')) + + def test_get_sparse_tensors_dense_input(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=('omar', 'stringer', 'marlo')) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': (('marlo', ''), ('skywalker', 'omar')) + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=np.array((2, -1, 0), dtype=np.int64), + dense_shape=(2, 2)), + id_weight_pair.id_tensor.eval()) + + def test_get_sparse_tensors_default_value_in_vocabulary(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=('omar', 'stringer', 'marlo'), + default_value=2) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + 
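+    # With default_value=2, out-of-vocabulary inputs are mapped to id 2, which
+    # here is also a legitimate vocabulary id ('marlo'): 'marlo' -> 2,
+    # 'skywalker' -> 2 (OOV), 'omar' -> 0, giving the (2, 2, 0) asserted below.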
self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((2, 2, 0), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_get_sparse_tensors_with_oov_buckets(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=('omar', 'stringer', 'marlo'), + num_oov_buckets=100) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1), (1, 2)), + values=('marlo', 'skywalker', 'omar', 'heisenberg'), + dense_shape=(2, 3)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((2, 33, 0, 62), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_get_sparse_tensors_int32(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32), + dtype=dtypes.int32) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1), (2, 2)), + values=np.array((11, 100, 30, 22), dtype=np.int32), + dense_shape=(3, 3)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((2, -1, 0, 4), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_get_sparse_tensors_int32_dense_input(self): + default_value = -100 + column = fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32), + dtype=dtypes.int32, + default_value=default_value) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': + np.array( + ((11, -1, -1), (100, 30, -1), (-1, -1, 22)), dtype=np.int32) + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1), (2, 2)), + values=np.array((2, default_value, 0, 4), dtype=np.int64), + dense_shape=(3, 3)), + id_weight_pair.id_tensor.eval()) + + def test_get_sparse_tensors_int32_with_oov_buckets(self): + column = fc.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32), + dtype=dtypes.int32, + num_oov_buckets=100) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1), (2, 2)), + values=(11, 100, 30, 22), + dense_shape=(3, 3)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((2, 60, 0, 4), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_linear_model(self): + wire_column = fc_old.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=('omar', 'stringer', 'marlo'), + 
num_oov_buckets=1) + self.assertEqual(4, wire_column._num_buckets) + with ops.Graph().as_default(): + predictions = fc.linear_model({ + wire_column.name: sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + }, (wire_column,)) + bias = get_linear_model_bias() + wire_var = get_linear_model_column_var(wire_column) + with _initialized_session(): + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval() + # 'marlo' -> 2: wire_var[2] = 3 + # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5 + self.assertAllClose(((3.,), (5.,)), predictions.eval()) + + def test_keras_linear_model(self): + wire_column = fc_old.categorical_column_with_vocabulary_list( + key='aaa', + vocabulary_list=('omar', 'stringer', 'marlo'), + num_oov_buckets=1) + self.assertEqual(4, wire_column._num_buckets) + with ops.Graph().as_default(): + predictions = get_keras_linear_model_predictions({ + wire_column.name: + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('marlo', 'skywalker', 'omar'), + dense_shape=(2, 2)) + }, (wire_column,)) + bias = get_linear_model_bias() + wire_var = get_linear_model_column_var(wire_column) + with _initialized_session(): + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval() + # 'marlo' -> 2: wire_var[2] = 3 + # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5 + self.assertAllClose(((3.,), (5.,)), predictions.eval()) + + +class IdentityCategoricalColumnTest(test.TestCase): + + def test_constructor(self): + column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) + self.assertEqual('aaa', column.name) + self.assertEqual('aaa', column.key) + self.assertEqual(3, column.num_buckets) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int64) + }, column.parse_example_spec) + + def test_key_should_be_string(self): + with self.assertRaisesRegexp(ValueError, 'key must be a string.'): + fc.categorical_column_with_identity(key=('aaa',), num_buckets=3) + + def test_deep_copy(self): + original = fc.categorical_column_with_identity(key='aaa', num_buckets=3) + for column in (original, copy.deepcopy(original)): + self.assertEqual('aaa', column.name) + self.assertEqual(3, column.num_buckets) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int64) + }, column.parse_example_spec) + + def test_invalid_num_buckets_zero(self): + with self.assertRaisesRegexp(ValueError, 'num_buckets 0 < 1'): + fc.categorical_column_with_identity(key='aaa', num_buckets=0) + + def test_invalid_num_buckets_negative(self): + with self.assertRaisesRegexp(ValueError, 'num_buckets -1 < 1'): + fc.categorical_column_with_identity(key='aaa', num_buckets=-1) + + def test_invalid_default_value_too_small(self): + with self.assertRaisesRegexp(ValueError, 'default_value -1 not in range'): + fc.categorical_column_with_identity( + key='aaa', num_buckets=3, default_value=-1) + + def test_invalid_default_value_too_big(self): + with self.assertRaisesRegexp(ValueError, 'default_value 3 not in range'): + fc.categorical_column_with_identity( + key='aaa', num_buckets=3, default_value=3) + + def test_invalid_input_dtype(self): + column = 
fc.categorical_column_with_identity(key='aaa', num_buckets=3) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('omar', 'stringer', 'marlo'), + dense_shape=(2, 2)) + with self.assertRaisesRegexp(ValueError, 'Invalid input, not integer'): + column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + + def test_parse_example(self): + a = fc.categorical_column_with_identity(key='aaa', num_buckets=30) + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'aaa': + feature_pb2.Feature(int64_list=feature_pb2.Int64List( + value=[11, 21])) + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=fc.make_parse_example_spec([a])) + self.assertIn('aaa', features) + with self.test_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [0, 1]], + values=np.array([11, 21], dtype=np.int64), + dense_shape=[1, 2]), + features['aaa'].eval()) + + def test_get_sparse_tensors(self): + column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((0, 1, 0), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_transform_feature(self): + column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2)) + id_tensor = _transform_features({'aaa': inputs}, [column], None)[column] + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((0, 1, 0), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_tensor.eval()) + + def DISABLED_test_get_sparse_tensors_weight_collections(self): + column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2)) + column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), + weight_collections=('my_weights',)) + + self.assertItemsEqual( + [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertItemsEqual([], ops.get_collection('my_weights')) + + def test_get_sparse_tensors_dense_input(self): + column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': ((0, -1), (1, 0)) + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=np.array((0, 1, 0), dtype=np.int64), + dense_shape=(2, 2)), + id_weight_pair.id_tensor.eval()) + + def test_get_sparse_tensors_with_inputs_too_small(self): + column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, -1, 0), + dense_shape=(2, 2)) + id_weight_pair = 
column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + with self.assertRaisesRegexp( + errors.OpError, 'assert_greater_or_equal_0'): + id_weight_pair.id_tensor.eval() + + def test_get_sparse_tensors_with_inputs_too_big(self): + column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 99, 0), + dense_shape=(2, 2)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + with self.assertRaisesRegexp( + errors.OpError, 'assert_less_than_num_buckets'): + id_weight_pair.id_tensor.eval() + + def test_get_sparse_tensors_with_default_value(self): + column = fc.categorical_column_with_identity( + key='aaa', num_buckets=4, default_value=3) + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, -1, 99), + dense_shape=(2, 2)) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array((1, 3, 3), dtype=np.int64), + dense_shape=inputs.dense_shape), + id_weight_pair.id_tensor.eval()) + + def test_get_sparse_tensors_with_default_value_and_placeholder_inputs(self): + column = fc.categorical_column_with_identity( + key='aaa', num_buckets=4, default_value=3) + input_indices = array_ops.placeholder(dtype=dtypes.int64) + input_values = array_ops.placeholder(dtype=dtypes.int32) + input_shape = array_ops.placeholder(dtype=dtypes.int64) + inputs = sparse_tensor.SparseTensorValue( + indices=input_indices, + values=input_values, + dense_shape=input_shape) + id_weight_pair = column.get_sparse_tensors( + FeatureTransformationCache({ + 'aaa': inputs + }), None) + self.assertIsNone(id_weight_pair.weight_tensor) + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=np.array(((0, 0), (1, 0), (1, 1)), dtype=np.int64), + values=np.array((1, 3, 3), dtype=np.int64), + dense_shape=np.array((2, 2), dtype=np.int64)), + id_weight_pair.id_tensor.eval(feed_dict={ + input_indices: ((0, 0), (1, 0), (1, 1)), + input_values: (1, -1, 99), + input_shape: (2, 2), + })) + + def test_linear_model(self): + column = fc_old.categorical_column_with_identity(key='aaa', num_buckets=3) + self.assertEqual(3, column.num_buckets) + with ops.Graph().as_default(): + predictions = fc.linear_model({ + column.name: sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 2, 1), + dense_shape=(2, 2)) + }, (column,)) + bias = get_linear_model_bias() + weight_var = get_linear_model_column_var(column) + with _initialized_session(): + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + weight_var.assign(((1.,), (2.,), (3.,))).eval() + # weight_var[0] = 1 + # weight_var[2] + weight_var[1] = 3+2 = 5 + self.assertAllClose(((1.,), (5.,)), predictions.eval()) + + def test_keras_linear_model(self): + column = fc_old.categorical_column_with_identity(key='aaa', num_buckets=3) + self.assertEqual(3, column.num_buckets) + with ops.Graph().as_default(): + 
predictions = get_keras_linear_model_predictions({
+          column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] = 1
+        # weight_var[2] + weight_var[1] = 3+2 = 5
+        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+
+
+class TransformFeaturesTest(test.TestCase):
+
+  # All transform tests are distributed among the column tests.
+  # Here we only test the multi-column case and naming.
+  def transform_multi_column(self):
+    bucketized_price = fc.bucketized_column(
+        fc.numeric_column('price'), boundaries=[0, 2, 4, 6])
+    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    with ops.Graph().as_default():
+      features = {
+          'price': [[-1.], [5.]],
+          'wire':
+              sparse_tensor.SparseTensor(
+                  values=['omar', 'stringer', 'marlo'],
+                  indices=[[0, 0], [1, 0], [1, 1]],
+                  dense_shape=[2, 2])
+      }
+      transformed = _transform_features(features,
+                                        [bucketized_price, hashed_sparse],
+                                        None)
+      with _initialized_session():
+        self.assertIn(bucketized_price.name,
+                      transformed[bucketized_price].name)
+        self.assertAllEqual([[0], [3]], transformed[bucketized_price].eval())
+        self.assertIn(hashed_sparse.name, transformed[hashed_sparse].name)
+        self.assertAllEqual([6, 4, 1], transformed[hashed_sparse].values.eval())
+
+  def test_column_order(self):
+    """Tests that columns are transformed in sorted order of their names."""
+
+    class _LoggerColumn(FeatureColumn):
+
+      def __init__(self, name):
+        self._name = name
+
+      @property
+      def name(self):
+        return self._name
+
+      def transform_feature(self, transformation_cache, state_manager):
+        self.call_order = call_logger['count']
+        call_logger['count'] += 1
+        return 'Anything'
+
+      @property
+      def parse_example_spec(self):
+        pass
+
+    with ops.Graph().as_default():
+      column1 = _LoggerColumn('1')
+      column2 = _LoggerColumn('2')
+      call_logger = {'count': 0}
+      _transform_features({}, [column1, column2], None)
+      self.assertEqual(0, column1.call_order)
+      self.assertEqual(1, column2.call_order)
+
+      call_logger = {'count': 0}
+      _transform_features({}, [column2, column1], None)
+      self.assertEqual(0, column1.call_order)
+      self.assertEqual(1, column2.call_order)
+
+
+class IndicatorColumnTest(test.TestCase):
+
+  def test_indicator_column(self):
+    a = fc.categorical_column_with_hash_bucket('a', 4)
+    indicator_a = fc.indicator_column(a)
+    self.assertEqual(indicator_a.categorical_column.name, 'a')
+    self.assertEqual(indicator_a.name, 'a_indicator')
+    self.assertEqual(indicator_a.variable_shape, [1, 4])
+
+    b = fc.categorical_column_with_hash_bucket('b', hash_bucket_size=100)
+    indicator_b = fc.indicator_column(b)
+    self.assertEqual(indicator_b.categorical_column.name, 'b')
+    self.assertEqual(indicator_b.name, 'b_indicator')
+    self.assertEqual(indicator_b.variable_shape, [1, 100])
+
+  def test_1D_shape_succeeds(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_hash_bucket('animal', 4))
+    transformation_cache = FeatureTransformationCache({
+        'animal': ['fox', 'fox']
+    })
+    output = transformation_cache.get(animal, None)
+    with self.test_session():
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+
+  def test_2D_shape_succeeds(self):
+    # TODO(ispir/cassandrax): Switch to categorical_column_with_keys when
+    # ready.
+    animal = fc.indicator_column(
+        fc.categorical_column_with_hash_bucket('animal', 4))
+    transformation_cache = FeatureTransformationCache({
+        'animal':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0], [1, 0]],
+                values=['fox', 'fox'],
+                dense_shape=[2, 1])
+    })
+    output = transformation_cache.get(animal, None)
+    with self.test_session():
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+
+  def test_multi_hot(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+
+    transformation_cache = FeatureTransformationCache({
+        'animal':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0], [0, 1]], values=[1, 1], dense_shape=[1, 2])
+    })
+    output = transformation_cache.get(animal, None)
+    with self.test_session():
+      self.assertAllEqual([[0., 2., 0., 0.]], output.eval())
+
+  def test_multi_hot2(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    transformation_cache = FeatureTransformationCache({
+        'animal':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+    })
+    output = transformation_cache.get(animal, None)
+    with self.test_session():
+      self.assertAllEqual([[0., 1., 1., 0.]], output.eval())
+
+  def test_deep_copy(self):
+    a = fc.categorical_column_with_hash_bucket('a', 4)
+    column = fc.indicator_column(a)
+    column_copy = copy.deepcopy(column)
+    self.assertEqual(column_copy.categorical_column.name, 'a')
+    self.assertEqual(column_copy.name, 'a_indicator')
+    self.assertEqual(column_copy.variable_shape, [1, 4])
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_indicator = fc.indicator_column(a)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer']))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a_indicator]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_transform(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_indicator = fc.indicator_column(a)
+    features = {
+        'aaa': sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=('marlo', 'skywalker', 'omar'),
+            dense_shape=(2, 2))
+    }
+    indicator_tensor = _transform_features(features, [a_indicator],
+                                           None)[a_indicator]
+    with _initialized_session():
+      self.assertAllEqual([[0, 0, 1], [1, 0, 0]], indicator_tensor.eval())
+
+  def test_transform_with_weighted_column(self):
+    # Github issue 12557
+    ids = fc.categorical_column_with_vocabulary_list(
+        key='ids', vocabulary_list=('a', 'b', 'c'))
+    weights = fc.weighted_categorical_column(ids, 'weights')
+    indicator = fc.indicator_column(weights)
+    features = {
+        'ids': constant_op.constant([['c', 'b', 'a']]),
+        'weights': constant_op.constant([[2., 4., 6.]])
+    }
+    indicator_tensor = _transform_features(features, [indicator],
+                                           None)[indicator]
+    with _initialized_session():
+      self.assertAllEqual([[6., 4., 2.]], indicator_tensor.eval())
+
+  def 
test_transform_with_missing_value_in_weighted_column(self): + # Github issue 12583 + ids = fc.categorical_column_with_vocabulary_list( + key='ids', vocabulary_list=('a', 'b', 'c')) + weights = fc.weighted_categorical_column(ids, 'weights') + indicator = fc.indicator_column(weights) + features = { + 'ids': constant_op.constant([['c', 'b', 'unknown']]), + 'weights': constant_op.constant([[2., 4., 6.]]) + } + indicator_tensor = _transform_features(features, [indicator], + None)[indicator] + with _initialized_session(): + self.assertAllEqual([[0., 4., 2.]], indicator_tensor.eval()) + + def test_transform_with_missing_value_in_categorical_column(self): + # Github issue 12583 + ids = fc.categorical_column_with_vocabulary_list( + key='ids', vocabulary_list=('a', 'b', 'c')) + indicator = fc.indicator_column(ids) + features = { + 'ids': constant_op.constant([['c', 'b', 'unknown']]), + } + indicator_tensor = _transform_features(features, [indicator], + None)[indicator] + with _initialized_session(): + self.assertAllEqual([[0., 1., 1.]], indicator_tensor.eval()) + + def test_linear_model(self): + animal = fc_old.indicator_column( + fc_old.categorical_column_with_identity('animal', num_buckets=4)) + with ops.Graph().as_default(): + features = { + 'animal': + sparse_tensor.SparseTensor( + indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) + } + + predictions = fc.linear_model(features, [animal]) + weight_var = get_linear_model_column_var(animal) + with _initialized_session(): + # All should be zero-initialized. + self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval()) + self.assertAllClose([[0.]], predictions.eval()) + weight_var.assign([[1.], [2.], [3.], [4.]]).eval() + self.assertAllClose([[2. + 3.]], predictions.eval()) + + def test_keras_linear_model(self): + animal = fc_old.indicator_column( + fc_old.categorical_column_with_identity('animal', num_buckets=4)) + with ops.Graph().as_default(): + features = { + 'animal': + sparse_tensor.SparseTensor( + indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) + } + + predictions = get_keras_linear_model_predictions(features, [animal]) + weight_var = get_linear_model_column_var(animal) + with _initialized_session(): + # All should be zero-initialized. + self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval()) + self.assertAllClose([[0.]], predictions.eval()) + weight_var.assign([[1.], [2.], [3.], [4.]]).eval() + self.assertAllClose([[2. + 3.]], predictions.eval()) + + def test_input_layer(self): + animal = fc_old.indicator_column( + fc_old.categorical_column_with_identity('animal', num_buckets=4)) + with ops.Graph().as_default(): + features = { + 'animal': + sparse_tensor.SparseTensor( + indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) + } + net = fc.input_layer(features, [animal]) + with _initialized_session(): + self.assertAllClose([[0., 1., 1., 0.]], net.eval()) + + +class _TestStateManager(StateManager): + + def __init__(self, trainable=True): + # Dict of feature_column to a dict of variables. 
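+    # The mapping has the shape {feature_column: {variable_name: variable}},
+    # so repeated `get_variable` calls for the same column and name return the
+    # cached variable instead of creating a duplicate.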
+ self._all_variables = {} + self._trainable = trainable + + def get_variable(self, + feature_column, + name, + shape, + dtype=None, + initializer=None): + if feature_column not in self._all_variables: + self._all_variables[feature_column] = {} + var_dict = self._all_variables[feature_column] + if name in var_dict: + return var_dict[name] + else: + var = variable_scope.get_variable( + name=name, + shape=shape, + initializer=initializer, + trainable=self._trainable) + var_dict[name] = var + return var + + +class EmbeddingColumnTest(test.TestCase): + + def test_defaults(self): + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=3) + embedding_dimension = 2 + embedding_column = fc.embedding_column( + categorical_column, dimension=embedding_dimension) + self.assertIs(categorical_column, embedding_column.categorical_column) + self.assertEqual(embedding_dimension, embedding_column.dimension) + self.assertEqual('mean', embedding_column.combiner) + self.assertIsNone(embedding_column.ckpt_to_load_from) + self.assertIsNone(embedding_column.tensor_name_in_ckpt) + self.assertIsNone(embedding_column.max_norm) + self.assertTrue(embedding_column.trainable) + self.assertEqual('aaa_embedding', embedding_column.name) + self.assertEqual((embedding_dimension,), embedding_column.variable_shape) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int64) + }, embedding_column.parse_example_spec) + + def test_all_constructor_args(self): + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=3) + embedding_dimension = 2 + embedding_column = fc.embedding_column( + categorical_column, dimension=embedding_dimension, + combiner='my_combiner', initializer=lambda: 'my_initializer', + ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor', + max_norm=42., trainable=False) + self.assertIs(categorical_column, embedding_column.categorical_column) + self.assertEqual(embedding_dimension, embedding_column.dimension) + self.assertEqual('my_combiner', embedding_column.combiner) + self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from) + self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt) + self.assertEqual(42., embedding_column.max_norm) + self.assertFalse(embedding_column.trainable) + self.assertEqual('aaa_embedding', embedding_column.name) + self.assertEqual((embedding_dimension,), embedding_column.variable_shape) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int64) + }, embedding_column.parse_example_spec) + + def test_deep_copy(self): + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=3) + embedding_dimension = 2 + original = fc.embedding_column( + categorical_column, dimension=embedding_dimension, + combiner='my_combiner', initializer=lambda: 'my_initializer', + ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor', + max_norm=42., trainable=False) + for embedding_column in (original, copy.deepcopy(original)): + self.assertEqual('aaa', embedding_column.categorical_column.name) + self.assertEqual(3, embedding_column.categorical_column.num_buckets) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int64) + }, embedding_column.categorical_column.parse_example_spec) + + self.assertEqual(embedding_dimension, embedding_column.dimension) + self.assertEqual('my_combiner', embedding_column.combiner) + self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from) + self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt) + 
self.assertEqual(42., embedding_column.max_norm) + self.assertFalse(embedding_column.trainable) + self.assertEqual('aaa_embedding', embedding_column.name) + self.assertEqual((embedding_dimension,), embedding_column.variable_shape) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int64) + }, embedding_column.parse_example_spec) + + def test_invalid_initializer(self): + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=3) + with self.assertRaisesRegexp(ValueError, 'initializer must be callable'): + fc.embedding_column(categorical_column, dimension=2, initializer='not_fn') + + def test_parse_example(self): + a = fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=('omar', 'stringer', 'marlo')) + a_embedded = fc.embedding_column(a, dimension=2) + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'aaa': + feature_pb2.Feature(bytes_list=feature_pb2.BytesList( + value=[b'omar', b'stringer'])) + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=fc.make_parse_example_spec([a_embedded])) + self.assertIn('aaa', features) + with self.test_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [0, 1]], + values=np.array([b'omar', b'stringer'], dtype=np.object_), + dense_shape=[1, 2]), + features['aaa'].eval()) + + def test_transform_feature(self): + a = fc.categorical_column_with_identity(key='aaa', num_buckets=3) + a_embedded = fc.embedding_column(a, dimension=2) + features = { + 'aaa': sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2)) + } + outputs = _transform_features(features, [a, a_embedded], None) + output_a = outputs[a] + output_embedded = outputs[a_embedded] + with _initialized_session(): + _assert_sparse_tensor_value( + self, output_a.eval(), output_embedded.eval()) + + def test_get_dense_tensor(self): + # Inputs. + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + def _initializer(shape, dtype, partition_info): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, dimension=embedding_dimension, + initializer=_initializer) + state_manager = _TestStateManager() + + # Provide sparse input and get dense result. + embedding_lookup = embedding_column.get_dense_tensor( + FeatureTransformationCache({ + 'aaa': sparse_input + }), state_manager) + + # Assert expected embedding variable and lookups. 
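+ # Editorial cross-check (standalone; np is numpy, imported at the top of
+ # this file): the 'mean' combiner arithmetic behind expected_lookups.
+ _table = np.array([[1., 2.], [3., 5.], [7., 11.]])  # embedding_values
+ _ids = [[2], [0, 1], [], [1]]  # per-example ids decoded from sparse_input
+ _lookups = [_table[i].mean(axis=0) if i else np.zeros(2) for i in _ids]
+ # -> [7., 11.], [2., 3.5], [0., 0.], [3., 5.], matching expected_lookups.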
+ global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual(('embedding_weights:0',), + tuple([v.name for v in global_vars])) + with _initialized_session(): + self.assertAllEqual(embedding_values, global_vars[0].eval()) + self.assertAllEqual(expected_lookups, embedding_lookup.eval()) + + def test_get_dense_tensor_3d(self): + # Inputs. + vocabulary_size = 4 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0, 0), (1, 1, 0), (1, 1, 4), (3, 0, 0), (3, 1, 2)), + values=(2, 0, 1, 1, 2), + dense_shape=(4, 2, 5)) + + # Embedding variable. + embedding_dimension = 3 + embedding_values = ( + (1., 2., 4.), # id 0 + (3., 5., 1.), # id 1 + (7., 11., 2.), # id 2 + (2., 7., 12.) # id 3 + ) + def _initializer(shape, dtype, partition_info): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [[2], []], embedding = [[7, 11, 2], [0, 0, 0]] + ((7., 11., 2.), (0., 0., 0.)), + # example 1, ids [[], [0, 1]], embedding + # = mean([[], [1, 2, 4] + [3, 5, 1]]) = [[0, 0, 0], [2, 3.5, 2.5]] + ((0., 0., 0.), (2., 3.5, 2.5)), + # example 2, ids [[], []], embedding = [[0, 0, 0], [0, 0, 0]] + ((0., 0., 0.), (0., 0., 0.)), + # example 3, ids [[1], [2]], embedding = [[3, 5, 1], [7, 11, 2]] + ((3., 5., 1.), (7., 11., 2.)), + ) + + # Build columns. + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, dimension=embedding_dimension, + initializer=_initializer) + state_manager = _TestStateManager() + + # Provide sparse input and get dense result. + embedding_lookup = embedding_column.get_dense_tensor( + FeatureTransformationCache({ + 'aaa': sparse_input + }), state_manager) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual(('embedding_weights:0',), + tuple([v.name for v in global_vars])) + with _initialized_session(): + self.assertAllEqual(embedding_values, global_vars[0].eval()) + self.assertAllEqual(expected_lookups, embedding_lookup.eval()) + + def DISABLED_test_get_dense_tensor_weight_collections(self): + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Build columns. + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=3) + embedding_column = fc.embedding_column(categorical_column, dimension=2) + + # Provide sparse input and get dense result. + embedding_column.get_dense_tensor( + FeatureTransformationCache({ + 'aaa': sparse_input + }), + weight_collections=('my_vars',)) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual(('embedding_weights:0',), + tuple([v.name for v in global_vars])) + my_vars = ops.get_collection('my_vars') + self.assertItemsEqual( + ('embedding_weights:0',), tuple([v.name for v in my_vars])) + + def test_get_dense_tensor_placeholder_inputs(self): + # Inputs. 
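+ # Editorial sketch (standalone, illustrative names): how the
+ # SparseTensorValue inputs in these tests encode the per-example id lists
+ # given in the "example i, ids [...]" comments.
+ def _decode_ids(indices, values, num_rows):
+   rows = [[] for _ in range(num_rows)]
+   for (row, _), value in zip(indices, values):
+     rows[row].append(value)
+   return rows
+ # _decode_ids(((0, 0), (1, 0), (1, 4), (3, 0)), (2, 0, 1, 1), 4)
+ # -> [[2], [0, 1], [], [1]]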
+ vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + def _initializer(shape, dtype, partition_info): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, dimension=embedding_dimension, + initializer=_initializer) + state_manager = _TestStateManager() + + # Provide sparse input and get dense result. + input_indices = array_ops.placeholder(dtype=dtypes.int64) + input_values = array_ops.placeholder(dtype=dtypes.int64) + input_shape = array_ops.placeholder(dtype=dtypes.int64) + embedding_lookup = embedding_column.get_dense_tensor( + FeatureTransformationCache({ + 'aaa': + sparse_tensor.SparseTensorValue( + indices=input_indices, + values=input_values, + dense_shape=input_shape) + }), state_manager) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual( + ('embedding_weights:0',), tuple([v.name for v in global_vars])) + with _initialized_session(): + self.assertAllEqual(embedding_values, global_vars[0].eval()) + self.assertAllEqual(expected_lookups, embedding_lookup.eval( + feed_dict={ + input_indices: sparse_input.indices, + input_values: sparse_input.values, + input_shape: sparse_input.dense_shape, + })) + + def test_get_dense_tensor_restore_from_ckpt(self): + # Inputs. + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. The checkpoint file contains _embedding_values. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + ckpt_path = test.test_src_dir_path( + 'python/feature_column/testdata/embedding.ckpt') + ckpt_tensor = 'my_embedding' + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, dimension=embedding_dimension, + ckpt_to_load_from=ckpt_path, + tensor_name_in_ckpt=ckpt_tensor) + state_manager = _TestStateManager() + + # Provide sparse input and get dense result. 
+ embedding_lookup = embedding_column.get_dense_tensor( + FeatureTransformationCache({ + 'aaa': sparse_input + }), state_manager) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual( + ('embedding_weights:0',), tuple([v.name for v in global_vars])) + with _initialized_session(): + self.assertAllEqual(embedding_values, global_vars[0].eval()) + self.assertAllEqual(expected_lookups, embedding_lookup.eval()) + + def test_linear_model(self): + # Inputs. + batch_size = 4 + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(batch_size, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_shape = (vocabulary_size, embedding_dimension) + zeros_embedding_values = np.zeros(embedding_shape) + def _initializer(shape, dtype, partition_info): + self.assertAllEqual(embedding_shape, shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return zeros_embedding_values + + # Build columns. + categorical_column = fc_old.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc_old.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer) + + with ops.Graph().as_default(): + predictions = fc.linear_model({ + categorical_column.name: sparse_input + }, (embedding_column,)) + expected_var_names = ( + 'linear_model/bias_weights:0', + 'linear_model/aaa_embedding/weights:0', + 'linear_model/aaa_embedding/embedding_weights:0', + ) + self.assertItemsEqual( + expected_var_names, + [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) + trainable_vars = { + v.name: v for v in ops.get_collection( + ops.GraphKeys.TRAINABLE_VARIABLES) + } + self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + bias = trainable_vars['linear_model/bias_weights:0'] + embedding_weights = trainable_vars[ + 'linear_model/aaa_embedding/embedding_weights:0'] + linear_weights = trainable_vars[ + 'linear_model/aaa_embedding/weights:0'] + with _initialized_session(): + # Predictions with all zero weights. + self.assertAllClose(np.zeros((1,)), bias.eval()) + self.assertAllClose(zeros_embedding_values, embedding_weights.eval()) + self.assertAllClose( + np.zeros((embedding_dimension, 1)), linear_weights.eval()) + self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval()) + + # Predictions with all non-zero weights. + embedding_weights.assign(( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + )).eval() + linear_weights.assign(((4.,), (6.,))).eval() + # example 0, ids [2], embedding[0] = [7, 11] + # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5] + # example 2, ids [], embedding[2] = [0, 0] + # example 3, ids [1], embedding[3] = [3, 5] + # sum(embeddings * linear_weights) + # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42] + self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval()) + + def test_keras_linear_model(self): + # Inputs. + batch_size = 4 + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(batch_size, 5)) + + # Embedding variable. 
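+ # Editorial cross-check of the [94, 29, 0, 42] predictions asserted in
+ # test_linear_model above (standalone numpy; bias is zero):
+ _emb = np.array([[1., 2.], [3., 5.], [7., 11.]])
+ _pooled = np.array([_emb[[2]].mean(0), _emb[[0, 1]].mean(0),
+                     np.zeros(2), _emb[[1]].mean(0)])
+ _preds = _pooled.dot([[4.], [6.]])  # -> [[94.], [29.], [0.], [42.]]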
+ embedding_dimension = 2 + embedding_shape = (vocabulary_size, embedding_dimension) + zeros_embedding_values = np.zeros(embedding_shape) + + def _initializer(shape, dtype, partition_info): + self.assertAllEqual(embedding_shape, shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return zeros_embedding_values + + # Build columns. + categorical_column = fc_old.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc_old.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer) + + with ops.Graph().as_default(): + predictions = get_keras_linear_model_predictions({ + categorical_column.name: sparse_input + }, (embedding_column,)) + expected_var_names = ( + 'linear_model/bias_weights:0', + 'linear_model/aaa_embedding/weights:0', + 'linear_model/aaa_embedding/embedding_weights:0', + ) + self.assertItemsEqual( + expected_var_names, + [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) + trainable_vars = { + v.name: v + for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + } + self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + bias = trainable_vars['linear_model/bias_weights:0'] + embedding_weights = trainable_vars[ + 'linear_model/aaa_embedding/embedding_weights:0'] + linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0'] + with _initialized_session(): + # Predictions with all zero weights. + self.assertAllClose(np.zeros((1,)), bias.eval()) + self.assertAllClose(zeros_embedding_values, embedding_weights.eval()) + self.assertAllClose( + np.zeros((embedding_dimension, 1)), linear_weights.eval()) + self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval()) + + # Predictions with all non-zero weights. + embedding_weights.assign(( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + )).eval() + linear_weights.assign(((4.,), (6.,))).eval() + # example 0, ids [2], embedding[0] = [7, 11] + # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5] + # example 2, ids [], embedding[2] = [0, 0] + # example 3, ids [1], embedding[3] = [3, 5] + # sum(embeddings * linear_weights) + # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42] + self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval()) + + def test_input_layer(self): + # Inputs. + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + def _initializer(shape, dtype, partition_info): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. 
+ categorical_column = fc_old.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc_old.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer) + + # Provide sparse input and get dense result. + input_layer = fc.input_layer({'aaa': sparse_input}, (embedding_column,)) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual( + ('input_layer/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertItemsEqual( + ('input_layer/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in trainable_vars])) + with _initialized_session(): + self.assertAllEqual(embedding_values, trainable_vars[0].eval()) + self.assertAllEqual(expected_lookups, input_layer.eval()) + + def test_input_layer_not_trainable(self): + # Inputs. + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + def _initializer(shape, dtype, partition_info): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. + categorical_column = fc_old.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc_old.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + trainable=False) + + # Provide sparse input and get dense result. + input_layer = fc.input_layer({'aaa': sparse_input}, (embedding_column,)) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual( + ('input_layer/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + self.assertItemsEqual( + [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) + with _initialized_session(): + self.assertAllEqual(embedding_values, global_vars[0].eval()) + self.assertAllEqual(expected_lookups, input_layer.eval()) + + +class _TestSharedEmbeddingStateManager(StateManager): + """Manages the state for shared embedding columns. + + This can handle multiple groups of shared embedding columns. + """ + + def __init__(self, trainable=True): + # Dict of shared_embedding_collection_name to a dict of variables. + self._all_variables = {} + self._trainable = trainable + + def get_variable(self, + feature_column, + name, + shape, + dtype=None, + initializer=None): + if not isinstance(feature_column, fc.SharedEmbeddingColumn): + raise ValueError( + 'SharedEmbeddingStateManager can only handle SharedEmbeddingColumns. 
' + 'Given type: {} '.format(type(feature_column))) + + collection_name = feature_column.shared_collection_name + if collection_name not in self._all_variables: + self._all_variables[collection_name] = {} + var_dict = self._all_variables[collection_name] + if name in var_dict: + return var_dict[name] + else: + var = variable_scope.get_variable( + name=name, + shape=shape, + initializer=initializer, + trainable=self._trainable) + var_dict[name] = var + return var + + +class SharedEmbeddingColumnTest(test.TestCase): + + def test_defaults(self): + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=3) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=3) + embedding_dimension = 2 + embedding_column_b, embedding_column_a = fc.shared_embedding_columns( + [categorical_column_b, categorical_column_a], + dimension=embedding_dimension) + self.assertIs(categorical_column_a, embedding_column_a.categorical_column) + self.assertIs(categorical_column_b, embedding_column_b.categorical_column) + self.assertEqual(embedding_dimension, embedding_column_a.dimension) + self.assertEqual(embedding_dimension, embedding_column_b.dimension) + self.assertEqual('mean', embedding_column_a.combiner) + self.assertEqual('mean', embedding_column_b.combiner) + self.assertIsNone(embedding_column_a.ckpt_to_load_from) + self.assertIsNone(embedding_column_b.ckpt_to_load_from) + self.assertEqual('aaa_bbb_shared_embedding', + embedding_column_a.shared_collection_name) + self.assertEqual('aaa_bbb_shared_embedding', + embedding_column_b.shared_collection_name) + self.assertIsNone(embedding_column_a.tensor_name_in_ckpt) + self.assertIsNone(embedding_column_b.tensor_name_in_ckpt) + self.assertIsNone(embedding_column_a.max_norm) + self.assertIsNone(embedding_column_b.max_norm) + self.assertTrue(embedding_column_a.trainable) + self.assertTrue(embedding_column_b.trainable) + self.assertEqual('aaa_shared_embedding', embedding_column_a.name) + self.assertEqual('bbb_shared_embedding', embedding_column_b.name) + self.assertEqual((embedding_dimension,), embedding_column_a.variable_shape) + self.assertEqual((embedding_dimension,), embedding_column_b.variable_shape) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int64) + }, embedding_column_a.parse_example_spec) + self.assertEqual({ + 'bbb': parsing_ops.VarLenFeature(dtypes.int64) + }, embedding_column_b.parse_example_spec) + + def test_all_constructor_args(self): + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=3) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=3) + embedding_dimension = 2 + embedding_column_a, embedding_column_b = fc.shared_embedding_columns( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + combiner='my_combiner', + initializer=lambda: 'my_initializer', + shared_embedding_collection_name='shared_embedding_collection_name', + ckpt_to_load_from='my_ckpt', + tensor_name_in_ckpt='my_ckpt_tensor', + max_norm=42., + trainable=False) + self.assertIs(categorical_column_a, embedding_column_a.categorical_column) + self.assertIs(categorical_column_b, embedding_column_b.categorical_column) + self.assertEqual(embedding_dimension, embedding_column_a.dimension) + self.assertEqual(embedding_dimension, embedding_column_b.dimension) + self.assertEqual('my_combiner', embedding_column_a.combiner) + self.assertEqual('my_combiner', embedding_column_b.combiner) + 
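+ # Editorial note: the default shared_collection_name checked in
+ # test_defaults above appears to be derived from the column names sorted
+ # alphabetically (the columns were passed as [b, a] there). A hypothetical
+ # reconstruction:
+ _shared_name = '_'.join(sorted(['bbb', 'aaa'])) + '_shared_embedding'
+ # -> 'aaa_bbb_shared_embedding'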
self.assertEqual('shared_embedding_collection_name', + embedding_column_a.shared_collection_name) + self.assertEqual('shared_embedding_collection_name', + embedding_column_b.shared_collection_name) + self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from) + self.assertEqual('my_ckpt', embedding_column_b.ckpt_to_load_from) + self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt) + self.assertEqual('my_ckpt_tensor', embedding_column_b.tensor_name_in_ckpt) + self.assertEqual(42., embedding_column_a.max_norm) + self.assertEqual(42., embedding_column_b.max_norm) + self.assertFalse(embedding_column_a.trainable) + self.assertFalse(embedding_column_b.trainable) + self.assertEqual('aaa_shared_embedding', embedding_column_a.name) + self.assertEqual('bbb_shared_embedding', embedding_column_b.name) + self.assertEqual((embedding_dimension,), embedding_column_a.variable_shape) + self.assertEqual((embedding_dimension,), embedding_column_b.variable_shape) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int64) + }, embedding_column_a.parse_example_spec) + self.assertEqual({ + 'bbb': parsing_ops.VarLenFeature(dtypes.int64) + }, embedding_column_b.parse_example_spec) + + def test_deep_copy(self): + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=3) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=3) + embedding_dimension = 2 + original_a, _ = fc.shared_embedding_columns( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + combiner='my_combiner', + initializer=lambda: 'my_initializer', + shared_embedding_collection_name='shared_embedding_collection_name', + ckpt_to_load_from='my_ckpt', + tensor_name_in_ckpt='my_ckpt_tensor', + max_norm=42., trainable=False) + for embedding_column_a in (original_a, copy.deepcopy(original_a)): + self.assertEqual('aaa', embedding_column_a.categorical_column.name) + self.assertEqual(3, embedding_column_a.categorical_column.num_buckets) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int64) + }, embedding_column_a.categorical_column.parse_example_spec) + + self.assertEqual(embedding_dimension, embedding_column_a.dimension) + self.assertEqual('my_combiner', embedding_column_a.combiner) + self.assertEqual('shared_embedding_collection_name', + embedding_column_a.shared_collection_name) + self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from) + self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt) + self.assertEqual(42., embedding_column_a.max_norm) + self.assertFalse(embedding_column_a.trainable) + self.assertEqual('aaa_shared_embedding', embedding_column_a.name) + self.assertEqual((embedding_dimension,), + embedding_column_a.variable_shape) + self.assertEqual({ + 'aaa': parsing_ops.VarLenFeature(dtypes.int64) + }, embedding_column_a.parse_example_spec) + + def test_invalid_initializer(self): + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=3) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=3) + with self.assertRaisesRegexp(ValueError, 'initializer must be callable'): + fc.shared_embedding_columns( + [categorical_column_a, categorical_column_b], dimension=2, + initializer='not_fn') + + def test_incompatible_column_type(self): + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=3) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=3) + 
categorical_column_c = fc.categorical_column_with_hash_bucket( + key='ccc', hash_bucket_size=3) + with self.assertRaisesRegexp( + ValueError, 'all categorical_columns must have the same type.*' + 'IdentityCategoricalColumn.*HashedCategoricalColumn'): + fc.shared_embedding_columns( + [categorical_column_a, categorical_column_b, categorical_column_c], + dimension=2) + + def test_weighted_categorical_column_ok(self): + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=3) + weighted_categorical_column_a = fc.weighted_categorical_column( + categorical_column_a, weight_feature_key='aaa_weights') + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=3) + weighted_categorical_column_b = fc.weighted_categorical_column( + categorical_column_b, weight_feature_key='bbb_weights') + fc.shared_embedding_columns( + [weighted_categorical_column_a, categorical_column_b], dimension=2) + fc.shared_embedding_columns( + [categorical_column_a, weighted_categorical_column_b], dimension=2) + fc.shared_embedding_columns( + [weighted_categorical_column_a, weighted_categorical_column_b], + dimension=2) + + def test_parse_example(self): + a = fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=('omar', 'stringer', 'marlo')) + b = fc.categorical_column_with_vocabulary_list( + key='bbb', vocabulary_list=('omar', 'stringer', 'marlo')) + a_embedded, b_embedded = fc.shared_embedding_columns( + [a, b], dimension=2) + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'aaa': + feature_pb2.Feature(bytes_list=feature_pb2.BytesList( + value=[b'omar', b'stringer'])), + 'bbb': + feature_pb2.Feature(bytes_list=feature_pb2.BytesList( + value=[b'stringer', b'marlo'])), + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=fc.make_parse_example_spec([a_embedded, b_embedded])) + self.assertIn('aaa', features) + self.assertIn('bbb', features) + with self.test_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [0, 1]], + values=np.array([b'omar', b'stringer'], dtype=np.object_), + dense_shape=[1, 2]), + features['aaa'].eval()) + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [0, 1]], + values=np.array([b'stringer', b'marlo'], dtype=np.object_), + dense_shape=[1, 2]), + features['bbb'].eval()) + + def test_transform_feature(self): + a = fc.categorical_column_with_identity(key='aaa', num_buckets=3) + b = fc.categorical_column_with_identity(key='bbb', num_buckets=3) + a_embedded, b_embedded = fc.shared_embedding_columns( + [a, b], dimension=2) + features = { + 'aaa': sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2)), + 'bbb': sparse_tensor.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 2, 1), + dense_shape=(2, 2)), + } + outputs = _transform_features(features, [a, a_embedded, b, b_embedded], + None) + output_a = outputs[a] + output_a_embedded = outputs[a_embedded] + output_b = outputs[b] + output_b_embedded = outputs[b_embedded] + with _initialized_session(): + _assert_sparse_tensor_value( + self, output_a.eval(), output_a_embedded.eval()) + _assert_sparse_tensor_value( + self, output_b.eval(), output_b_embedded.eval()) + + def test_get_dense_tensor(self): + # Inputs. + vocabulary_size = 3 + # -1 values are ignored. 
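+ # Editorial sketch (standalone numpy) of what "-1 values are ignored" means
+ # for the dense id inputs below: negative entries act as padding.
+ _dense = np.array([[2, -1, -1], [0, 1, -1]])
+ _rows = [[v for v in row if v >= 0] for row in _dense]  # -> [[2], [0, 1]]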
+ input_a = np.array( + [[2, -1, -1], # example 0, ids [2] + [0, 1, -1]]) # example 1, ids [0, 1] + input_b = np.array( + [[0, -1, -1], # example 0, ids [0] + [-1, -1, -1]]) # example 1, ids [] + input_features = { + 'aaa': input_a, + 'bbb': input_b + } + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + def _initializer(shape, dtype, partition_info): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups_a = ( + # example 0: + (7., 11.), # ids [2], embedding = [7, 11] + # example 1: + (2., 3.5), # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + ) + expected_lookups_b = ( + # example 0: + (1., 2.), # ids [0], embedding = [1, 2] + # example 1: + (0., 0.), # ids [], embedding = [0, 0] + ) + + # Build columns. + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=vocabulary_size) + embedding_column_a, embedding_column_b = fc.shared_embedding_columns( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, initializer=_initializer) + state_manager = _TestSharedEmbeddingStateManager() + + # Provide sparse input and get dense result. + embedding_lookup_a = embedding_column_a.get_dense_tensor( + FeatureTransformationCache(input_features), state_manager) + embedding_lookup_b = embedding_column_b.get_dense_tensor( + FeatureTransformationCache(input_features), state_manager) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual(('embedding_weights:0',), + tuple([v.name for v in global_vars])) + embedding_var = global_vars[0] + with _initialized_session(): + self.assertAllEqual(embedding_values, embedding_var.eval()) + self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval()) + self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval()) + + def DISABLED_test_get_dense_tensor_weight_collections(self): + # Inputs. + vocabulary_size = 3 + # -1 values are ignored. + input_a = np.array([ + [2, -1, -1], # example 0, ids [2] + [0, 1, -1] + ]) # example 1, ids [0, 1] + input_b = np.array([ + [0, -1, -1], # example 0, ids [0] + [-1, -1, -1] + ]) # example 1, ids [] + input_features = {'aaa': input_a, 'bbb': input_b} + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Build columns. + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=vocabulary_size) + embedding_column_a, embedding_column_b = fc.shared_embedding_columns( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + initializer=_initializer) + + fc.input_layer( + input_features, [embedding_column_a, embedding_column_b], + weight_collections=('my_vars',)) + + # Assert expected embedding variable and lookups. 
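+ # Editorial cross-check of expected_lookups_a / expected_lookups_b from
+ # test_get_dense_tensor above (standalone numpy): both columns read the
+ # same table; only their id inputs differ.
+ _table = np.array([[1., 2.], [3., 5.], [7., 11.]])
+ def _mean(ids):
+   return _table[ids].mean(axis=0) if ids else np.zeros(2)
+ _lookups_a = [_mean([2]), _mean([0, 1])]  # [7., 11.] and [2., 3.5]
+ _lookups_b = [_mean([0]), _mean([])]      # [1., 2.] and [0., 0.]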
+ global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual( + ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',), + tuple(v.name for v in global_vars)) + my_vars = ops.get_collection('my_vars') + self.assertItemsEqual( + ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',), + tuple(v.name for v in my_vars)) + + def test_get_dense_tensor_placeholder_inputs(self): + # Inputs. + vocabulary_size = 3 + # -1 values are ignored. + input_a = np.array( + [[2, -1, -1], # example 0, ids [2] + [0, 1, -1]]) # example 1, ids [0, 1] + input_b = np.array( + [[0, -1, -1], # example 0, ids [0] + [-1, -1, -1]]) # example 1, ids [] + # Specify shape, because dense input must have rank specified. + input_a_placeholder = array_ops.placeholder( + dtype=dtypes.int64, shape=[None, 3]) + input_b_placeholder = array_ops.placeholder( + dtype=dtypes.int64, shape=[None, 3]) + input_features = { + 'aaa': input_a_placeholder, + 'bbb': input_b_placeholder, + } + feed_dict = { + input_a_placeholder: input_a, + input_b_placeholder: input_b, + } + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + def _initializer(shape, dtype, partition_info): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Build columns. + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=vocabulary_size) + embedding_column_a, embedding_column_b = fc.shared_embedding_columns( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, initializer=_initializer) + state_manager = _TestSharedEmbeddingStateManager() + + # Provide sparse input and get dense result. + embedding_lookup_a = embedding_column_a.get_dense_tensor( + FeatureTransformationCache(input_features), state_manager) + embedding_lookup_b = embedding_column_b.get_dense_tensor( + FeatureTransformationCache(input_features), state_manager) + + with _initialized_session() as sess: + sess.run([embedding_lookup_a, embedding_lookup_b], feed_dict=feed_dict) + + def test_linear_model(self): + # Inputs. + batch_size = 2 + vocabulary_size = 3 + # -1 values are ignored. + input_a = np.array( + [[2, -1, -1], # example 0, ids [2] + [0, 1, -1]]) # example 1, ids [0, 1] + input_b = np.array( + [[0, -1, -1], # example 0, ids [0] + [-1, -1, -1]]) # example 1, ids [] + + # Embedding variable. + embedding_dimension = 2 + embedding_shape = (vocabulary_size, embedding_dimension) + zeros_embedding_values = np.zeros(embedding_shape) + def _initializer(shape, dtype, partition_info): + self.assertAllEqual(embedding_shape, shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return zeros_embedding_values + + # Build columns. 
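+ # Editorial cross-check of the predictions asserted at the end of this test
+ # (standalone numpy; bias is zero): one shared embedding table, but separate
+ # linear weights per column.
+ _emb = np.array([[1., 2.], [3., 5.], [7., 11.]])
+ _pooled_a = np.array([_emb[[2]].mean(0), _emb[[0, 1]].mean(0)])
+ _pooled_b = np.array([_emb[[0]].mean(0), np.zeros(2)])
+ _preds = _pooled_a.dot([[4.], [6.]]) + _pooled_b.dot([[3.], [5.]])
+ # -> [[94. + 13.], [29. + 0.]] == [[107.], [29.]]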
+ categorical_column_a = fc_old.categorical_column_with_identity(
+ key='aaa', num_buckets=vocabulary_size)
+ categorical_column_b = fc_old.categorical_column_with_identity(
+ key='bbb', num_buckets=vocabulary_size)
+ embedding_column_a, embedding_column_b = fc_old.shared_embedding_columns(
+ [categorical_column_a, categorical_column_b],
+ dimension=embedding_dimension,
+ initializer=_initializer)
+
+ with ops.Graph().as_default():
+ predictions = fc.linear_model({
+ categorical_column_a.name: input_a,
+ categorical_column_b.name: input_b,
+ }, (embedding_column_a, embedding_column_b))
+ # Linear weights do not follow the column name. But this is a rare use
+ # case, and fixing it would add too much complexity to the code.
+ expected_var_names = (
+ 'linear_model/bias_weights:0',
+ 'linear_model/aaa_bbb_shared_embedding/weights:0',
+ 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
+ 'linear_model/aaa_bbb_shared_embedding_1/weights:0',
+ )
+ self.assertItemsEqual(
+ expected_var_names,
+ [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+ trainable_vars = {
+ v.name: v for v in ops.get_collection(
+ ops.GraphKeys.TRAINABLE_VARIABLES)
+ }
+ self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+ bias = trainable_vars['linear_model/bias_weights:0']
+ embedding_weights = trainable_vars[
+ 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
+ linear_weights_a = trainable_vars[
+ 'linear_model/aaa_bbb_shared_embedding/weights:0']
+ linear_weights_b = trainable_vars[
+ 'linear_model/aaa_bbb_shared_embedding_1/weights:0']
+ with _initialized_session():
+ # Predictions with all zero weights.
+ self.assertAllClose(np.zeros((1,)), bias.eval())
+ self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+ self.assertAllClose(
+ np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+ self.assertAllClose(
+ np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
+ self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+ # Predictions with all non-zero weights.
+ embedding_weights.assign((
+ (1., 2.), # id 0
+ (3., 5.), # id 1
+ (7., 11.) # id 2
+ )).eval()
+ linear_weights_a.assign(((4.,), (6.,))).eval()
+ # example 0, ids [2], embedding[0] = [7, 11]
+ # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+ # sum(embeddings * linear_weights)
+ # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
+ linear_weights_b.assign(((3.,), (5.,))).eval()
+ # example 0, ids [0], embedding[0] = [1, 2]
+ # example 1, ids [], embedding[1] = [0, 0]
+ # sum(embeddings * linear_weights)
+ # = [3*1 + 5*2, 3*0 + 5*0] = [13, 0]
+ self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+
+ def test_keras_linear_model(self):
+ # Inputs.
+ batch_size = 2
+ vocabulary_size = 3
+ # -1 values are ignored.
+ input_a = np.array([
+ [2, -1, -1], # example 0, ids [2]
+ [0, 1, -1]
+ ]) # example 1, ids [0, 1]
+ input_b = np.array([
+ [0, -1, -1], # example 0, ids [0]
+ [-1, -1, -1]
+ ]) # example 1, ids []
+
+ # Embedding variable.
+ embedding_dimension = 2
+ embedding_shape = (vocabulary_size, embedding_dimension)
+ zeros_embedding_values = np.zeros(embedding_shape)
+
+ def _initializer(shape, dtype, partition_info):
+ self.assertAllEqual(embedding_shape, shape)
+ self.assertEqual(dtypes.float32, dtype)
+ self.assertIsNone(partition_info)
+ return zeros_embedding_values
+
+ # Build columns.
+ categorical_column_a = fc_old.categorical_column_with_identity(
+ key='aaa', num_buckets=vocabulary_size)
+ categorical_column_b = fc_old.categorical_column_with_identity(
+ key='bbb', num_buckets=vocabulary_size)
+ embedding_column_a, embedding_column_b = fc_old.shared_embedding_columns(
+ [categorical_column_a, categorical_column_b],
+ dimension=embedding_dimension,
+ initializer=_initializer)
+
+ with ops.Graph().as_default():
+ predictions = get_keras_linear_model_predictions({
+ categorical_column_a.name: input_a,
+ categorical_column_b.name: input_b,
+ }, (embedding_column_a, embedding_column_b))
+ # Linear weights do not follow the column name. But this is a rare use
+ # case, and fixing it would add too much complexity to the code.
+ expected_var_names = (
+ 'linear_model/bias_weights:0',
+ 'linear_model/aaa_bbb_shared_embedding/weights:0',
+ 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
+ 'linear_model/aaa_bbb_shared_embedding_1/weights:0',
+ )
+ self.assertItemsEqual(
+ expected_var_names,
+ [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+ trainable_vars = {
+ v.name: v
+ for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+ }
+ self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+ bias = trainable_vars['linear_model/bias_weights:0']
+ embedding_weights = trainable_vars[
+ 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
+ linear_weights_a = trainable_vars[
+ 'linear_model/aaa_bbb_shared_embedding/weights:0']
+ linear_weights_b = trainable_vars[
+ 'linear_model/aaa_bbb_shared_embedding_1/weights:0']
+ with _initialized_session():
+ # Predictions with all zero weights.
+ self.assertAllClose(np.zeros((1,)), bias.eval())
+ self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+ self.assertAllClose(
+ np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+ self.assertAllClose(
+ np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
+ self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+ # Predictions with all non-zero weights.
+ embedding_weights.assign((
+ (1., 2.), # id 0
+ (3., 5.), # id 1
+ (7., 11.) # id 2
+ )).eval()
+ linear_weights_a.assign(((4.,), (6.,))).eval()
+ # example 0, ids [2], embedding[0] = [7, 11]
+ # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+ # sum(embeddings * linear_weights)
+ # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
+ linear_weights_b.assign(((3.,), (5.,))).eval()
+ # example 0, ids [0], embedding[0] = [1, 2]
+ # example 1, ids [], embedding[1] = [0, 0]
+ # sum(embeddings * linear_weights)
+ # = [3*1 + 5*2, 3*0 + 5*0] = [13, 0]
+ self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+
+ def _test_input_layer(self, trainable=True):
+ # Inputs.
+ vocabulary_size = 3
+ sparse_input_a = sparse_tensor.SparseTensorValue(
+ # example 0, ids [2]
+ # example 1, ids [0, 1]
+ indices=((0, 0), (1, 0), (1, 4)),
+ values=(2, 0, 1),
+ dense_shape=(2, 5))
+ sparse_input_b = sparse_tensor.SparseTensorValue(
+ # example 0, ids [0]
+ # example 1, ids []
+ indices=((0, 0),),
+ values=(0,),
+ dense_shape=(2, 5))
+
+ # Embedding variable.
+ embedding_dimension = 2
+ embedding_values = (
+ (1., 2.), # id 0
+ (3., 5.), # id 1
+ (7., 11.) # id 2
+ )
+ def _initializer(shape, dtype, partition_info):
+ self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+ self.assertEqual(dtypes.float32, dtype)
+ self.assertIsNone(partition_info)
+ return embedding_values
+
+ # Expected lookup result, using combiner='mean'.
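+ # Editorial note on ordering: the columns are passed to fc.input_layer below
+ # as (embedding_column_b, embedding_column_a), yet each expected row reads
+ # [a_embedding, b_embedding]; input_layer appears to process columns in
+ # name-sorted order and concatenate their outputs along axis 1. Sketch:
+ _row_a, _row_b = np.array([7., 11.]), np.array([1., 2.])
+ _row = np.concatenate([_row_a, _row_b])  # -> [7., 11., 1., 2.]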
+ expected_lookups = (
+ # example 0:
+ # A ids [2], embedding = [7, 11]
+ # B ids [0], embedding = [1, 2]
+ (7., 11., 1., 2.),
+ # example 1:
+ # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+ # B ids [], embedding = [0, 0]
+ (2., 3.5, 0., 0.),
+ )
+
+ # Build columns.
+ categorical_column_a = fc_old.categorical_column_with_identity(
+ key='aaa', num_buckets=vocabulary_size)
+ categorical_column_b = fc_old.categorical_column_with_identity(
+ key='bbb', num_buckets=vocabulary_size)
+ embedding_column_a, embedding_column_b = fc_old.shared_embedding_columns(
+ [categorical_column_a, categorical_column_b],
+ dimension=embedding_dimension,
+ initializer=_initializer,
+ trainable=trainable)
+
+ # Provide sparse input and get dense result.
+ input_layer = fc.input_layer(
+ features={'aaa': sparse_input_a, 'bbb': sparse_input_b},
+ feature_columns=(embedding_column_b, embedding_column_a))
+
+ # Assert expected embedding variable and lookups.
+ global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+ self.assertItemsEqual(
+ ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+ tuple([v.name for v in global_vars]))
+ trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+ if trainable:
+ self.assertItemsEqual(
+ ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+ tuple([v.name for v in trainable_vars]))
+ else:
+ self.assertItemsEqual([], tuple([v.name for v in trainable_vars]))
+ shared_embedding_vars = global_vars
+ with _initialized_session():
+ self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
+ self.assertAllEqual(expected_lookups, input_layer.eval())
+
+ def test_input_layer(self):
+ self._test_input_layer()
+
+ def test_input_layer_no_trainable(self):
+ self._test_input_layer(trainable=False)
+
+
+class WeightedCategoricalColumnTest(test.TestCase):
+
+ def test_defaults(self):
+ column = fc.weighted_categorical_column(
+ categorical_column=fc.categorical_column_with_identity(
+ key='ids', num_buckets=3),
+ weight_feature_key='values')
+ self.assertEqual('ids_weighted_by_values', column.name)
+ self.assertEqual(3, column.num_buckets)
+ self.assertEqual({
+ 'ids': parsing_ops.VarLenFeature(dtypes.int64),
+ 'values': parsing_ops.VarLenFeature(dtypes.float32)
+ }, column.parse_example_spec)
+
+ def test_deep_copy(self):
+ """Tests deepcopy of weighted_categorical_column."""
+ original = fc.weighted_categorical_column(
+ categorical_column=fc.categorical_column_with_identity(
+ key='ids', num_buckets=3),
+ weight_feature_key='values')
+ for column in (original, copy.deepcopy(original)):
+ self.assertEqual('ids_weighted_by_values', column.name)
+ self.assertEqual(3, column.num_buckets)
+ self.assertEqual({
+ 'ids': parsing_ops.VarLenFeature(dtypes.int64),
+ 'values': parsing_ops.VarLenFeature(dtypes.float32)
+ }, column.parse_example_spec)
+
+ def test_invalid_dtype_none(self):
+ with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
+ fc.weighted_categorical_column(
+ categorical_column=fc.categorical_column_with_identity(
+ key='ids', num_buckets=3),
+ weight_feature_key='values',
+ dtype=None)
+
+ def test_invalid_dtype_string(self):
+ with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
+ fc.weighted_categorical_column(
+ categorical_column=fc.categorical_column_with_identity(
+ key='ids', num_buckets=3),
+ weight_feature_key='values',
+ dtype=dtypes.string)
+
+ def test_invalid_input_dtype(self):
+ column = fc.weighted_categorical_column(
categorical_column=fc.categorical_column_with_identity( + key='ids', num_buckets=3), + weight_feature_key='values') + strings = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('omar', 'stringer', 'marlo'), + dense_shape=(2, 2)) + with self.assertRaisesRegexp(ValueError, 'Bad dtype'): + _transform_features({'ids': strings, 'values': strings}, (column,), None) + + def test_column_name_collision(self): + with self.assertRaisesRegexp(ValueError, r'Parse config.*already exists'): + fc.weighted_categorical_column( + categorical_column=fc.categorical_column_with_identity( + key='aaa', num_buckets=3), + weight_feature_key='aaa').parse_example_spec() + + def test_missing_weights(self): + column = fc.weighted_categorical_column( + categorical_column=fc.categorical_column_with_identity( + key='ids', num_buckets=3), + weight_feature_key='values') + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=('omar', 'stringer', 'marlo'), + dense_shape=(2, 2)) + with self.assertRaisesRegexp( + ValueError, 'values is not in features dictionary'): + _transform_features({'ids': inputs}, (column,), None) + + def test_parse_example(self): + a = fc.categorical_column_with_vocabulary_list( + key='aaa', vocabulary_list=('omar', 'stringer', 'marlo')) + a_weighted = fc.weighted_categorical_column(a, weight_feature_key='weights') + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'aaa': + feature_pb2.Feature(bytes_list=feature_pb2.BytesList( + value=[b'omar', b'stringer'])), + 'weights': + feature_pb2.Feature(float_list=feature_pb2.FloatList( + value=[1., 10.])) + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=fc.make_parse_example_spec([a_weighted])) + self.assertIn('aaa', features) + self.assertIn('weights', features) + with self.test_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [0, 1]], + values=np.array([b'omar', b'stringer'], dtype=np.object_), + dense_shape=[1, 2]), + features['aaa'].eval()) + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [0, 1]], + values=np.array([1., 10.], dtype=np.float32), + dense_shape=[1, 2]), + features['weights'].eval()) + + def test_transform_features(self): + column = fc.weighted_categorical_column( + categorical_column=fc.categorical_column_with_identity( + key='ids', num_buckets=3), + weight_feature_key='values') + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2)) + weights = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0.5, 1.0, 0.1), + dense_shape=(2, 2)) + id_tensor, weight_tensor = _transform_features({ + 'ids': inputs, + 'values': weights, + }, (column,), None)[column] + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array(inputs.values, dtype=np.int64), + dense_shape=inputs.dense_shape), + id_tensor.eval()) + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=weights.indices, + values=np.array(weights.values, dtype=np.float32), + dense_shape=weights.dense_shape), + weight_tensor.eval()) + + def test_transform_features_dense_input(self): + column = fc.weighted_categorical_column( + categorical_column=fc.categorical_column_with_identity( + key='ids', num_buckets=3), + weight_feature_key='values') + weights = 
sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0.5, 1.0, 0.1), + dense_shape=(2, 2)) + id_tensor, weight_tensor = _transform_features({ + 'ids': ((0, -1), (1, 0)), + 'values': weights, + }, (column,), None)[column] + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=np.array((0, 1, 0), dtype=np.int64), + dense_shape=(2, 2)), + id_tensor.eval()) + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=weights.indices, + values=np.array(weights.values, dtype=np.float32), + dense_shape=weights.dense_shape), + weight_tensor.eval()) + + def test_transform_features_dense_weights(self): + column = fc.weighted_categorical_column( + categorical_column=fc.categorical_column_with_identity( + key='ids', num_buckets=3), + weight_feature_key='values') + inputs = sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 1, 0), + dense_shape=(2, 2)) + id_tensor, weight_tensor = _transform_features({ + 'ids': inputs, + 'values': ((.5, 0.), (1., .1)), + }, (column,), None)[column] + with _initialized_session(): + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=inputs.indices, + values=np.array(inputs.values, dtype=np.int64), + dense_shape=inputs.dense_shape), + id_tensor.eval()) + _assert_sparse_tensor_value( + self, + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=np.array((.5, 1., .1), dtype=np.float32), + dense_shape=(2, 2)), + weight_tensor.eval()) + + def test_keras_linear_model(self): + column = fc_old.weighted_categorical_column( + categorical_column=fc_old.categorical_column_with_identity( + key='ids', num_buckets=3), + weight_feature_key='values') + with ops.Graph().as_default(): + predictions = get_keras_linear_model_predictions({ + 'ids': + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 2, 1), + dense_shape=(2, 2)), + 'values': + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(.5, 1., .1), + dense_shape=(2, 2)) + }, (column,)) + bias = get_linear_model_bias() + weight_var = get_linear_model_column_var(column) + with _initialized_session(): + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + weight_var.assign(((1.,), (2.,), (3.,))).eval() + # weight_var[0] * weights[0, 0] = 1 * .5 = .5 + # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1] + # = 3*1 + 2*.1 = 3+.2 = 3.2 + self.assertAllClose(((.5,), (3.2,)), predictions.eval()) + + def test_keras_linear_model_mismatched_shape(self): + column = fc_old.weighted_categorical_column( + categorical_column=fc_old.categorical_column_with_identity( + key='ids', num_buckets=3), + weight_feature_key='values') + with ops.Graph().as_default(): + with self.assertRaisesRegexp(ValueError, + r'Dimensions.*are not compatible'): + get_keras_linear_model_predictions({ + 'ids': + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 2, 1), + dense_shape=(2, 2)), + 'values': + sparse_tensor.SparseTensorValue( + indices=((0, 0), (0, 1), (1, 0), (1, 1)), + values=(.5, 11., 1., .1), + dense_shape=(2, 2)) + }, (column,)) + + def test_keras_linear_model_mismatched_dense_values(self): + column = fc_old.weighted_categorical_column( + categorical_column=fc_old.categorical_column_with_identity( + key='ids', num_buckets=3), 
+ weight_feature_key='values') + with ops.Graph().as_default(): + predictions = get_keras_linear_model_predictions( + { + 'ids': + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 2, 1), + dense_shape=(2, 2)), + 'values': ((.5,), (1.,)) + }, (column,), + sparse_combiner='mean') + # Disabling the constant folding optimizer here since it changes the + # error message differently on CPU and GPU. + config = config_pb2.ConfigProto() + config.graph_options.rewrite_options.constant_folding = ( + rewriter_config_pb2.RewriterConfig.OFF) + with _initialized_session(config): + with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'): + predictions.eval() + + def test_keras_linear_model_mismatched_dense_shape(self): + column = fc_old.weighted_categorical_column( + categorical_column=fc_old.categorical_column_with_identity( + key='ids', num_buckets=3), + weight_feature_key='values') + with ops.Graph().as_default(): + predictions = get_keras_linear_model_predictions({ + 'ids': + sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 2, 1), + dense_shape=(2, 2)), + 'values': ((.5,), (1.,), (.1,)) + }, (column,)) + bias = get_linear_model_bias() + weight_var = get_linear_model_column_var(column) + with _initialized_session(): + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + weight_var.assign(((1.,), (2.,), (3.,))).eval() + # weight_var[0] * weights[0, 0] = 1 * .5 = .5 + # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1] + # = 3*1 + 2*.1 = 3+.2 = 3.2 + self.assertAllClose(((.5,), (3.2,)), predictions.eval()) + + def test_linear_model(self): + column = fc_old.weighted_categorical_column( + categorical_column=fc_old.categorical_column_with_identity( + key='ids', num_buckets=3), + weight_feature_key='values') + with ops.Graph().as_default(): + predictions = fc.linear_model({ + 'ids': sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 2, 1), + dense_shape=(2, 2)), + 'values': sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(.5, 1., .1), + dense_shape=(2, 2)) + }, (column,)) + bias = get_linear_model_bias() + weight_var = get_linear_model_column_var(column) + with _initialized_session(): + self.assertAllClose((0.,), bias.eval()) + self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval()) + self.assertAllClose(((0.,), (0.,)), predictions.eval()) + weight_var.assign(((1.,), (2.,), (3.,))).eval() + # weight_var[0] * weights[0, 0] = 1 * .5 = .5 + # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1] + # = 3*1 + 2*.1 = 3+.2 = 3.2 + self.assertAllClose(((.5,), (3.2,)), predictions.eval()) + + def test_linear_model_mismatched_shape(self): + column = fc_old.weighted_categorical_column( + categorical_column=fc_old.categorical_column_with_identity( + key='ids', num_buckets=3), + weight_feature_key='values') + with ops.Graph().as_default(): + with self.assertRaisesRegexp( + ValueError, r'Dimensions.*are not compatible'): + fc.linear_model({ + 'ids': sparse_tensor.SparseTensorValue( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 2, 1), + dense_shape=(2, 2)), + 'values': sparse_tensor.SparseTensorValue( + indices=((0, 0), (0, 1), (1, 0), (1, 1)), + values=(.5, 11., 1., .1), + dense_shape=(2, 2)) + }, (column,)) + + def test_linear_model_mismatched_dense_values(self): + column = fc_old.weighted_categorical_column( + 
categorical_column=fc_old.categorical_column_with_identity(
+ key='ids', num_buckets=3),
+ weight_feature_key='values')
+ with ops.Graph().as_default():
+ predictions = fc.linear_model(
+ {
+ 'ids':
+ sparse_tensor.SparseTensorValue(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=(0, 2, 1),
+ dense_shape=(2, 2)),
+ 'values': ((.5,), (1.,))
+ }, (column,),
+ sparse_combiner='mean')
+ # Disabling the constant folding optimizer here since it changes the
+ # error message differently on CPU and GPU.
+ config = config_pb2.ConfigProto()
+ config.graph_options.rewrite_options.constant_folding = (
+ rewriter_config_pb2.RewriterConfig.OFF)
+ with _initialized_session(config):
+ with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
+ predictions.eval()
+
+ def test_linear_model_mismatched_dense_shape(self):
+ column = fc_old.weighted_categorical_column(
+ categorical_column=fc_old.categorical_column_with_identity(
+ key='ids', num_buckets=3),
+ weight_feature_key='values')
+ with ops.Graph().as_default():
+ predictions = fc.linear_model({
+ 'ids': sparse_tensor.SparseTensorValue(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=(0, 2, 1),
+ dense_shape=(2, 2)),
+ 'values': ((.5,), (1.,), (.1,))
+ }, (column,))
+ bias = get_linear_model_bias()
+ weight_var = get_linear_model_column_var(column)
+ with _initialized_session():
+ self.assertAllClose((0.,), bias.eval())
+ self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+ self.assertAllClose(((0.,), (0.,)), predictions.eval())
+ weight_var.assign(((1.,), (2.,), (3.,))).eval()
+ # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+ # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+ # = 3*1 + 2*.1 = 3+.2 = 3.2
+ self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+ # TODO(ptucker): Add test with embedding of weighted categorical.
+
+if __name__ == '__main__':
+ test.main()
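+ # Editorial appendix (standalone numpy): reproducing the (.5, 3.2)
+ # predictions asserted in the weighted-categorical linear-model tests above.
+ _w = np.array([[1.], [2.], [3.]])  # weight_var after the assign
+ _pred0 = _w[0, 0] * .5  # example 0: id 0 with weight .5 -> .5
+ _pred1 = _w[2, 0] * 1. + _w[1, 0] * .1  # example 1: ids 2 and 1 -> 3.2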