aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/contrib/factorization
diff options
context:
space:
mode:
author: A. Unique TensorFlower <gardener@tensorflow.org> 2018-03-15 06:56:23 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-03-15 07:00:20 -0700
commit: 10927e9f77d0bfebb597b5cc64fa3908db23361c (patch)
tree: 71290ac8b48fdda9dba773ae2ffd65f5a20f73ba /tensorflow/contrib/factorization
parent: b08c54271084b05ea822b3348a3a448a9fe3b898 (diff)
Add ability to use feature_columns in KMeans Estimator.
PiperOrigin-RevId: 189179304
Diffstat (limited to 'tensorflow/contrib/factorization')
-rw-r--r-- tensorflow/contrib/factorization/BUILD | 2
-rw-r--r-- tensorflow/contrib/factorization/python/ops/kmeans.py | 49
-rw-r--r-- tensorflow/contrib/factorization/python/ops/kmeans_test.py | 37
3 files changed, 62 insertions, 26 deletions
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index 180f1b68f3..90f10f1fa8 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -66,6 +66,7 @@ tf_custom_op_py_library(
"//tensorflow/python:variables",
"//tensorflow/python/estimator",
"//tensorflow/python/estimator:model_fn",
+ "//tensorflow/python/feature_column:feature_column_py",
"//third_party/py/numpy",
],
)
@@ -238,6 +239,7 @@ py_test(
"//tensorflow/python:random_ops",
"//tensorflow/python:training",
"//tensorflow/python/estimator:run_config",
+ "//tensorflow/python/feature_column:feature_column_py",
"//third_party/py/numpy",
],
)
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index c092f85d35..38faca119d 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.factorization.python.ops import clustering_ops
from tensorflow.python.estimator import estimator
from tensorflow.python.estimator import model_fn as model_fn_lib
from tensorflow.python.estimator.export import export_output
+from tensorflow.python.feature_column import feature_column as fc
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
@@ -105,24 +106,32 @@ class _InitializeClustersHook(session_run_hook.SessionRunHook):
logging.info(e)
-def _parse_features_if_necessary(features):
+def _parse_features_if_necessary(features, feature_columns):
"""Helper function to convert the input points into a usable format.
Args:
- features: The input points.
+ features: The input features.
+ feature_columns: An optional iterable containing all the feature columns
+ used by the model. All items in the set should be feature column instances
+ that can be passed to `tf.feature_column.input_layer`. If this is None,
+ all features will be used.
Returns:
- If `features` is a dict of `k` features, each of which is a vector of `n`
- scalars, the return value is a Tensor of shape `(n, k)` representing `n`
- input points, where the items in the `k` dimension are sorted
- lexicographically by `features` key. If `features` is not a dict, it is
- returned unmodified.
+ If `features` is a dict of `k` features (optionally filtered by
+ `feature_columns`), each of which is a vector of `n` scalars, the return
+ value is a Tensor of shape `(n, k)` representing `n` input points, where the
+ items in the `k` dimension are sorted lexicographically by `features` key.
+ If `features` is not a dict, it is returned unmodified.
"""
- if isinstance(features, dict):
- keys = sorted(features.keys())
- with ops.colocate_with(features[keys[0]]):
- features = array_ops.concat([features[k] for k in keys], axis=1)
- return features
+ if not isinstance(features, dict):
+ return features
+
+ if feature_columns:
+ return fc.input_layer(features, feature_columns)
+
+ keys = sorted(features.keys())
+ with ops.colocate_with(features[keys[0]]):
+ return array_ops.concat([features[k] for k in keys], axis=1)
class _ModelFn(object):
@@ -130,7 +139,8 @@ class _ModelFn(object):
def __init__(self, num_clusters, initial_clusters, distance_metric,
random_seed, use_mini_batch, mini_batch_steps_per_iteration,
- kmeans_plus_plus_num_retries, relative_tolerance):
+ kmeans_plus_plus_num_retries, relative_tolerance,
+ feature_columns):
self._num_clusters = num_clusters
self._initial_clusters = initial_clusters
self._distance_metric = distance_metric
@@ -139,6 +149,7 @@ class _ModelFn(object):
self._mini_batch_steps_per_iteration = mini_batch_steps_per_iteration
self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
self._relative_tolerance = relative_tolerance
+ self._feature_columns = feature_columns
def model_fn(self, features, mode, config):
"""Model function for the estimator.
@@ -166,7 +177,7 @@ class _ModelFn(object):
# input_points is a single Tensor. Therefore, the sharding functionality
# in clustering_ops is unused, and some of the values below are lists of a
# single item.
- input_points = _parse_features_if_necessary(features)
+ input_points = _parse_features_if_necessary(features, self._feature_columns)
# Let N = the number of input_points.
# all_distances: A list of one matrix of shape (N, num_clusters). Each value
@@ -316,7 +327,8 @@ class KMeansClustering(estimator.Estimator):
mini_batch_steps_per_iteration=1,
kmeans_plus_plus_num_retries=2,
relative_tolerance=None,
- config=None):
+ config=None,
+ feature_columns=None):
"""Creates an Estimator for running KMeans training and inference.
This Estimator implements the following variants of the K-means algorithm:
@@ -383,6 +395,10 @@ class KMeansClustering(estimator.Estimator):
iterations. Stops learning if the loss changes less than this amount.
This may not work correctly if `use_mini_batch=True`.
config: See @{tf.estimator.Estimator}.
+ feature_columns: An optional iterable containing all the feature columns
+ used by the model. All items in the set should be feature column
+ instances that can be passed to `tf.feature_column.input_layer`. If this
+ is None, all features will be used.
Raises:
ValueError: An invalid argument was passed to `initial_clusters` or
@@ -402,7 +418,8 @@ class KMeansClustering(estimator.Estimator):
model_fn=_ModelFn(
num_clusters, initial_clusters, distance_metric, random_seed,
use_mini_batch, mini_batch_steps_per_iteration,
- kmeans_plus_plus_num_retries, relative_tolerance).model_fn,
+ kmeans_plus_plus_num_retries, relative_tolerance,
+ feature_columns).model_fn,
model_dir=model_dir,
config=config)
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans_test.py b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
index 06a2c52c11..0103cc4439 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
@@ -27,6 +27,7 @@ from sklearn.cluster import KMeans as SklearnKMeans
# pylint: disable=g-import-not-at-top
from tensorflow.contrib.factorization.python.ops import kmeans as kmeans_lib
from tensorflow.python.estimator import run_config
+from tensorflow.python.feature_column import feature_column as fc
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
@@ -226,27 +227,43 @@ class KMeansTest(KMeansTestBase):
self._infer_helper(kmeans, clusters, 10)
self._infer_helper(kmeans, clusters, 1)
+ def _parse_feature_dict_helper(self, features, parsed_feature_dict):
+ # Perform a sanity check.
+ self.assertEqual(features.shape, parsed_feature_dict.shape)
+ self.assertEqual(features.dtype, parsed_feature_dict.dtype)
+ # Then check that running the tensor yields the original list of points.
+ with self.test_session() as sess:
+ parsed_points = sess.run(parsed_feature_dict)
+ self.assertAllEqual(self.points, parsed_points)
+
def test_parse_features(self):
"""Tests the various behaviours of kmeans._parse_features_if_necessary."""
# No-op if a tensor is passed in.
features = constant_op.constant(self.points)
- parsed_features = kmeans_lib._parse_features_if_necessary(features)
+ parsed_features = kmeans_lib._parse_features_if_necessary(features, None)
self.assertAllEqual(features, parsed_features)
- # A dict is transformed into a tensor.
+ # All values from a feature dict are transformed into a tensor.
feature_dict = {
'x': [[point[0]] for point in self.points],
'y': [[point[1]] for point in self.points]
}
- parsed_feature_dict = kmeans_lib._parse_features_if_necessary(feature_dict)
- # Perform a sanity check.
- self.assertEqual(features.shape, parsed_feature_dict.shape)
- self.assertEqual(features.dtype, parsed_feature_dict.dtype)
- # Then check that running the tensor yields the original list of points.
- with self.test_session() as sess:
- parsed_points = sess.run(parsed_feature_dict)
- self.assertAllEqual(self.points, parsed_points)
+ parsed_feature_dict = kmeans_lib._parse_features_if_necessary(
+ feature_dict, None)
+ self._parse_feature_dict_helper(features, parsed_feature_dict)
+
+ # Only the feature_columns of a feature dict are transformed into a tensor.
+ feature_dict_with_extras = {
+ 'foo': 'bar',
+ 'x': [[point[0]] for point in self.points],
+ 'baz': {'fizz': 'buzz'},
+ 'y': [[point[1]] for point in self.points]
+ }
+ feature_columns = [fc.numeric_column(key='x'), fc.numeric_column(key='y')]
+ parsed_feature_dict = kmeans_lib._parse_features_if_necessary(
+ feature_dict_with_extras, feature_columns)
+ self._parse_feature_dict_helper(features, parsed_feature_dict)
class KMeansTestMultiStageInit(KMeansTestBase):