Updated wide-n-deep tutorial code to use core version of estimators and feature-columns.

PiperOrigin-RevId: 159984663
author: Mustafa Ispir <ispir@google.com> 2017-06-23 13:18:53 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-06-23 13:22:27 -0700
commit: 160ff06ac0cd0551044c7f650a3bb0d6f3d074f5 (patch)
tree: f0b57a7c1e88e8dd495d77d70a6e856c321f92b9 /tensorflow/examples/learn
parent: e01611369f29eb18565bc77512884b908fde70ff (diff)
1 files changed, 123 insertions, 119 deletions
diff --git a/tensorflow/examples/learn/wide_n_deep_tutorial.py b/tensorflow/examples/learn/wide_n_deep_tutorial.py
index a0c6df821a..6a3ae50f0b 100644
--- a/tensorflow/examples/learn/wide_n_deep_tutorial.py
+++ b/tensorflow/examples/learn/wide_n_deep_tutorial.py
@@ -21,21 +21,89 @@ import argparse
 import sys
 import tempfile
 
-from six.moves import urllib
-
 import pandas as pd
+from six.moves import urllib
 import tensorflow as tf
 
 
-COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
-           "marital_status", "occupation", "relationship", "race", "gender",
-           "capital_gain", "capital_loss", "hours_per_week", "native_country",
-           "income_bracket"]
-LABEL_COLUMN = "label"
-CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status", "occupation",
-                       "relationship", "race", "gender", "native_country"]
-CONTINUOUS_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss",
-                      "hours_per_week"]
+CSV_COLUMNS = [
+    "age", "workclass", "fnlwgt", "education", "education_num",
+    "marital_status", "occupation", "relationship", "race", "gender",
+    "capital_gain", "capital_loss", "hours_per_week", "native_country",
+    "income_bracket"
+]
+
+gender = tf.feature_column.categorical_column_with_vocabulary_list(
+    "gender", ["Female", "Male"])
+education = tf.feature_column.categorical_column_with_vocabulary_list(
+    "education", [
+        "Bachelors", "HS-grad", "11th", "Masters", "9th",
+        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
+        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
+        "Preschool", "12th"
+    ])
+marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
+    "marital_status", [
+        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
+        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
+    ])
+relationship = tf.feature_column.categorical_column_with_vocabulary_list(
+    "relationship", [
+        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
+        "Other-relative"
+    ])
+workclass = tf.feature_column.categorical_column_with_vocabulary_list(
+    "workclass", [
+        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
+        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
+    ])
+
+# To show an example of hashing:
+occupation = tf.feature_column.categorical_column_with_hash_bucket(
+    "occupation", hash_bucket_size=1000)
+native_country = tf.feature_column.categorical_column_with_hash_bucket(
+    "native_country", hash_bucket_size=1000)
+
+# Continuous base columns.
+age = tf.feature_column.numeric_column("age")
+education_num = tf.feature_column.numeric_column("education_num")
+capital_gain = tf.feature_column.numeric_column("capital_gain")
+capital_loss = tf.feature_column.numeric_column("capital_loss")
+hours_per_week = tf.feature_column.numeric_column("hours_per_week")
+
+# Transformations.
+age_buckets = tf.feature_column.bucketized_column(
+    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
+
+# Wide columns and deep columns.
+base_columns = [
+    gender, native_country, education, occupation, workclass, relationship,
+    age_buckets,
+]
+
+crossed_columns = [
+    tf.feature_column.crossed_column(
+        ["education", "occupation"], hash_bucket_size=1000),
+    tf.feature_column.crossed_column(
+        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
+    tf.feature_column.crossed_column(
+        ["native_country", "occupation"], hash_bucket_size=1000)
+]
+
+deep_columns = [
+    tf.feature_column.indicator_column(workclass),
+    tf.feature_column.indicator_column(education),
+    tf.feature_column.indicator_column(gender),
+    tf.feature_column.indicator_column(relationship),
+    # To show an example of embedding
+    tf.feature_column.embedding_column(native_country, dimension=8),
+    tf.feature_column.embedding_column(occupation, dimension=8),
+    age,
+    education_num,
+    capital_gain,
+    capital_loss,
+    hours_per_week,
+]
 
 
 def maybe_download(train_data, test_data):
@@ -44,7 +112,9 @@ def maybe_download(train_data, test_data):
     train_file_name = train_data
   else:
     train_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", train_file.name)  # pylint: disable=line-too-long
+    urllib.request.urlretrieve(
+        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
+        train_file.name)  # pylint: disable=line-too-long
     train_file_name = train_file.name
     train_file.close()
     print("Training data is downloaded to %s" % train_file_name)
@@ -53,138 +123,72 @@ def maybe_download(train_data, test_data):
     test_file_name = test_data
   else:
     test_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", test_file.name)  # pylint: disable=line-too-long
+    urllib.request.urlretrieve(
+        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
+        test_file.name)  # pylint: disable=line-too-long
     test_file_name = test_file.name
     test_file.close()
-    print("Test data is downloaded to %s" % test_file_name)
+    print("Test data is downloaded to %s"% test_file_name)
 
   return train_file_name, test_file_name
 
 
 def build_estimator(model_dir, model_type):
   """Build an estimator."""
-  # Sparse base columns.
-  gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender",
-                                                     keys=["female", "male"])
-  education = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "education", hash_bucket_size=1000)
-  relationship = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "relationship", hash_bucket_size=100)
-  workclass = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "workclass", hash_bucket_size=100)
-  occupation = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "occupation", hash_bucket_size=1000)
-  native_country = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "native_country", hash_bucket_size=1000)
-
-  # Continuous base columns.
-  age = tf.contrib.layers.real_valued_column("age")
-  education_num = tf.contrib.layers.real_valued_column("education_num")
-  capital_gain = tf.contrib.layers.real_valued_column("capital_gain")
-  capital_loss = tf.contrib.layers.real_valued_column("capital_loss")
-  hours_per_week = tf.contrib.layers.real_valued_column("hours_per_week")
-
-  # Transformations.
-  age_buckets = tf.contrib.layers.bucketized_column(age,
-                                                    boundaries=[
-                                                        18, 25, 30, 35, 40, 45,
-                                                        50, 55, 60, 65
-                                                    ])
-
-  # Wide columns and deep columns.
-  wide_columns = [gender, native_country, education, occupation, workclass,
-                  relationship, age_buckets,
-                  tf.contrib.layers.crossed_column([education, occupation],
-                                                   hash_bucket_size=int(1e4)),
-                  tf.contrib.layers.crossed_column(
-                      [age_buckets, education, occupation],
-                      hash_bucket_size=int(1e6)),
-                  tf.contrib.layers.crossed_column([native_country, occupation],
-                                                   hash_bucket_size=int(1e4))]
-  deep_columns = [
-      tf.contrib.layers.embedding_column(workclass, dimension=8),
-      tf.contrib.layers.embedding_column(education, dimension=8),
-      tf.contrib.layers.embedding_column(gender, dimension=8),
-      tf.contrib.layers.embedding_column(relationship, dimension=8),
-      tf.contrib.layers.embedding_column(native_country,
-                                         dimension=8),
-      tf.contrib.layers.embedding_column(occupation, dimension=8),
-      age,
-      education_num,
-      capital_gain,
-      capital_loss,
-      hours_per_week,
-  ]
+  # Feature columns are defined at module level above.
 
   if model_type == "wide":
-    m = tf.contrib.learn.LinearClassifier(model_dir=model_dir,
-                                          feature_columns=wide_columns)
+    m = tf.estimator.LinearClassifier(
+        model_dir=model_dir, feature_columns=base_columns + crossed_columns)
   elif model_type == "deep":
-    m = tf.contrib.learn.DNNClassifier(model_dir=model_dir,
-                                       feature_columns=deep_columns,
-                                       hidden_units=[100, 50])
+    m = tf.estimator.DNNClassifier(
+        model_dir=model_dir,
+        feature_columns=deep_columns,
+        hidden_units=[100, 50])
   else:
-    m = tf.contrib.learn.DNNLinearCombinedClassifier(
+    m = tf.estimator.DNNLinearCombinedClassifier(
         model_dir=model_dir,
-        linear_feature_columns=wide_columns,
+        linear_feature_columns=crossed_columns,
         dnn_feature_columns=deep_columns,
-        dnn_hidden_units=[100, 50],
-        fix_global_step_increment_bug=True)
+        dnn_hidden_units=[100, 50])
   return m
 
 
-def input_fn(df):
+def input_fn(data_file, num_epochs, shuffle):
   """Input builder function."""
-  # Creates a dictionary mapping from each continuous feature column name (k) to
-  # the values of that column stored in a constant Tensor.
-  continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
-  # Creates a dictionary mapping from each categorical feature column name (k)
-  # to the values of that column stored in a tf.SparseTensor.
-  categorical_cols = {
-      k: tf.SparseTensor(
-          indices=[[i, 0] for i in range(df[k].size)],
-          values=df[k].values,
-          dense_shape=[df[k].size, 1])
-      for k in CATEGORICAL_COLUMNS}
-  # Merges the two dictionaries into one.
-  feature_cols = dict(continuous_cols)
-  feature_cols.update(categorical_cols)
-  # Converts the label column into a constant Tensor.
-  label = tf.constant(df[LABEL_COLUMN].values)
-  # Returns the feature columns and the label.
-  return feature_cols, label
+  df_data = pd.read_csv(
+      tf.gfile.Open(data_file),
+      names=CSV_COLUMNS,
+      skipinitialspace=True,
+      engine="python",
+      skiprows=1)
+  # remove NaN elements
+  df_data = df_data.dropna(how="any", axis=0)
+  labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
+  return tf.estimator.inputs.pandas_input_fn(
+      x=df_data,
+      y=labels,
+      batch_size=100,
+      num_epochs=num_epochs,
+      shuffle=shuffle,
+      num_threads=5)
 
 
 def train_and_eval(model_dir, model_type, train_steps, train_data, test_data):
   """Train and evaluate the model."""
   train_file_name, test_file_name = maybe_download(train_data, test_data)
-  df_train = pd.read_csv(
-      tf.gfile.Open(train_file_name),
-      names=COLUMNS,
-      skipinitialspace=True,
-      engine="python")
-  df_test = pd.read_csv(
-      tf.gfile.Open(test_file_name),
-      names=COLUMNS,
-      skipinitialspace=True,
-      skiprows=1,
-      engine="python")
-
-  # remove NaN elements
-  df_train = df_train.dropna(how='any', axis=0)
-  df_test = df_test.dropna(how='any', axis=0)
-
-  df_train[LABEL_COLUMN] = (
-      df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
-  df_test[LABEL_COLUMN] = (
-      df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
-
   model_dir = tempfile.mkdtemp() if not model_dir else model_dir
-  print("model directory = %s" % model_dir)
 
   m = build_estimator(model_dir, model_type)
-  m.fit(input_fn=lambda: input_fn(df_train), steps=train_steps)
-  results = m.evaluate(input_fn=lambda: input_fn(df_test), steps=1)
+  # set num_epochs to None to get infinite stream of data.
+  m.train(
+      input_fn=input_fn(train_file_name, num_epochs=None, shuffle=True),
+      steps=train_steps)
+  # set steps to None to run evaluation until all data consumed.
+  results = m.evaluate(
+      input_fn=input_fn(test_file_name, num_epochs=1, shuffle=False),
+      steps=None)
+  print("model directory = %s" % model_dir)
   for key in sorted(results):
     print("%s: %s" % (key, results[key]))
 
@@ -215,7 +219,7 @@ if __name__ == "__main__":
   parser.add_argument(
       "--train_steps",
       type=int,
-      default=200,
+      default=2000,
       help="Number of training steps."
   )
   parser.add_argument(
author	Mustafa Ispir <ispir@google.com>	2017-06-23 13:18:53 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-06-23 13:22:27 -0700
commit	160ff06ac0cd0551044c7f650a3bb0d6f3d074f5 (patch)
tree	f0b57a7c1e88e8dd495d77d70a6e856c321f92b9 /tensorflow/examples/learn
parent	e01611369f29eb18565bc77512884b908fde70ff (diff)