author     A. Unique TensorFlower <gardener@tensorflow.org>  2017-06-26 15:02:09 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>   2017-06-26 15:07:17 -0700
commit     5f84a0cb372ca6542385ad0a372496c1f56f7ce6 (patch)
tree       dc24f54907dabddab5cb974e6018396042b02e26 /tensorflow/examples/learn
parent     23e0d5044d6ddcdd14a673d7300a96e735c6663a (diff)
Updates text_classification example.
PiperOrigin-RevId: 160200457
Diffstat (limited to 'tensorflow/examples/learn')
-rw-r--r--  tensorflow/examples/learn/text_classification.py  144
1 file changed, 90 insertions, 54 deletions
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
index 7e10014c39..4b32bfc382 100644
--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@@ -24,43 +24,67 @@
 import numpy as np
 import pandas
 from sklearn import metrics
 import tensorflow as tf
-from tensorflow.contrib.layers.python.layers import encoders
-
-learn = tf.contrib.learn
 
 FLAGS = None
 
 MAX_DOCUMENT_LENGTH = 10
 EMBEDDING_SIZE = 50
 n_words = 0
+MAX_LABEL = 15
+WORDS_FEATURE = 'words'  # Name of the input words feature.
+
+
+def estimator_spec_for_softmax_classification(
+    logits, labels, mode):
+  """Returns EstimatorSpec instance for softmax classification."""
+  predicted_classes = tf.argmax(logits, 1)
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        predictions={
+            'class': predicted_classes,
+            'prob': tf.nn.softmax(logits)
+        })
+
+  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
+  loss = tf.losses.softmax_cross_entropy(
+      onehot_labels=onehot_labels, logits=logits)
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
+    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+  eval_metric_ops = {
+      'accuracy': tf.metrics.accuracy(
+          labels=labels, predictions=predicted_classes)
+  }
+  return tf.estimator.EstimatorSpec(
+      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
+
+
+def bag_of_words_model(features, labels, mode):
+  """A bag-of-words model. Note it disregards the word order in the text."""
+  bow_column = tf.feature_column.categorical_column_with_identity(
+      WORDS_FEATURE, num_buckets=n_words)
+  bow_embedding_column = tf.feature_column.embedding_column(
+      bow_column, dimension=EMBEDDING_SIZE)
+  bow = tf.feature_column.input_layer(
+      features,
+      feature_columns=[bow_embedding_column])
+  logits = tf.layers.dense(bow, MAX_LABEL, activation=None)
+  return estimator_spec_for_softmax_classification(
+      logits=logits, labels=labels, mode=mode)
 
-def bag_of_words_model(features, target):
-  """A bag-of-words model. Note it disregards the word order in the text."""
-  target = tf.one_hot(target, 15, 1, 0)
-  features = encoders.bow_encoder(
-      features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
-  logits = tf.contrib.layers.fully_connected(features, 15, activation_fn=None)
-  loss = tf.contrib.losses.softmax_cross_entropy(logits, target)
-  train_op = tf.contrib.layers.optimize_loss(
-      loss,
-      tf.contrib.framework.get_global_step(),
-      optimizer='Adam',
-      learning_rate=0.01)
-  return ({
-      'class': tf.argmax(logits, 1),
-      'prob': tf.nn.softmax(logits)
-  }, loss, train_op)
-
-
-def rnn_model(features, target):
+
+def rnn_model(features, labels, mode):
   """RNN model to predict from sequence of words to a class."""
   # Convert indexes of words into embeddings.
   # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
   # maps word indexes of the sequence into [batch_size, sequence_length,
   # EMBEDDING_SIZE].
   word_vectors = tf.contrib.layers.embed_sequence(
-      features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE, scope='words')
+      features[WORDS_FEATURE], vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
 
   # Split into list of embedding per word, while removing doc length dim.
   # word_list results to be a list of tensors [batch_size, EMBEDDING_SIZE].
@@ -74,29 +98,17 @@ def rnn_model(features, target):
   _, encoding = tf.contrib.rnn.static_rnn(cell, word_list, dtype=tf.float32)
 
   # Given encoding of RNN, take encoding of last step (e.g hidden size of the
-  # neural network of last step) and pass it as features for logistic
-  # regression over output classes.
-  target = tf.one_hot(target, 15, 1, 0)
-  logits = tf.contrib.layers.fully_connected(encoding, 15, activation_fn=None)
-  loss = tf.contrib.losses.softmax_cross_entropy(logits, target)
-
-  # Create a training op.
-  train_op = tf.contrib.layers.optimize_loss(
-      loss,
-      tf.contrib.framework.get_global_step(),
-      optimizer='Adam',
-      learning_rate=0.01)
-
-  return ({
-      'class': tf.argmax(logits, 1),
-      'prob': tf.nn.softmax(logits)
-  }, loss, train_op)
+  # neural network of last step) and pass it as features for softmax
+  # classification over output classes.
+  logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)
+  return estimator_spec_for_softmax_classification(
+      logits=logits, labels=labels, mode=mode)
 
 
 def main(unused_argv):
   global n_words
   # Prepare training and testing data
-  dbpedia = learn.datasets.load_dataset(
+  dbpedia = tf.contrib.learn.datasets.load_dataset(
       'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
   x_train = pandas.DataFrame(dbpedia.train.data)[1]
   y_train = pandas.Series(dbpedia.train.target)
@@ -104,14 +116,15 @@ def main(unused_argv):
   y_test = pandas.Series(dbpedia.test.target)
 
   # Process vocabulary
-  vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
-
+  vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
+      MAX_DOCUMENT_LENGTH)
+
   x_transform_train = vocab_processor.fit_transform(x_train)
   x_transform_test = vocab_processor.transform(x_test)
-
+
   x_train = np.array(list(x_transform_train))
   x_test = np.array(list(x_transform_test))
-
+
   n_words = len(vocab_processor.vocabulary_)
   print('Total words: %d' % n_words)
 
@@ -119,17 +132,40 @@ def main(unused_argv):
   # Switch between rnn_model and bag_of_words_model to test different models.
   model_fn = rnn_model
   if FLAGS.bow_model:
+    # Subtract 1 because VocabularyProcessor outputs a word-id matrix where word
+    # ids start from 1 and 0 means 'no word'. But
+    # categorical_column_with_identity assumes 0-based count and uses -1 for
+    # missing word.
+    x_train -= 1
+    x_test -= 1
     model_fn = bag_of_words_model
-  classifier = learn.Estimator(model_fn=model_fn)
-
-  # Train and predict
-  classifier.fit(x_train, y_train, steps=100)
-  y_predicted = [
-      p['class'] for p in classifier.predict(
-          x_test, as_iterable=True)
-  ]
+  classifier = tf.estimator.Estimator(model_fn=model_fn)
+
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={WORDS_FEATURE: x_train},
+      y=y_train,
+      num_epochs=None,
+      shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=100)
+
+  # Predict.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={WORDS_FEATURE: x_test},
+      y=y_test,
+      num_epochs=1,
+      shuffle=False)
+  predictions = classifier.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['class'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
   score = metrics.accuracy_score(y_test, y_predicted)
-  print('Accuracy: {0:f}'.format(score))
+  print('Accuracy (sklearn): {0:f}'.format(score))
+
+  # Score with tensorflow.
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
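
The core of this change is the move from the deprecated tf.contrib.learn model-function signature, (features, target) returning a (predictions, loss, train_op) tuple, to the core tf.estimator contract, where model_fn(features, labels, mode) returns a tf.estimator.EstimatorSpec populated differently per mode. Below is a minimal, self-contained sketch of that contract (not part of the commit; the feature name 'x' and NUM_CLASSES are illustrative assumptions, and it uses sparse_softmax_cross_entropy instead of the one_hot + softmax_cross_entropy pair above as a simplification), written against the same TF 1.x APIs:

import tensorflow as tf

NUM_CLASSES = 3  # hypothetical label count; the example above uses MAX_LABEL = 15


def minimal_model_fn(features, labels, mode):
  """Toy linear classifier following the (features, labels, mode) contract."""
  logits = tf.layers.dense(features['x'], NUM_CLASSES, activation=None)
  predicted_classes = tf.argmax(logits, 1)

  # PREDICT: return predictions only; labels may be None in this mode.
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode=mode, predictions={'class': predicted_classes})

  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

  # TRAIN: attach a train_op that also advances the global step.
  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op = tf.train.AdamOptimizer(learning_rate=0.01).minimize(
        loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  # EVAL: attach metric ops keyed by name.
  eval_metric_ops = {
      'accuracy': tf.metrics.accuracy(
          labels=labels, predictions=predicted_classes)
  }
  return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)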
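
The commit likewise replaces classifier.fit/predict with input-function-driven train/predict/evaluate. A small usage sketch of that flow, assuming TF 1.x with the tf.estimator.inputs helpers and a model function such as minimal_model_fn above (the data here is fabricated purely for illustration):

import numpy as np
import tensorflow as tf

x = np.random.rand(100, 4).astype(np.float32)  # fabricated toy features
y = np.random.randint(0, 3, size=100)          # fabricated toy labels (3 = NUM_CLASSES above)

classifier = tf.estimator.Estimator(model_fn=minimal_model_fn)

# Training input_fn: cycle indefinitely (num_epochs=None) with shuffling;
# the number of optimization steps is capped by `steps` in train().
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'x': x}, y=y, num_epochs=None, shuffle=True)
classifier.train(input_fn=train_input_fn, steps=100)

# Test input_fn: exactly one epoch, no shuffling, so predict() yields
# results in input order; the same input_fn can also drive evaluate().
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'x': x}, y=y, num_epochs=1, shuffle=False)
y_predicted = np.array(
    [p['class'] for p in classifier.predict(input_fn=test_input_fn)])
print(classifier.evaluate(input_fn=test_input_fn)['accuracy'])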