Diffstat (limited to 'tensorflow/examples/tutorials')
-rw-r--r--  tensorflow/examples/tutorials/estimators/BUILD            |  12
-rw-r--r--  tensorflow/examples/tutorials/estimators/__init__.py      |   0
-rw-r--r--  tensorflow/examples/tutorials/input_fn/__init__.py        |   0
-rw-r--r--  tensorflow/examples/tutorials/layers/BUILD                |  12
-rw-r--r--  tensorflow/examples/tutorials/layers/__init__.py          |   0
-rw-r--r--  tensorflow/examples/tutorials/mnist/BUILD                 |  15
-rw-r--r--  tensorflow/examples/tutorials/mnist/input_data.py         |   2
-rw-r--r--  tensorflow/examples/tutorials/mnist/mnist_softmax.py      |  16
-rw-r--r--  tensorflow/examples/tutorials/monitors/BUILD              |  12
-rw-r--r--  tensorflow/examples/tutorials/monitors/__init__.py        |   0
-rw-r--r--  tensorflow/examples/tutorials/monitors/iris_monitors.py   |   6
-rw-r--r--  tensorflow/examples/tutorials/word2vec/BUILD              |  14
-rw-r--r--  tensorflow/examples/tutorials/word2vec/word2vec_basic.py  | 161
13 files changed, 140 insertions(+), 110 deletions(-)
diff --git a/tensorflow/examples/tutorials/estimators/BUILD b/tensorflow/examples/tutorials/estimators/BUILD
index ecbc1a431d..bab609f208 100644
--- a/tensorflow/examples/tutorials/estimators/BUILD
+++ b/tensorflow/examples/tutorials/estimators/BUILD
@@ -20,15 +20,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/estimators/__init__.py b/tensorflow/examples/tutorials/estimators/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/tensorflow/examples/tutorials/estimators/__init__.py
diff --git a/tensorflow/examples/tutorials/input_fn/__init__.py b/tensorflow/examples/tutorials/input_fn/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/tensorflow/examples/tutorials/input_fn/__init__.py
diff --git a/tensorflow/examples/tutorials/layers/BUILD b/tensorflow/examples/tutorials/layers/BUILD
index f8a29c79c6..aad78b1840 100644
--- a/tensorflow/examples/tutorials/layers/BUILD
+++ b/tensorflow/examples/tutorials/layers/BUILD
@@ -19,15 +19,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/layers/__init__.py b/tensorflow/examples/tutorials/layers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/tensorflow/examples/tutorials/layers/__init__.py
diff --git a/tensorflow/examples/tutorials/mnist/BUILD b/tensorflow/examples/tutorials/mnist/BUILD
index 6d4e67063d..d4070fdd1e 100644
--- a/tensorflow/examples/tutorials/mnist/BUILD
+++ b/tensorflow/examples/tutorials/mnist/BUILD
@@ -51,6 +51,7 @@ py_binary(
         "fully_connected_feed.py",
     ],
     srcs_version = "PY2AND3",
+    tags = ["optonly"],
    deps = [
         ":input_data",
         ":mnist",
@@ -96,7 +97,7 @@ py_binary(
 
 py_test(
     name = "fully_connected_feed_test",
-    size = "small",
+    size = "medium",
     srcs = [
         "fully_connected_feed.py",
     ],
@@ -132,15 +133,3 @@ py_test(
         "//tensorflow:tensorflow_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/mnist/input_data.py b/tensorflow/examples/tutorials/mnist/input_data.py
index f1a7e1c4af..fa148ae3e6 100644
--- a/tensorflow/examples/tutorials/mnist/input_data.py
+++ b/tensorflow/examples/tutorials/mnist/input_data.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# pylint: disable=unused-import
 import gzip
 import os
 import tempfile
@@ -27,3 +28,4 @@ from six.moves import urllib
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
+# pylint: enable=unused-import
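The pylint pragmas added to input_data.py exist because the module's only real job is to re-export read_data_sets for the tutorials; the imports look unused but are the public surface. A minimal sketch of the intended usage (the data path is illustrative):

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

# Downloads MNIST on first use and returns train/validation/test splits.
mnist = input_data.read_data_sets('/tmp/tensorflow/mnist/input_data',
                                  one_hot=True)
print(mnist.train.num_examples)  # 55000 in the standard split
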
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
index fb3ac94203..47dd6a1947 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A very simple MNIST classifier.
 
 See extensive documentation at
@@ -67,12 +66,19 @@ def main(_):
   # Test trained model
   correct_prediction = tf.equal(tf.argmax(y, 1), y_)
   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-  print(sess.run(accuracy, feed_dict={x: mnist.test.images,
-                                      y_: mnist.test.labels}))
+  print(sess.run(
+      accuracy, feed_dict={
+          x: mnist.test.images,
+          y_: mnist.test.labels
+      }))
+
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
-  parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
-                      help='Directory for storing input data')
+  parser.add_argument(
+      '--data_dir',
+      type=str,
+      default='/tmp/tensorflow/mnist/input_data',
+      help='Directory for storing input data')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
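The reformatted block above keeps the tutorials' standard flag-handling pattern intact. As a sketch of that pattern in isolation (the flag name and default are illustrative): parse_known_args() splits the flags the script defines from everything else on the command line, and the remainder is forwarded so tf.app.run() still sees its own arguments:

import argparse
import sys

import tensorflow as tf

FLAGS = None


def main(_):
  print('data_dir =', FLAGS.data_dir)


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument('--data_dir', type=str, default='/tmp/data',
                      help='Directory for storing input data')
  # Unrecognized arguments are passed through to tf.app.run().
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
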
diff --git a/tensorflow/examples/tutorials/monitors/BUILD b/tensorflow/examples/tutorials/monitors/BUILD
index 4220e8144d..1c49e3fe53 100644
--- a/tensorflow/examples/tutorials/monitors/BUILD
+++ b/tensorflow/examples/tutorials/monitors/BUILD
@@ -23,15 +23,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/monitors/__init__.py b/tensorflow/examples/tutorials/monitors/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/tensorflow/examples/tutorials/monitors/__init__.py
diff --git a/tensorflow/examples/tutorials/monitors/iris_monitors.py b/tensorflow/examples/tutorials/monitors/iris_monitors.py
index 850d105f7b..a2b7fe6023 100644
--- a/tensorflow/examples/tutorials/monitors/iris_monitors.py
+++ b/tensorflow/examples/tutorials/monitors/iris_monitors.py
@@ -32,9 +32,9 @@ IRIS_TEST = os.path.join(os.path.dirname(__file__), "iris_test.csv")
 
 def main(unused_argv):
   # Load datasets.
   training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float)
+      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
   test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float)
+      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32)
 
   validation_metrics = {
       "accuracy":
@@ -83,7 +83,7 @@ def main(unused_argv):
   # Classify two new flower samples.
   new_samples = np.array(
-      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float)
+      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
   y = list(classifier.predict(new_samples))
   print("Predictions: {}".format(str(y)))
diff --git a/tensorflow/examples/tutorials/word2vec/BUILD b/tensorflow/examples/tutorials/word2vec/BUILD
index 42d6355b4f..2e19c038bd 100644
--- a/tensorflow/examples/tutorials/word2vec/BUILD
+++ b/tensorflow/examples/tutorials/word2vec/BUILD
@@ -13,19 +13,11 @@ py_binary(
         "word2vec_basic.py",
     ],
     srcs_version = "PY2AND3",
+    tags = [
+        "no-internal-py3",
+    ],
     deps = [
         "//tensorflow:tensorflow_py",
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
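The iris changes above swap np.float, which is merely an alias of Python's built-in 64-bit float, for np.float32, the precision TensorFlow uses by default. A quick sketch of the distinction:

import numpy as np

# dtype=float (what np.float aliased) yields 64-bit values:
a = np.array([6.4, 3.2], dtype=float)
b = np.array([6.4, 3.2], dtype=np.float32)
print(a.dtype, b.dtype)  # float64 float32
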
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 142e45a2e8..b09ee99768 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import collections
 import math
 import os
+import sys
+import argparse
 import random
 from tempfile import gettempdir
 import zipfile
@@ -30,6 +32,24 @@ from six.moves import urllib
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
+from tensorflow.contrib.tensorboard.plugins import projector
+
+# Give a folder path as an argument with '--log_dir' to save
+# TensorBoard summaries. Default is a log folder in current directory.
+current_path = os.path.dirname(os.path.realpath(sys.argv[0]))
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    '--log_dir',
+    type=str,
+    default=os.path.join(current_path, 'log'),
+    help='The log directory for TensorBoard summaries.')
+FLAGS, unparsed = parser.parse_known_args()
+
+# Create the directory for TensorBoard variables if there is not.
+if not os.path.exists(FLAGS.log_dir):
+  os.makedirs(FLAGS.log_dir)
+
 
 # Step 1: Download the data.
 url = 'http://mattmahoney.net/dc/'
@@ -61,6 +81,7 @@ def read_data(filename):
     data = tf.compat.as_str(f.read(f.namelist()[0])).split()
   return data
 
+
 vocabulary = read_data(filename)
 print('Data size', len(vocabulary))
@@ -86,20 +107,22 @@ def build_dataset(words, n_words):
   reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
   return data, count, dictionary, reversed_dictionary
 
+
 # Filling 4 global variables:
 # data - list of codes (integers from 0 to vocabulary_size-1).
 #   This is the original text but words are replaced by their codes
 # count - map of words(strings) to count of occurrences
 # dictionary - map of words(strings) to their codes(integers)
 # reverse_dictionary - maps codes(integers) to words(strings)
-data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
-                                                            vocabulary_size)
+data, count, dictionary, reverse_dictionary = build_dataset(
+    vocabulary, vocabulary_size)
 del vocabulary  # Hint to reduce memory.
 print('Most common words (+UNK)', count[:5])
 print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
 
 data_index = 0
 
+
 # Step 3: Function to generate a training batch for the skip-gram model.
 def generate_batch(batch_size, num_skips, skip_window):
   global data_index
@@ -108,7 +131,7 @@ def generate_batch(batch_size, num_skips, skip_window):
   batch = np.ndarray(shape=(batch_size), dtype=np.int32)
   labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
   span = 2 * skip_window + 1  # [ skip_window target skip_window ]
-  buffer = collections.deque(maxlen=span)
+  buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
   if data_index + span > len(data):
     data_index = 0
   buffer.extend(data[data_index:data_index + span])
@@ -120,7 +143,7 @@ def generate_batch(batch_size, num_skips, skip_window):
       batch[i * num_skips + j] = buffer[skip_window]
       labels[i * num_skips + j, 0] = buffer[context_word]
     if data_index == len(data):
-      buffer[:] = data[:span]
+      buffer.extend(data[0:span])
       data_index = span
     else:
       buffer.append(data[data_index])
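The buffer refill change just above fixes a real bug: collections.deque does not support slice assignment, so the old wrap-around branch raised a TypeError at runtime. A self-contained sketch of the two behaviors:

import collections

span = 3
data = [10, 20, 30, 40]
buffer = collections.deque(maxlen=span)

try:
  buffer[:] = data[:span]  # old code: deques cannot be sliced
except TypeError as err:
  print('TypeError:', err)

buffer.extend(data[0:span])  # new code: refills the bounded deque
print(buffer)  # deque([10, 20, 30], maxlen=3)
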
@@ -129,96 +152,130 @@ def generate_batch(batch_size, num_skips, skip_window):
   data_index = (data_index + len(data) - span) % len(data)
   return batch, labels
 
+
 batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
 for i in range(8):
-  print(batch[i], reverse_dictionary[batch[i]],
-        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
+  print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
+        reverse_dictionary[labels[i, 0]])
 
 # Step 4: Build and train a skip-gram model.
 
 batch_size = 128
 embedding_size = 128  # Dimension of the embedding vector.
-skip_window = 1       # How many words to consider left and right.
-num_skips = 2         # How many times to reuse an input to generate a label.
-num_sampled = 64      # Number of negative examples to sample.
+skip_window = 1  # How many words to consider left and right.
+num_skips = 2  # How many times to reuse an input to generate a label.
+num_sampled = 64  # Number of negative examples to sample.
 
 # We pick a random validation set to sample nearest neighbors. Here we limit the
 # validation samples to the words that have a low numeric ID, which by
 # construction are also the most frequent. These 3 variables are used only for
 # displaying model accuracy, they don't affect calculation.
-valid_size = 16     # Random set of words to evaluate similarity on.
+valid_size = 16  # Random set of words to evaluate similarity on.
 valid_window = 100  # Only pick dev samples in the head of the distribution.
 valid_examples = np.random.choice(valid_window, valid_size, replace=False)
-
 
 graph = tf.Graph()
 
 with graph.as_default():
 
   # Input data.
-  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
-  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
-  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
+  with tf.name_scope('inputs'):
+    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
+    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
+    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
 
   # Ops and variables pinned to the CPU because of missing GPU implementation
   with tf.device('/cpu:0'):
     # Look up embeddings for inputs.
-    embeddings = tf.Variable(
-        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
-    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
+    with tf.name_scope('embeddings'):
+      embeddings = tf.Variable(
+          tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
+      embed = tf.nn.embedding_lookup(embeddings, train_inputs)
 
     # Construct the variables for the NCE loss
-    nce_weights = tf.Variable(
-        tf.truncated_normal([vocabulary_size, embedding_size],
-                            stddev=1.0 / math.sqrt(embedding_size)))
-    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
+    with tf.name_scope('weights'):
+      nce_weights = tf.Variable(
+          tf.truncated_normal(
+              [vocabulary_size, embedding_size],
+              stddev=1.0 / math.sqrt(embedding_size)))
+    with tf.name_scope('biases'):
+      nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
 
   # Compute the average NCE loss for the batch.
   # tf.nce_loss automatically draws a new sample of the negative labels each
   # time we evaluate the loss.
   # Explanation of the meaning of NCE loss:
   #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
-  loss = tf.reduce_mean(
-      tf.nn.nce_loss(weights=nce_weights,
-                     biases=nce_biases,
-                     labels=train_labels,
-                     inputs=embed,
-                     num_sampled=num_sampled,
-                     num_classes=vocabulary_size))
+  with tf.name_scope('loss'):
+    loss = tf.reduce_mean(
+        tf.nn.nce_loss(
+            weights=nce_weights,
+            biases=nce_biases,
+            labels=train_labels,
+            inputs=embed,
+            num_sampled=num_sampled,
+            num_classes=vocabulary_size))
+
+  # Add the loss value as a scalar to summary.
+  tf.summary.scalar('loss', loss)
 
   # Construct the SGD optimizer using a learning rate of 1.0.
-  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
+  with tf.name_scope('optimizer'):
+    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
 
   # Compute the cosine similarity between minibatch examples and all embeddings.
-  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
+  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
   normalized_embeddings = embeddings / norm
-  valid_embeddings = tf.nn.embedding_lookup(
-      normalized_embeddings, valid_dataset)
+  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
+                                            valid_dataset)
   similarity = tf.matmul(
       valid_embeddings, normalized_embeddings, transpose_b=True)
 
+  # Merge all summaries.
+  merged = tf.summary.merge_all()
+
   # Add variable initializer.
   init = tf.global_variables_initializer()
 
+  # Create a saver.
+  saver = tf.train.Saver()
+
 # Step 5: Begin training.
 num_steps = 100001
 
 with tf.Session(graph=graph) as session:
+  # Open a writer to write summaries.
+  writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)
+
   # We must initialize all variables before we use them.
   init.run()
   print('Initialized')
 
   average_loss = 0
   for step in xrange(num_steps):
-    batch_inputs, batch_labels = generate_batch(
-        batch_size, num_skips, skip_window)
+    batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
                                                skip_window)
     feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
 
+    # Define metadata variable.
+    run_metadata = tf.RunMetadata()
+
     # We perform one update step by evaluating the optimizer op (including it
     # in the list of returned values for session.run()
-    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
+    # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
+    # Feed metadata variable to session for visualizing the graph in TensorBoard.
+    _, summary, loss_val = session.run(
+        [optimizer, merged, loss],
+        feed_dict=feed_dict,
+        run_metadata=run_metadata)
     average_loss += loss_val
 
+    # Add returned summaries to writer in each step.
+    writer.add_summary(summary, step)
+    # Add metadata to visualize the graph for the last run.
+    if step == (num_steps - 1):
+      writer.add_run_metadata(run_metadata, 'step%d' % step)
+
     if step % 2000 == 0:
       if step > 0:
         average_loss /= 2000
@@ -240,6 +297,23 @@ with tf.Session(graph=graph) as session:
       print(log_str)
   final_embeddings = normalized_embeddings.eval()
 
+  # Write corresponding labels for the embeddings.
+  with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
+    for i in xrange(vocabulary_size):
+      f.write(reverse_dictionary[i] + '\n')
+
+  # Save the model for checkpoints.
+  saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))
+
+  # Create a configuration for visualizing embeddings with the labels in TensorBoard.
+  config = projector.ProjectorConfig()
+  embedding_conf = config.embeddings.add()
+  embedding_conf.tensor_name = embeddings.name
+  embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
+  projector.visualize_embeddings(writer, config)
+
+writer.close()
+
 # Step 6: Visualize the embeddings.
 
@@ -251,21 +325,24 @@ def plot_with_labels(low_dim_embs, labels, filename):
   for i, label in enumerate(labels):
     x, y = low_dim_embs[i, :]
     plt.scatter(x, y)
-    plt.annotate(label,
-                 xy=(x, y),
-                 xytext=(5, 2),
-                 textcoords='offset points',
-                 ha='right',
-                 va='bottom')
+    plt.annotate(
+        label,
+        xy=(x, y),
+        xytext=(5, 2),
+        textcoords='offset points',
+        ha='right',
+        va='bottom')
   plt.savefig(filename)
 
+
 try:
   # pylint: disable=g-import-not-at-top
   from sklearn.manifold import TSNE
   import matplotlib.pyplot as plt
 
-  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
+  tsne = TSNE(
+      perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
   plot_only = 500
   low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
   labels = [reverse_dictionary[i] for i in xrange(plot_only)]
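Two details in the word2vec diff are easy to miss: keep_dims was renamed to keepdims (matching NumPy's spelling), and the similarity matrix works because the embeddings are L2-normalized first, so a plain matrix product of unit-length rows yields cosine similarities. A NumPy sketch of that computation (shapes are illustrative, not the tutorial's):

import numpy as np

emb = np.random.rand(5, 4).astype(np.float32)  # 5 toy "word vectors"
norm = np.sqrt(np.sum(np.square(emb), axis=1, keepdims=True))
normalized = emb / norm                  # each row now has unit length
sim = normalized.dot(normalized.T)       # cosine similarity matrix
print(np.allclose(np.diag(sim), 1.0))    # True: each vector matches itself
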