diff options
Diffstat (limited to 'tensorflow/g3doc/tutorials')
24 files changed, 4008 insertions, 0 deletions
diff --git a/tensorflow/g3doc/tutorials/BUILD b/tensorflow/g3doc/tutorials/BUILD new file mode 100644 index 0000000000..5642ade160 --- /dev/null +++ b/tensorflow/g3doc/tutorials/BUILD @@ -0,0 +1,19 @@ +# Description: +# Top-level tutorials files + +package(default_visibility = ["//tensorflow:internal"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), +) diff --git a/tensorflow/g3doc/tutorials/__init__.py b/tensorflow/g3doc/tutorials/__init__.py new file mode 100755 index 0000000000..e69de29bb2 --- /dev/null +++ b/tensorflow/g3doc/tutorials/__init__.py diff --git a/tensorflow/g3doc/tutorials/deep_cnn/cifar_tensorboard.html b/tensorflow/g3doc/tutorials/deep_cnn/cifar_tensorboard.html new file mode 100644 index 0000000000..266faf042e --- /dev/null +++ b/tensorflow/g3doc/tutorials/deep_cnn/cifar_tensorboard.html @@ -0,0 +1,21 @@ +<html> + +<head> + <title>TensorBoard Demo</title> + <script src="/tensorboard/webcomponents-lite.min.js"></script> + <link rel="import" href="/tensorboard/tf-tensorboard-demo.html"> + <style> + + html,body { + margin: 0; + padding: 0; + height: 100%; + font-family: "RobotoDraft","Roboto",sans-serif; + } + +</style> +</head> +<body> + <tf-tensorboard-demo data-dir="/tensorboard/cifar"></tf-tensorboard-demo> +</body> +</html> diff --git a/tensorflow/g3doc/tutorials/deep_cnn/index.md b/tensorflow/g3doc/tutorials/deep_cnn/index.md new file mode 100644 index 0000000000..f40a94ba7a --- /dev/null +++ b/tensorflow/g3doc/tutorials/deep_cnn/index.md @@ -0,0 +1,462 @@ +# Convolutional Neural Networks for Object Recognition + +**NOTE:** This tutorial is intended for *advanced* users of TensorFlow +and assumes expertise and experience in machine learning. + +## Overview + +CIFAR-10 classification is a common benchmark problem in machine learning. 
The +problem is to classify RGB 32x32 pixel images across 10 categories: +```airplane, automobile, bird, cat, deer, dog, frog, horse, ship, and truck.``` + +![CIFAR-10 Samples](./cifar_samples.png "CIFAR-10 Samples, from http://www.cs.toronto.edu/~kriz/cifar.html") + +For more details refer to the [CIFAR-10 page](http://www.cs.toronto.edu/~kriz/cifar.html) +and a [Tech Report](http://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf) +by Alex Krizhevsky. + +### Goals + +The goal of this tutorial is to build a relatively small convolutional neural +network (CNN) for recognizing images. In the process this tutorial: + +1. Highlights a canonical organization for network architecture, +training and evaluation. +2. Provides a template for constructing larger and more sophisticated models. + +The reason CIFAR-10 was selected was because it contains enough complexity to +exercise much of TensorFlow's ability to scale to large models. At the same +time, the model is small enough to train fast in order to test new ideas and +experiments. + +### Highlights of the Tutorial +The CIFAR-10 tutorial demonstrates several important constructs for +designing larger and more sophisticated models in TensorFlow: + +* Core mathematical components including +[convolution](../../api_docs/python/nn.md#conv2d), +[rectified linear activations](../../api_docs/python/nn.md#relu), +[max pooling](../../api_docs/python/nn.md#max_pool) and +[local response normalization](../../api_docs/python/nn.md#local_response_normalization). +* [Visualization](../../how_tos/summaries_and_tensorboard/index.md) +of network activity during training including input images, +losses and distributions of activations and gradients. +* Routines for calculating the +[moving average](../../api_docs/python/train.md#ExponentialMovingAverage) +of learned parameters and using these averages +during evaluation to boost predictive performance. 
+* Implementation of a +[learning rate schedule](../../api_docs/python/train.md#exponential_decay) +that systematically decrements over time. +* Prefetching [queues](../../api_docs/python/io_ops.md#shuffle_batch) +for input +data to isolate the model from disk latency and expensive image pre-processing. + +We also provide a multi-GPU version of the model which demonstrates: + +* Configuring a model to train across multiple GPU cards in parallel. +* Sharing and updating variables between multiple GPUs. + +We hope that this tutorial provides a launch point for building larger CNNs for +vision tasks on TensorFlow. + +### Model Architecture + +The model in this CIFAR-10 tutorial is a multi-layer architecture consisting of +alternating convolutions and nonlinearities. These layers are followed by fully +connected layers leading into a softmax classifier. The model follows the +architecture described by +[Alex Krizhevsky](https://code.google.com/p/cuda-convnet/), with a few +differences in the top few layers. + +This model achieves a peak performance of about 86% accuracy within a few hours +of training time on a GPU. Please see [below](#evaluating-a-model) and the code +for details. It consists of 1,068,298 learnable parameters and requires about +19.5M multiply-add operations to compute inference on a single image. + +## Code Organization + +The code for this tutorial resides in +[`tensorflow/models/image/cifar10/`](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/image/cifar10/). + +File | Purpose +--- | --- +[`cifar10_input.py`](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/image/cifar10/cifar10_input.py) | Read the native CIFAR-10 binary file format. +[`cifar10.py`](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/image/cifar10/cifar10.py) | Build the CIFAR-10 model. 
+[`cifar10_train.py`](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/image/cifar10/cifar10_train.py) | Train a CIFAR-10 model on a single machine. +[`cifar10_multi_gpu_train.py`](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/image/cifar10/cifar10_multi_gpu_train.py) | Train a CIFAR-10 model on multiple GPUs. +[`cifar10_eval.py`](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model. + + +## CIFAR-10 Model + +The CIFAR-10 network is largely contained in +[`cifar10.py`](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/image/cifar10/cifar10.py). +The complete training +graph contains roughly 765 operations. We find that we can make the code most +reusable by constructing the graph with the following modules: + +1. [**Model inputs:**](#model-inputs) `inputs()` and `distorted_inputs()` add +operations that read and preprocess CIFAR images for evaluation and training, +respectively. +1. [**Model prediction:**](#model-prediction) `inference()` +adds operations that perform inference, i.e. classification, on supplied images. +1. [**Model training:**](#model-training) `loss()` and `train()` +add operations that compute the loss, +gradients, variable updates and visualization summaries. + +### Model Inputs + +The input part of the model is built by the functions `inputs()` and +`distorted_inputs()` which read images from the CIFAR-10 binary data files. +These files contain fixed byte length records, so we use +[`tf.FixedLengthRecordReader`](../../api_docs/python/io_ops.md#FixedLengthRecordReader). +See [`Reading Data`](../../how_tos/reading_data/index.md#reading-from-files) to +learn more about how the `Reader` class works. 
+ +The images are processed as follows: + +* They are cropped to 24 x 24 pixels, centrally for evaluation or + [randomly](../../api_docs/python/image.md#random_crop) for training. +* They are [approximately whitened](../../api_docs/python/image.md#per_image_whitening) + to make the model insensitive to dynamic range. + +For training, we additionally apply a series of random distortions to +artificially increase the data set size: + +* [Randomly flip](../../api_docs/python/image.md#random_flip_left_right) the image from left to right. +* Randomly distort the [image brightness](../../api_docs/python/image.md#random_brightness). +* Randomly distort the [image contrast](../../api_docs/python/image.md#tf_image_random_contrast). + +Please see the [`Images`](../../api_docs/python/image.md) page for the list of +available distortions. We also attach an +[`image_summary`](../../api_docs/python/train.md?#image_summary) to the images +so that we may visualize them in TensorBoard. This is a good practice to verify +that inputs are built correctly. + +<div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;"> + <img style="width:70%" src="./cifar_image_summary.png"> +</div> + +Reading images from disk and distorting them can use a non-trivial amount of +processing time. To prevent these operations from slowing down training, we run +them inside 16 separate threads which continuously fill a TensorFlow +[queue](../../api_docs/python/io_ops.md#shuffle_batch). + +### Model Prediction + +The prediction part of the model is constructed by the `inference()` function +which adds operations to compute the *logits* of the predictions. That part of +the model is organized as follows: + +Layer Name | Description +--- | --- +`conv1` | [convolution](../../api_docs/python/nn.md#conv2d) and [rectified linear](../../api_docs/python/nn.md#relu) activation. +`pool1` | [max pooling](../../api_docs/python/nn.md#max_pool). 
+`norm1` | [local response normalization](../../api_docs/python/nn.md#local_response_normalization). +`conv2` | [convolution](../../api_docs/python/nn.md#conv2d) and [rectified linear](../../api_docs/python/nn.md#relu) activation. +`norm2` | [local response normalization](../../api_docs/python/nn.md#local_response_normalization). +`pool2` | [max pooling](../../api_docs/python/nn.md#max_pool). +`local3` | [fully connected layer with rectified linear activation](../../api_docs/python/nn.md). +`local4` | [fully connected layer with rectified linear activation](../../api_docs/python/nn.md). +`softmax_linear` | linear transformation to produce logits. + +Here is a graph generated from TensorBoard describing the inference operation: + +<div style="width:15%; margin:auto; margin-bottom:10px; margin-top:20px;"> + <img style="width:100%" src="./cifar_graph.png"> +</div> + +> **EXERCISE**: The outputs of `inference` are un-normalized logits. Try editing +the network architecture to return normalized predictions using +[`tf.nn.softmax()`](../../api_docs/python/nn.md#softmax). + +The `inputs()` and `inference()` functions provide all of the components +necessary to perform evaluation on a model. We now shift our focus towards +building operations for training a model. + +> **EXERCISE:** The model architecture in `inference()` differs slightly from +the CIFAR-10 model specified in +[cuda-convnet](https://code.google.com/p/cuda-convnet/). In particular, the top +layers of that model are locally connected and not fully connected. Try editing the +architecture to exactly replicate that locally connected model. + +### Model Training + +The usual method for training a network to perform N-way classification is +[multinomial logistic regression](https://en.wikipedia.org/wiki/Multinomial_logistic_regression), +aka. *softmax regression*. 
Softmax regression applies a +[softmax](../../api_docs/python/nn.md#softmax) nonlinearity to the +output of the network and calculates the +[cross-entropy](../../api_docs/python/nn.md#softmax_cross_entropy_with_logits) +between the normalized predictions and a +[1-hot encoding](../../api_docs/python/sparse_ops.md#sparse_to_dense) of the label. +For regularization, we also apply the usual +[weight decay](../../api_docs/python/nn.md#l2_loss) losses to all learned +variables. The objective function for the model is the sum of the cross entropy +loss and all these weight decay terms, as returned by the `loss()` function. + +We visualize it in TensorBoard with a [scalar_summary](../../api_docs/python/train.md?#scalar_summary): + +[![CIFAR-10 Loss](./cifar_loss.png "CIFAR-10 Total Loss")](#TODO(migmigmig)#TODO(danmane)) + +We train the model using the standard +[gradient descent](https://en.wikipedia.org/wiki/Gradient_descent) +algorithm (see [Training](../../api_docs/python/train.md) for other methods) +with a learning rate that +[exponentially decays](../../api_docs/python/train.md#exponential_decay) +over time. + +[![CIFAR-10 Learning Rate Decay](./cifar_lr_decay.png "CIFAR-10 Learning Rate Decay")](#TODO(migmigmig)#TODO(danmane)) + +The `train()` function adds the operations needed to minimize the objective by +calculating the gradient and updating the learned variables (see +[`GradientDescentOptimizer`](../../api_docs/python/train.md#GradientDescentOptimizer) +for details). It returns an operation that executes all of the calculations +needed to train and update the model for one batch of images. + +## Launching and Training the Model + +We have built the model, let's now launch it and run the training operation with +the script `cifar10_train.py`. + +```shell +python cifar10_train.py +``` + +**NOTE:** The first time you run any target in the CIFAR-10 tutorial, +the CIFAR-10 dataset is automatically downloaded. 
The data set is ~160MB +so you may want to grab a quick cup of coffee for your first run. + +You should see the output: + +```shell +Filling queue with 20000 CIFAR images before starting to train. This will take a few minutes. +2015-11-04 11:45:45.927302: step 0, loss = 4.68 (2.0 examples/sec; 64.221 sec/batch) +2015-11-04 11:45:49.133065: step 10, loss = 4.66 (533.8 examples/sec; 0.240 sec/batch) +2015-11-04 11:45:51.397710: step 20, loss = 4.64 (597.4 examples/sec; 0.214 sec/batch) +2015-11-04 11:45:54.446850: step 30, loss = 4.62 (391.0 examples/sec; 0.327 sec/batch) +2015-11-04 11:45:57.152676: step 40, loss = 4.61 (430.2 examples/sec; 0.298 sec/batch) +2015-11-04 11:46:00.437717: step 50, loss = 4.59 (406.4 examples/sec; 0.315 sec/batch) +... +``` + +The script reports the total loss every 10 steps as well as the speed at which +the last batch of data was processed. A few comments: + +* The first batch of data can be inordinately slow (e.g. several minutes) as the +preprocessing threads fill up the shuffling queue with 20,000 processed CIFAR +images. + +* The reported loss is the average loss of the most recent batch. Remember that +this loss is the sum of the cross entropy and all weight decay terms. + +* Keep an eye on the processing speed of a batch. The numbers shown above were +run on a Tesla K40c. If you are running on a CPU, expect slower performance. + + +> **EXERCISE:** When experimenting, it is sometimes annoying that the first +training step can take so long. Try decreasing the number of images +that initially fill up the queue. Search for `NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN` +in `cifar10.py`. + +`cifar10_train.py` periodically [saves](../../api_docs/python/state_ops.md#Saver) +all model parameters in +[checkpoint files](../../how_tos/variables.md#saving-and-restoring) +but it does *not* evaluate the model. 
The checkpoint file +will be used by `cifar10_eval.py` to measure the predictive +performance (see [Evaluating a Model](#evaluating-a-model) below). + + +If you followed the previous steps, then you have now started training +a CIFAR-10 model. [Congratulations!](https://www.youtube.com/watch?v=9bZkp7q19f0) + +The terminal text returned from `cifar10_train.py` provides minimal insight into +how the model is training. We want more insight into the model during training: + +* Is the loss *really* decreasing or is that just noise? +* Is the model being provided appropriate images? +* Are the gradients, activations and weights reasonable? +* What is the learning rate currently at? + +[TensorBoard](../../how_tos/summaries_and_tensorboard/index.md) provides this +functionality, displaying data exported periodically from `cifar10_train.py` via +a +[`SummaryWriter`](../../api_docs/python/train.md#SummaryWriter). + +For instance, we can watch how the distribution of activations and degree of +sparsity in `local3` features evolve during training: + +<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px; display: flex; flex-direction: row"> + <img style="flex-grow:1; flex-shrink:1;" src="./cifar_sparsity.png"> + <img style="flex-grow:1; flex-shrink:1;" src="./cifar_activations.png"> +</div> + +Individual loss functions, as well as the total loss, are particularly +interesting to track over time. However, the loss exhibits a considerable amount +of noise due to the small batch size employed by training. In practice we find +it extremely useful to visualize their moving averages in addition to their raw +values. See how the scripts use +[ExponentialMovingAverage](../../api_docs/python/train.md#ExponentialMovingAverage) +for this purpose. + +## Evaluating a Model + +Let us now evaluate how well the trained model performs on a hold-out data set. +The model is evaluated by the script `cifar10_eval.py`. 
It constructs the model +with the `inference()` function and uses all 10,000 images in the evaluation set +of CIFAR-10. It calculates the *precision at 1:* how often the top prediction +matches the true label of the image. + +To monitor how the model improves during training, the evaluation script runs +periodically on the latest checkpoint files created by the `cifar10_train.py`. + +```shell +python cifar10_eval.py +``` + +> Be careful not to run the evaluation and training binary on the same GPU or +else you might run out of memory. Consider running the evaluation on +a separate GPU if available or suspending the training binary while running +the evaluation on the same GPU. + +You should see the output: + +```shell +2015-11-06 08:30:44.391206: precision @ 1 = 0.860 +... +``` + +The script merely returns the precision @ 1 periodically -- in this case +it returned 86% accuracy. `cifar10_eval.py` also +exports summaries that may be visualized in TensorBoard. These summaries +provide additional insight into the model during evaluation. + +The training script calculates the +[moving average](../../api_docs/python/train.md#ExponentialMovingAverage) +version of all learned variables. The evaluation script substitutes +all learned model parameters with the moving average version. This +substitution boosts model performance at evaluation time. + +> **EXERCISE:** Employing averaged parameters may boost predictive performance +by about 3% as measured by precision@1. Edit `cifar10_eval.py` to not employ the +averaged parameters for the model and verify that the predictive performance +drops. + + +## Training a Model Using Multiple GPU Cards + +Modern workstations may contain multiple GPUs for scientific computation. +TensorFlow can leverage this environment to run the training operation +concurrently across multiple cards. + +Training a model in a parallel, distributed fashion requires +coordinating training processes. 
For what follows we term *model replica* +to be one copy of a model training on a subset of data. + +Naively employing asynchronous updates of model parameters +leads to sub-optimal training performance +because an individual model replica might be trained on a stale +copy of the model parameters. Conversely, employing fully synchronous +updates will be as slow as the slowest model replica. + +In a workstation with multiple GPU cards, each GPU will have similar speed +and contain enough memory to run an entire CIFAR-10 model. Thus, we opt to +design our training system in the following manner: + +* Place an individual model replica on each GPU. +* Update model parameters synchronously by waiting for all GPUs to finish +processing a batch of data. + +Here is a diagram of this model: + +<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;"> + <img style="width:100%" src="./Parallelism.png"> +</div> + +Note that each GPU computes inference as well as the gradients for a unique +batch of data. This setup effectively permits dividing up a larger batch +of data across the GPUs. + +This setup requires that all GPUs share the model parameters. A well-known +fact is that transferring data to and from GPUs is quite slow. For this +reason, we decide to store and update all model parameters on the CPU (see +green box). A fresh set of model parameters are transferred to the GPU +when a new batch of data is processed by all GPUs. + +The GPUs are synchronized in operation. All gradients are accumulated from +the GPUs and averaged (see green box). The model parameters are updated with +the gradients averaged across all model replicas. + +### Placing Variables and Operations on Devices + +Placing operations and variables on devices requires some special +abstractions. + +The first abstraction we require is a function for computing inference and +gradients for a single model replica. In the code we term this abstraction +a *tower*. 
We must set two attributes for each tower: + +* A unique name for all operations within a tower. +[`tf.name_scope()`](../../api_docs/python/framework.md#name_scope) provides +this unique name by prepending a scope. For instance, all operations in +the first tower are prepended with `tower_0`, e.g. `tower_0/conv1/Conv2D`. + +* A preferred hardware device to run the operation within a tower. +[`tf.device()`](../../api_docs/python/framework.md#device) specifies this. For +instance, all operations in the first tower reside within `device('/gpu:0')` +scope indicating that they should be run on the first GPU. + +All variables are pinned to the CPU and accessed via +[`tf.get_variable()`](../../api_docs/python/state_ops.md#get_variable) +in order to share them in a multi-GPU version. +See how-to on [Sharing Variables](../../how_tos/variable_scope/index.md). + +### Launching and Training the Model on Multiple GPU cards + +If you have several GPU cards installed on your machine you can use them to +train the model faster with the `cifar10_multi_gpu_train.py` script. It is a +variation of the training script that parallelizes the model across multiple GPU +cards. + +```shell +python cifar10_multi_gpu_train.py --num_gpus=2 +``` + +The training script should output: + +```shell +Filling queue with 20000 CIFAR images before starting to train. This will take a few minutes. +2015-11-04 11:45:45.927302: step 0, loss = 4.68 (2.0 examples/sec; 64.221 sec/batch) +2015-11-04 11:45:49.133065: step 10, loss = 4.66 (533.8 examples/sec; 0.240 sec/batch) +2015-11-04 11:45:51.397710: step 20, loss = 4.64 (597.4 examples/sec; 0.214 sec/batch) +2015-11-04 11:45:54.446850: step 30, loss = 4.62 (391.0 examples/sec; 0.327 sec/batch) +2015-11-04 11:45:57.152676: step 40, loss = 4.61 (430.2 examples/sec; 0.298 sec/batch) +2015-11-04 11:46:00.437717: step 50, loss = 4.59 (406.4 examples/sec; 0.315 sec/batch) +... +``` + +Note that the number of GPU cards used defaults to 1. 
Additionally, if only 1 +GPU is available on your machine, all computations will be placed on it, even if +you ask for more. + +> **EXERCISE:** The default setting for `cifar10_train.py` is to +run on a batch size of 128. Try running `cifar10_multi_gpu_train.py` on 2 GPUs +with a batch size of 64 and compare the training speed. + +## Next Steps + +[Congratulations!](https://www.youtube.com/watch?v=9bZkp7q19f0) You have +completed the CIFAR-10 tutorial. + +If you are now interested in developing and training your own image +classification system, we recommend forking this tutorial and replacing +components to address your image classification problem. + +> **EXERCISE:** Download the +[Street View House Numbers (SVHN)](http://ufldl.stanford.edu/housenumbers/) data set. +Fork the CIFAR-10 tutorial and swap in the SVHN as the input data. Try adapting +the network architecture to improve predictive performance. + + + diff --git a/tensorflow/g3doc/tutorials/index.md b/tensorflow/g3doc/tutorials/index.md new file mode 100644 index 0000000000..726a5c6687 --- /dev/null +++ b/tensorflow/g3doc/tutorials/index.md @@ -0,0 +1,142 @@ +# Overview + + +## ML for Beginners + +If you're new to machine learning, we recommend starting here. You'll learn +about a classic problem, handwritten digit classification (MNIST), and get a +gentle introduction to multiclass classification. + +[View Tutorial](mnist/beginners/index.md) + + +## MNIST for Pros + +If you're already familiar with other deep learning software packages, and are +already familiar with MNIST, this tutorial will give you a very brief primer on +TensorFlow. + +[View Tutorial](mnist/pros/index.md) + + +## TensorFlow Mechanics 101 + +This is a technical tutorial, where we walk you through the details of using +TensorFlow infrastructure to train models at scale. We again use MNIST as the +example. 
+ +[View Tutorial](mnist/tf/index.md) + + +## Convolutional Neural Networks + +An introduction to convolutional neural networks using the CIFAR-10 data set. +Convolutional neural nets are particularly tailored to images, since they +exploit translation invariance to yield more compact and effective +representations of visual content. + +[View Tutorial](deep_cnn/index.md) + + +## Vector Representations of Words + +This tutorial motivates why it is useful to learn to represent words as vectors +(called *word embeddings*). It introduces the word2vec model as an efficient +method for learning embeddings. It also covers the high-level details behind +noise-contrastive training methods (the biggest recent advance in training +embeddings). + +[View Tutorial](word2vec/index.md) + + +## Recurrent Neural Networks + +An introduction to RNNs, wherein we train an LSTM network to predict the next +word in an English sentence. (A task sometimes called language modeling.) + +[View Tutorial](recurrent/index.md) + + +## Sequence-to-Sequence Models + +A follow on to the RNN tutorial, where we assemble a sequence-to-sequence model +for machine translation. You will learn to build your own English-to-French +translator, entirely machine learned, end-to-end. + +[View Tutorial](seq2seq/index.md) + + +## Mandelbrot Set + +TensorFlow can be used for computation that has nothing to do with machine +learning. Here's a naive implementation of Mandelbrot set visualization. + +[View Tutorial](mandelbrot/index.md) + + +## Partial Differential Equations + +As another example of non-machine learning computation, we offer an example of +a naive PDE simulation of raindrops landing on a pond. + +[View Tutorial](pdes/index.md) + + +## MNIST Data Download + +Details about downloading the MNIST handwritten digits data set. Exciting +stuff. 
+ +[View Tutorial](mnist/download/index.md) + + +## Sparse Linear Regression + +In many practical machine learning settings we have a large number input +features, only very few of which are active for any given example. TensorFlow +has great tools for learning predictive models in these settings. + +COMING SOON + + +## Visual Object Recognition + +We will be releasing our state-of-the-art Inception object recognition model, +complete and already trained. + +COMING SOON + + +## Deep Dream Visual Hallucinations + +Building on the Inception recognition model, we will release a TensorFlow +version of the [Deep Dream](https://github.com/google/deepdream) neural network +visual hallucination software. + +COMING SOON + + +## Automated Image Captioning + +TODO(vinyals): Write me, three lines max. + +COMING SOON + + + +<div class='sections-order' style="display: none;"> +<!-- +<!-- mnist/beginners/index.md --> +<!-- mnist/pros/index.md --> +<!-- mnist/tf/index.md --> +<!-- deep_cnn/index.md --> +<!-- word2vec/index.md --> +<!-- recurrent/index.md --> +<!-- seq2seq/index.md --> +<!-- mandelbrot/index.md --> +<!-- pdes/index.md --> +<!-- mnist/download/index.md --> +--> +</div> + + diff --git a/tensorflow/g3doc/tutorials/mandelbrot/index.md b/tensorflow/g3doc/tutorials/mandelbrot/index.md new file mode 100755 index 0000000000..7c6adcb4e8 --- /dev/null +++ b/tensorflow/g3doc/tutorials/mandelbrot/index.md @@ -0,0 +1,97 @@ + + +``` +#Import libraries for simulation +import tensorflow as tf +import numpy as np + +#Imports for visualization +import PIL.Image +from cStringIO import StringIO +from IPython.display import clear_output, Image, display +import scipy.ndimage as nd +``` + + +``` +def DisplayFractal(a, fmt='jpeg'): + """Display an array of iteration counts as a + colorful picture of a fractal.""" + a_cyclic = (6.28*a/20.0).reshape(list(a.shape)+[1]) + img = np.concatenate([10+20*np.cos(a_cyclic), + 30+50*np.sin(a_cyclic), + 155-80*np.cos(a_cyclic)], 2) + img[a==a.max()] = 
0 + a = img + a = np.uint8(np.clip(a, 0, 255)) + f = StringIO() + PIL.Image.fromarray(a).save(f, fmt) + display(Image(data=f.getvalue())) +``` + + +``` +sess = tf.InteractiveSession() +``` + + Exception AssertionError: AssertionError() in <bound method InteractiveSession.__del__ of <tensorflow.python.client.session.InteractiveSession object at 0x6247390>> ignored + + + +``` +# Use NumPy to create a 2D array of complex numbers on [-2,2]x[-2,2] + +Y, X = np.mgrid[-1.3:1.3:0.005, -2:1:0.005] +Z = X+1j*Y +``` + + +``` +xs = tf.constant(Z.astype("complex64")) +zs = tf.Variable(xs) +ns = tf.Variable(tf.zeros_like(xs, "float32")) +``` + + +``` +tf.InitializeAllVariables().run() +``` + + +``` +# Compute the new values of z: z^2 + x +zs_ = zs*zs + xs + +# Have we diverged with this new value? +not_diverged = tf.complex_abs(zs_) < 4 + +# Operation to update the zs and the iteration count. +#t +# Note: We keep computing zs after they diverge! This +# is very wasteful! There are better, if a little +# less simple, ways to do this. 
+# +step = tf.group( + zs.assign(zs_), + ns.assign_add(tf.cast(not_diverged, "float32")) + ) +``` + + +``` +for i in range(200): step.run() +``` + + +``` +DisplayFractal(ns.eval()) +``` + + +![jpeg](output_8_0.jpe) + + + +``` + +``` diff --git a/tensorflow/g3doc/tutorials/mandelbrot/output_8_0.jpe b/tensorflow/g3doc/tutorials/mandelbrot/output_8_0.jpe Binary files differnew file mode 100755 index 0000000000..8e261d44a8 --- /dev/null +++ b/tensorflow/g3doc/tutorials/mandelbrot/output_8_0.jpe diff --git a/tensorflow/g3doc/tutorials/mnist/__init__.py b/tensorflow/g3doc/tutorials/mnist/__init__.py new file mode 100755 index 0000000000..e69de29bb2 --- /dev/null +++ b/tensorflow/g3doc/tutorials/mnist/__init__.py diff --git a/tensorflow/g3doc/tutorials/mnist/beginners/index.md b/tensorflow/g3doc/tutorials/mnist/beginners/index.md new file mode 100644 index 0000000000..8ccb69d977 --- /dev/null +++ b/tensorflow/g3doc/tutorials/mnist/beginners/index.md @@ -0,0 +1,420 @@ +# MNIST Softmax Regression (For Beginners) + +*This tutorial is intended for readers who are new to both machine learning and +TensorFlow. If you already +know what MNIST is, and what softmax (multinomial logistic) regression is, +you might prefer this [faster paced tutorial](../pros/index.md).* + +When one learns how to program, there's a tradition that the first thing you do +is print "Hello World." Just like programming has Hello World, machine learning +has MNIST. + +MNIST is a simple computer vision dataset. It consists of images of handwritten +digits like these: + +<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/MNIST.png"> +</div> + +It also includes labels for each image, telling us which digit it is. For +example, the labels for the above images are 5, 0, 4, and 1. + +In this tutorial, we're going to train a model to look at images and predict +what digits they are. 
Our goal isn't to train a really elaborate model that +achieves state-of-the-art performance -- although we'll give you code to do that +later! -- but rather to dip a toe into using TensorFlow. As such, we're going +to start with a very simple model, called a Softmax Regression. + +The actual code for this tutorial is very short, and all the interesting +stuff happens in just three lines. However, it is very +important to understand the ideas behind it: both how TensorFlow works and the +core machine learning concepts. Because of this, we are going to very carefully +work through the code. + +## The MNIST Data + +The MNIST data is hosted on +[Yann LeCun's website](http://yann.lecun.com/exdb/mnist/). +For your convenience, we've included some python code to download and install +the data automatically. You can either download [the code](../input_data.py) and +import it as below, or simply copy and paste it in. + +```python +import input_data +mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) +``` + +The downloaded data is split into three parts: 55,000 data points of training +data (`mnist.train`), 10,000 points of test data (`mnist.test`), and 5,000 +points of validation data (`mnist.validation`) held out from the original +60,000 training examples. This +split is very important: it's essential in machine learning that we +have separate data which we don't learn from so that we can make sure +that what we've learned actually generalizes! + +As mentioned earlier, every MNIST data point has two parts: an image of a +handwritten digit and a corresponding label. We will call the images "xs" and +the labels "ys". Both the training set and test set contain xs and ys, for +example the training images are `mnist.train.images` and the train labels are +`mnist.train.labels`. + +Each image is 28 pixels by 28 pixels. We can interpret this as a big array of +numbers: + +<div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/MNIST-Matrix.png"> +</div> + +We can flatten this array into a vector of 28x28 = 784 numbers. 
It doesn't +matter how we flatten the array, as long as we're consistent between images. +From this perspective, the MNIST images are just a bunch of points in a +784-dimensional vector space, with a +[very rich structure](http://colah.github.io/posts/2014-10-Visualizing-MNIST/) +(warning: computationally intensive visualizations). + +Flattening the data throws away information about the 2D structure of the image. +Isn't that bad? Well, the best computer vision methods do exploit this +structure, and we will in later tutorials. But the simple method we will be +using here, a softmax regression, won't. + +The result is that `mnist.train.images` is a tensor (an n-dimensional array) with a +shape of `[55000, 784]`. The first dimension indexes the images and the second +dimension indexes the pixels in each image. Each entry in the tensor is the +pixel intensity between 0 and 1, for a particular pixel in a particular image. + +<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/mnist-train-xs.png"> +</div> + +The corresponding labels in MNIST are numbers between 0 and 9, describing +which digit a given image is of. +For the purposes of this tutorial, we're going to want our labels +as "one-hot vectors". A one-hot vector is a vector which is 0 in most +dimensions, and 1 in a single dimension. In this case, the $$n$$th digit will be +represented as a vector which is 1 in the $$n$$th dimension. For example, 0 +would be $$[1,0,0,0,0,0,0,0,0,0]$$. +Consequently, `mnist.train.labels` is a +`[55000, 10]` array of floats. + +<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/mnist-train-ys.png"> +</div> + +We're now ready to actually make our model! + +## Softmax Regressions + +We know that every image in MNIST is a digit, whether it's a zero or a nine. We +want to be able to look at an image and give probabilities for it being each +digit. 
For example, our model might look at a picture of a nine and be 80% sure +it's a nine, but give a 5% chance to it being an eight (because of the top loop) +and a bit of probability to all the others because it isn't sure. + +This is a classic case where a softmax regression is a natural, simple model. +If you want to assign probabilities to an object being one of several different +things, softmax is the thing to do. Even later on, when we train more +sophisticated models, the final step will be a layer of softmax. + +A softmax regression has two steps: first we add up the evidence of our input +being in certain classes, and then we convert that evidence into probabilities. + +To tally up the evidence that a given image is in a particular class, we do a +weighted sum of the pixel intensities. The weight is negative if that pixel +having a high intensity is evidence against the image being in that class, +and positive if it is evidence in favor. + +The following diagram shows the weights one model learned for each of these +classes. Red represents negative weights, while blue represents positive +weights. + +<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/softmax-weights.png"> +</div> + +We also add some extra evidence called a bias. Basically, we want to be able +to say that some things are more likely independent of the input. The result is +that the evidence for a class $$i$$ given an input $$x$$ is: + +$$\text{evidence}_i = \sum_j W_{i,~ j} x_j + b_i$$ + +where $$W_i$$ is the weights and $$b_i$$ is the bias for class $$i$$, and $$j$$ +is an index for summing over the pixels in our input image $$x$$. 
We then +convert the evidence tallies into our predicted probabilities +$$y$$ using the "softmax" function: + +$$y = \text{softmax}(\text{evidence})$$ + +Here softmax is serving as an "activation" or "link" function, shaping +the output of our linear function into the form we want -- in this case, a +probability distribution over 10 cases. +You can think of it as converting tallies +of evidence into probabilities of our input being in each class. +It's defined as: + +$$\text{softmax}(x) = \text{normalize}(\exp(x))$$ + +If you expand that equation out, you get: + +$$\text{softmax}(x)_i = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$$ + +But it's often more helpful to think of softmax the first way: +exponentiating its inputs and then normalizing them. The exponentiation +means that one more unit of evidence increases the weight given to any hypothesis +multiplicatively. And conversely, having one less unit of evidence means that a +hypothesis gets a fraction of its earlier weight. No hypothesis ever has zero +or negative weight. Softmax then normalizes these weights, so that they add up +to one, forming a valid probability distribution. (To get more intuition about +the softmax function, check out the +[section](http://neuralnetworksanddeeplearning.com/chap3.html#softmax) +on it in Michael Nielsen's book, complete with an interactive visualization.) + + +You can picture our softmax regression as looking something like the following, +although with a lot more $$x$$s. For each output, we compute a weighted sum of +the $$x$$s, add a bias, and then apply softmax. 
+ +<div style="width:55%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/softmax-regression-scalargraph.png"> +</div> + +If we write that out as equations, we get: + +<div style="width:52%; margin-left:25%; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/softmax-regression-scalarequation.png"> +</div> + +We can "vectorize" this procedure, turning it into a matrix multiplication +and vector addition. This is helpful for computational efficiency. (It's also +a useful way to think.) + +<div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/softmax-regression-vectorequation.png"> +</div> + +More compactly, we can just write: + +$$y = \text{softmax}(Wx + b)$$ + + +## Implementing the Regression + + +To do efficient numerical computing in Python, we typically use libraries like +NumPy that do expensive operations such as matrix multiplication outside Python, +using highly efficient code implemented in another language. +Unfortunately, there can still be a lot of overhead from switching back to +Python every operation. This overhead is especially bad if you want to run +computations on GPUs or in a distributed manner, where there can be a high cost +to transferring data. + +TensorFlow also does its heavy lifting outside python, +but it takes things a step further to avoid this overhead. +Instead of running a single expensive operation independently +from Python, TensorFlow lets us describe a graph of interacting operations that +run entirely outside Python. (Approaches like this can be seen in a few +machine learning libraries.) + +To run computations, TensorFlow needs to connect to its backend. This connection +is called a `Session`. To use TensorFlow, we need to import it and create a +session. 
+ +```python +import tensorflow as tf +sess = tf.InteractiveSession() +``` + +(Using an `InteractiveSession` makes TensorFlow a bit more flexible about how +you structure your code. In particular, it's helpful for work in interactive +contexts like IPython.) + +We describe these interacting operations by manipulating symbolic variables. +Let's create one: + +```python +x = tf.placeholder("float", [None, 784]) +``` + +`x` isn't a specific value. It's a `placeholder`, a value that we'll input when +we ask TensorFlow to run a computation. We want to be able to input any number +of MNIST images, each flattened into a 784-dimensional vector. We represent +this as a 2d tensor of floating point numbers, with a shape `[None, 784]`. +(Here `None` means that a dimension can be of any length.) + +We also need the weights and biases for our model. We could imagine treating +these like additional inputs, but TensorFlow has an even better way to handle +it: `Variable`. +A `Variable` is a modifiable tensor that lives in TensorFlow's graph of +interacting +operations. It can be used and even modified by the computation. For machine +learning applications, one generally has the model parameters be `Variable`s. + +```python +W = tf.Variable(tf.zeros([784,10])) +b = tf.Variable(tf.zeros([10])) +``` + +We create these `Variable`s by giving `tf.Variable` the initial value of the +`Variable`: in this case, we initialize both `W` and `b` as tensors full of +zeros. Since we are going to learn `W` and `b`, it doesn't matter very much +what they initially are. + +Notice that `W` has a shape of [784, 10] because we want to multiply the +784-dimensional image vectors by it to produce 10-dimensional vectors of +evidence for the different classes. `b` has a shape of [10] so we can add it +to the output. + +We can now implement our model. It only takes one line! + +```python +y = tf.nn.softmax(tf.matmul(x,W) + b) +``` + +First, we multiply `x` by `W` with the expression `tf.matmul(x,W)`. 
This is +flipped from when we multiplied them in our equation, where we had $$Wx$$, as a +small trick +to deal with `x` being a 2D tensor with multiple inputs. We then add `b`, and +finally apply `tf.nn.softmax`. + +That's it. It only took us one line to define our model, after a couple short +lines of setup. That isn't because TensorFlow is designed to make a softmax +regression particularly easy: it's just a very flexible way to describe many +kinds of numerical computations, from machine learning models to physics +simulations. And once defined, our model can be run on different devices: +your computer's CPU, GPUs, and even phones! + + +## Training + +In order to train our model, we need to define what it means for the model to +be good. Well, actually, in machine learning we typically define what it means +for a model to be bad, called the cost or loss, and then try to minimize how bad +it is. But the two are equivalent. + +One very common, very nice cost function is "cross-entropy." Surprisingly, +cross-entropy arises from thinking about information compressing codes in +information theory but it winds up being an important idea in lots of areas, +from gambling to machine learning. It's defined: + +$$H_{y'}(y) = -\sum_i y'_i \log(y_i)$$ + +Where $$y$$ is our predicted probability distribution, and $$y'$$ is the true +distribution (the one-hot vector we'll input). In some rough sense, the +cross-entropy is measuring how inefficient our predictions are for describing +the truth. Going into more detail about cross-entropy is beyond the scope of +this tutorial, but it's well worth +[understanding](http://colah.github.io/posts/2015-09-Visual-Information/). 
+ +To implement cross-entropy we need to first add a new placeholder to input +the correct answers: + +```python +y_ = tf.placeholder("float", [None,10]) +``` + +Then we can implement the cross-entropy, $$-\sum y'\log(y)$$: + +```python +cross_entropy = -tf.reduce_sum(y_*tf.log(y)) +``` + +First, `tf.log` computes the logarithm of each element of `y`. Next, we multiply +each element of `y_` with the corresponding element of `tf.log(y)`. Finally, +`tf.reduce_sum` adds all the elements of the tensor. (Note that this isn't +just the cross-entropy of the truth with a single prediction, but the sum of the +cross-entropies for all 100 images we looked at. How well we are doing on 100 +data points is a much better description of how good our model is than a single +data point.) + +Now that we know what we want our model to do, it's very easy to have TensorFlow +train it to do so. +Because TensorFlow knows the entire graph of your computations, it +can automatically use the [backpropagation +algorithm](http://colah.github.io/posts/2015-08-Backprop/) +to efficiently determine how your variables affect the cost you ask it to minimize. +Then it can apply your choice of optimization algorithm to modify the variables +and reduce the cost. + +```python +train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy) +``` + +In this case, we ask TensorFlow to minimize `cross_entropy` using the gradient +descent algorithm with a learning rate of 0.01. Gradient descent is a simple +procedure, where TensorFlow simply shifts each variable a little bit in the +direction that reduces the cost. But TensorFlow also provides +[many other optimization algorithms](../../../api_docs/python/train.md?#optimizers): +using one is as simple as +tweaking one line. + +What TensorFlow actually does here, behind the scenes, is it adds new operations +to your graph which +implement backpropagation and gradient descent. 
Then it gives you back a +single operation which, when run, will do a step of gradient descent training, +slightly tweaking your variables to reduce the cost. + +Now we have our model set up to train. But before we start, we need to +initialize the variables we created: + +```python +tf.initialize_all_variables().run() +``` + +Let's train -- we'll run the training step 1000 times! + +```python +for i in range(1000): + batch_xs, batch_ys = mnist.train.next_batch(100) + train_step.run({x: batch_xs, y_: batch_ys}) +``` + +Each step of the loop, we get a "batch" of one hundred random data points from +our training set. We run `train_step` feeding in the batch data to replace +the `placeholder`s. + +Using small batches of random data is called stochastic training -- in +this case, stochastic gradient descent. Ideally, we'd like to use all our data +for every step of training because that would give us a better sense of what +we should be doing, but that's expensive. So, instead, we use a different subset +every time. Doing this is cheap and has much of the same benefit. + + + +## Evaluating Our Model + +How well does our model do? + +Well, first let's figure out where we predicted the correct label. `tf.argmax` +is an extremely useful function which gives you the index of the highest entry +in a tensor along some axis. For example, `tf.argmax(y,1)` is the label our +model thinks is most likely for each input, while `tf.argmax(y_,1)` is the +correct label. We can use `tf.equal` to check if our prediction matches the +truth. + +```python +correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1)) +``` + +That gives us a list of booleans. To determine what fraction are correct, we +cast to floating point numbers and then take the mean. For example, +`[True, False, True, True]` would become `[1,0,1,1]` which would become `0.75`. + +```python +accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) +``` + +Finally, we ask for our accuracy on our test data. 
+ +```python +print accuracy.eval({x: mnist.test.images, y_: mnist.test.labels}) +``` + +This should be about 91%. + +Is that good? Well, not really. In fact, it's pretty bad. This is because we're +using a very simple model. With some small changes, we can get to +97%. The best models can get to over 99.7% accuracy! (For more information, have +a look at this +[list of results](http://rodrigob.github.io/are_we_there_yet/build/classification_datasets_results.html).) + +What matters is that we learned from this model. Still, if you're feeling a bit +down about these results, check out [the next tutorial](../../index.md) where we +do a lot better, and learn how to build more sophisticated models using +TensorFlow! diff --git a/tensorflow/g3doc/tutorials/mnist/download/index.md b/tensorflow/g3doc/tutorials/mnist/download/index.md new file mode 100644 index 0000000000..dc11e727d8 --- /dev/null +++ b/tensorflow/g3doc/tutorials/mnist/download/index.md @@ -0,0 +1,85 @@ +# Downloading MNIST + +Code: [tensorflow/g3doc/tutorials/mnist/](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/g3doc/tutorials/mnist/) + +The goal of this tutorial is to show how to download the dataset files required +for handwritten digit classification using the (classic) MNIST data set. + +## Tutorial Files + +This tutorial references the following files: + +File | Purpose +--- | --- +[`input_data.py`](../input_data.py) | The code to download the MNIST dataset for training and evaluation. + +## Prepare the Data + +MNIST is a classic problem in machine learning. The problem is to look at +greyscale 28x28 pixel images of handwritten digits and determine which digit +the image represents, for all the digits from zero to nine. + +![MNIST Digits](../tf/mnist_digits.png "MNIST Digits") + +For more information, refer to [Yann LeCun's MNIST page](http://yann.lecun.com/exdb/mnist/) +or [Chris Olah's visualizations of MNIST](http://colah.github.io/posts/2014-10-Visualizing-MNIST/). 
+ +### Download + +[Yann LeCun's MNIST page](http://yann.lecun.com/exdb/mnist/) +also hosts the training and test data for download. + +File | Purpose +--- | --- +[`train-images-idx3-ubyte.gz`](http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz) | training set images - 55000 training images, 5000 validation images +[`train-labels-idx1-ubyte.gz`](http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz) | training set labels matching the images +[`t10k-images-idx3-ubyte.gz`](http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz) | test set images - 10000 images +[`t10k-labels-idx1-ubyte.gz`](http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz) | test set labels matching the images + +In the `input_data.py` file, the `maybe_download()` function will ensure these +files are downloaded into a local data folder for training. + +The folder name is specified in a flag variable at the top of the +`fully_connected_feed.py` file and may be changed to fit your needs. + +### Unpack and Reshape + +The files themselves are not in any standard image format and are manually +unpacked (following the instructions available at the website) by the +`extract_images()` and `extract_labels()` functions in `input_data.py`. + +The image data is extracted into a 2d tensor of: `[image index, pixel index]` +where each entry is the intensity value of a specific pixel in a specific +image, rescaled from `[0, 255]` to `[0.0, 1.0]`. The "image index" corresponds +to an image in the dataset, counting up from zero to the size of the dataset. +And the "pixel index" corresponds to a specific pixel in that image, ranging +from zero to the number of pixels in the image. + +The 60000 examples in the `train-*` files are then split into 55000 examples +for training and 5000 examples for validation. For all of the 28x28 +pixel greyscale images in the datasets the image size is 784 and so the output +tensor for the training set images is of shape `[55000, 784]`. 
+ +The label data is extracted into a 1d tensor of: `[image index]` +with the class identifier for each example as the value. For the training set +labels, this would then be of shape `[55000]`. + +### DataSet Object + +The underlying code will download, unpack, and reshape images and labels for +the following datasets: + +Dataset | Purpose +--- | --- +`data_sets.train` | 55000 images and labels, for primary training. +`data_sets.validation` | 5000 images and labels, for iterative validation of training accuracy. +`data_sets.test` | 10000 images and labels, for final testing of trained accuracy. + +The `read_data_sets()` function will return an object whose `train`, +`validation`, and `test` attributes each hold a `DataSet` +instance for one of these three sets of data. The `DataSet.next_batch()` +method can be used to fetch a tuple consisting of `batch_size` lists of images +and labels to be fed into the running TensorFlow session. + +```python +images_feed, labels_feed = data_set.next_batch(FLAGS.batch_size) +``` diff --git a/tensorflow/g3doc/tutorials/mnist/fully_connected_feed.py b/tensorflow/g3doc/tutorials/mnist/fully_connected_feed.py new file mode 100644 index 0000000000..618c8f47cb --- /dev/null +++ b/tensorflow/g3doc/tutorials/mnist/fully_connected_feed.py @@ -0,0 +1,219 @@ +"""Trains and Evaluates the MNIST network using a feed dictionary. + +TensorFlow install instructions: +https://tensorflow.org/get_started/os_setup.html + +MNIST tutorial: +https://tensorflow.org/tutorials/mnist/tf/index.html + +""" +# pylint: disable=missing-docstring +import os.path +import time + +import tensorflow.python.platform +import numpy +import tensorflow as tf + +from tensorflow.g3doc.tutorials.mnist import input_data +from tensorflow.g3doc.tutorials.mnist import mnist + + +# Basic model parameters as external flags. 
+flags = tf.app.flags +FLAGS = flags.FLAGS +flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.') +flags.DEFINE_integer('max_steps', 2000, 'Number of steps to run trainer.') +flags.DEFINE_integer('hidden1', 128, 'Number of units in hidden layer 1.') +flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.') +flags.DEFINE_integer('batch_size', 100, 'Batch size. ' + 'Must divide evenly into the dataset sizes.') +flags.DEFINE_string('train_dir', 'data', 'Directory to put the training data.') +flags.DEFINE_boolean('fake_data', False, 'If true, uses fake data ' + 'for unit testing.') + + +def placeholder_inputs(batch_size): + """Generate placeholder variables to represent the the input tensors. + + These placeholders are used as inputs by the rest of the model building + code and will be fed from the downloaded data in the .run() loop, below. + + Args: + batch_size: The batch size will be baked into both placeholders. + + Returns: + images_placeholder: Images placeholder. + labels_placeholder: Labels placeholder. + """ + # Note that the shapes of the placeholders match the shapes of the full + # image and label tensors, except the first dimension is now batch_size + # rather than the full size of the train or test data sets. + images_placeholder = tf.placeholder(tf.float32, shape=(batch_size, + mnist.IMAGE_PIXELS)) + labels_placeholder = tf.placeholder(tf.int32, shape=(batch_size)) + return images_placeholder, labels_placeholder + + +def fill_feed_dict(data_set, images_pl, labels_pl): + """Fills the feed_dict for training the given step. + + A feed_dict takes the form of: + feed_dict = { + <placeholder>: <tensor of values to be passed for placeholder>, + .... + } + + Args: + data_set: The set of images and labels, from input_data.read_data_sets() + images_pl: The images placeholder, from placeholder_inputs(). + labels_pl: The labels placeholder, from placeholder_inputs(). 
+ + Returns: + feed_dict: The feed dictionary mapping from placeholders to values. + """ + # Create the feed_dict for the placeholders filled with the next + # `batch size ` examples. + images_feed, labels_feed = data_set.next_batch(FLAGS.batch_size, + FLAGS.fake_data) + feed_dict = { + images_pl: images_feed, + labels_pl: labels_feed, + } + return feed_dict + + +def do_eval(sess, + eval_correct, + images_placeholder, + labels_placeholder, + data_set): + """Runs one evaluation against the full epoch of data. + + Args: + sess: The session in which the model has been trained. + eval_correct: The Tensor that returns the number of correct predictions. + images_placeholder: The images placeholder. + labels_placeholder: The labels placeholder. + data_set: The set of images and labels to evaluate, from + input_data.read_data_sets(). + """ + # And run one epoch of eval. + true_count = 0 # Counts the number of correct predictions. + steps_per_epoch = int(data_set.num_examples / FLAGS.batch_size) + num_examples = steps_per_epoch * FLAGS.batch_size + for step in xrange(steps_per_epoch): + feed_dict = fill_feed_dict(data_set, + images_placeholder, + labels_placeholder) + true_count += sess.run(eval_correct, feed_dict=feed_dict) + precision = float(true_count) / float(num_examples) + print ' Num examples: %d Num correct: %d Precision @ 1: %0.04f' % ( + num_examples, true_count, precision) + + +def run_training(): + """Train MNIST for a number of steps.""" + # Get the sets of images and labels for training, validation, and + # test on MNIST. + data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data) + + # Tell TensorFlow that the model will be built into the default Graph. + with tf.Graph().as_default(): + # Generate placeholders for the images and labels. + images_placeholder, labels_placeholder = placeholder_inputs( + FLAGS.batch_size) + + # Build a Graph that computes predictions from the inference model. 
+ logits = mnist.inference(images_placeholder, + FLAGS.hidden1, + FLAGS.hidden2) + + # Add to the Graph the Ops for loss calculation. + loss = mnist.loss(logits, labels_placeholder) + + # Add to the Graph the Ops that calculate and apply gradients. + train_op = mnist.training(loss, FLAGS.learning_rate) + + # Add the Op to compare the logits to the labels during evaluation. + eval_correct = mnist.evaluation(logits, labels_placeholder) + + # Build the summary operation based on the TF collection of Summaries. + summary_op = tf.merge_all_summaries() + + # Create a saver for writing training checkpoints. + saver = tf.train.Saver() + + # Create a session for running Ops on the Graph. + sess = tf.Session() + + # Run the Op to initialize the variables. + init = tf.initialize_all_variables() + sess.run(init) + + # Instantiate a SummaryWriter to output summaries and the Graph. + summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, + graph_def=sess.graph_def) + + # And then after everything is built, start the training loop. + for step in xrange(FLAGS.max_steps): + start_time = time.time() + + # Fill a feed dictionary with the actual set of images and labels + # for this particular training step. + feed_dict = fill_feed_dict(data_sets.train, + images_placeholder, + labels_placeholder) + + # Run one step of the model. The return values are the activations + # from the `train_op` (which is discarded) and the `loss` Op. To + # inspect the values of your Ops or variables, you may include them + # in the list passed to sess.run() and the value tensors will be + # returned in the tuple from the call. + _, loss_value = sess.run([train_op, loss], + feed_dict=feed_dict) + + duration = time.time() - start_time + + # Write the summaries and print an overview fairly often. + if step % 100 == 0: + # Print status to stdout. + print 'Step %d: loss = %.2f (%.3f sec)' % (step, + loss_value, + duration) + # Update the events file. 
+ summary_str = sess.run(summary_op, feed_dict=feed_dict) + summary_writer.add_summary(summary_str, step) + + # Save a checkpoint and evaluate the model periodically. + if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps: + saver.save(sess, FLAGS.train_dir, global_step=step) + # Evaluate against the training set. + print 'Training Data Eval:' + do_eval(sess, + eval_correct, + images_placeholder, + labels_placeholder, + data_sets.train) + # Evaluate against the validation set. + print 'Validation Data Eval:' + do_eval(sess, + eval_correct, + images_placeholder, + labels_placeholder, + data_sets.validation) + # Evaluate against the test set. + print 'Test Data Eval:' + do_eval(sess, + eval_correct, + images_placeholder, + labels_placeholder, + data_sets.test) + + +def main(_): + run_training() + + +if __name__ == '__main__': + tf.app.run() diff --git a/tensorflow/g3doc/tutorials/mnist/input_data.py b/tensorflow/g3doc/tutorials/mnist/input_data.py new file mode 100644 index 0000000000..88892027ff --- /dev/null +++ b/tensorflow/g3doc/tutorials/mnist/input_data.py @@ -0,0 +1,175 @@ +"""Functions for downloading and reading MNIST data.""" +import gzip +import os +import urllib + +import numpy + +SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' + + +def maybe_download(filename, work_directory): + """Download the data from Yann's website, unless it's already here.""" + if not os.path.exists(work_directory): + os.mkdir(work_directory) + filepath = os.path.join(work_directory, filename) + if not os.path.exists(filepath): + filepath, _ = urllib.urlretrieve(SOURCE_URL + filename, filepath) + statinfo = os.stat(filepath) + print 'Succesfully downloaded', filename, statinfo.st_size, 'bytes.' 
+ return filepath + + +def _read32(bytestream): + dt = numpy.dtype(numpy.uint32).newbyteorder('>') + return numpy.frombuffer(bytestream.read(4), dtype=dt) + + +def extract_images(filename): + """Extract the images into a 4D uint8 numpy array [index, y, x, depth].""" + print 'Extracting', filename + with gzip.open(filename) as bytestream: + magic = _read32(bytestream) + if magic != 2051: + raise ValueError( + 'Invalid magic number %d in MNIST image file: %s' % + (magic, filename)) + num_images = _read32(bytestream) + rows = _read32(bytestream) + cols = _read32(bytestream) + buf = bytestream.read(rows * cols * num_images) + data = numpy.frombuffer(buf, dtype=numpy.uint8) + data = data.reshape(num_images, rows, cols, 1) + return data + + +def dense_to_one_hot(labels_dense, num_classes=10): + """Convert class labels from scalars to one-hot vectors.""" + num_labels = labels_dense.shape[0] + index_offset = numpy.arange(num_labels) * num_classes + labels_one_hot = numpy.zeros((num_labels, num_classes)) + labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 + return labels_one_hot + + +def extract_labels(filename, one_hot=False): + """Extract the labels into a 1D uint8 numpy array [index].""" + print 'Extracting', filename + with gzip.open(filename) as bytestream: + magic = _read32(bytestream) + if magic != 2049: + raise ValueError( + 'Invalid magic number %d in MNIST label file: %s' % + (magic, filename)) + num_items = _read32(bytestream) + buf = bytestream.read(num_items) + labels = numpy.frombuffer(buf, dtype=numpy.uint8) + if one_hot: + return dense_to_one_hot(labels) + return labels + + +class DataSet(object): + + def __init__(self, images, labels, fake_data=False): + if fake_data: + self._num_examples = 10000 + else: + assert images.shape[0] == labels.shape[0], ( + "images.shape: %s labels.shape: %s" % (images.shape, + labels.shape)) + self._num_examples = images.shape[0] + + # Convert shape from [num examples, rows, columns, depth] + # to [num examples, 
rows*columns] (assuming depth == 1) + assert images.shape[3] == 1 + images = images.reshape(images.shape[0], + images.shape[1] * images.shape[2]) + # Convert from [0, 255] -> [0.0, 1.0]. + images = images.astype(numpy.float32) + images = numpy.multiply(images, 1.0 / 255.0) + self._images = images + self._labels = labels + self._epochs_completed = 0 + self._index_in_epoch = 0 + + @property + def images(self): + return self._images + + @property + def labels(self): + return self._labels + + @property + def num_examples(self): + return self._num_examples + + @property + def epochs_completed(self): + return self._epochs_completed + + def next_batch(self, batch_size, fake_data=False): + """Return the next `batch_size` examples from this data set.""" + if fake_data: + fake_image = [1.0 for _ in xrange(784)] + fake_label = 0 + return [fake_image for _ in xrange(batch_size)], [ + fake_label for _ in xrange(batch_size)] + start = self._index_in_epoch + self._index_in_epoch += batch_size + if self._index_in_epoch > self._num_examples: + # Finished epoch + self._epochs_completed += 1 + # Shuffle the data + perm = numpy.arange(self._num_examples) + numpy.random.shuffle(perm) + self._images = self._images[perm] + self._labels = self._labels[perm] + # Start next epoch + start = 0 + self._index_in_epoch = batch_size + assert batch_size <= self._num_examples + end = self._index_in_epoch + return self._images[start:end], self._labels[start:end] + + +def read_data_sets(train_dir, fake_data=False, one_hot=False): + class DataSets(object): + pass + data_sets = DataSets() + + if fake_data: + data_sets.train = DataSet([], [], fake_data=True) + data_sets.validation = DataSet([], [], fake_data=True) + data_sets.test = DataSet([], [], fake_data=True) + return data_sets + + TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' + TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' + TEST_IMAGES = 't10k-images-idx3-ubyte.gz' + TEST_LABELS = 't10k-labels-idx1-ubyte.gz' + VALIDATION_SIZE = 5000 + + local_file = 
maybe_download(TRAIN_IMAGES, train_dir) + train_images = extract_images(local_file) + + local_file = maybe_download(TRAIN_LABELS, train_dir) + train_labels = extract_labels(local_file, one_hot=one_hot) + + local_file = maybe_download(TEST_IMAGES, train_dir) + test_images = extract_images(local_file) + + local_file = maybe_download(TEST_LABELS, train_dir) + test_labels = extract_labels(local_file, one_hot=one_hot) + + validation_images = train_images[:VALIDATION_SIZE] + validation_labels = train_labels[:VALIDATION_SIZE] + train_images = train_images[VALIDATION_SIZE:] + train_labels = train_labels[VALIDATION_SIZE:] + + data_sets.train = DataSet(train_images, train_labels) + data_sets.validation = DataSet(validation_images, validation_labels) + data_sets.test = DataSet(test_images, test_labels) + + return data_sets diff --git a/tensorflow/g3doc/tutorials/mnist/mnist.py b/tensorflow/g3doc/tutorials/mnist/mnist.py new file mode 100644 index 0000000000..acf4d01dd1 --- /dev/null +++ b/tensorflow/g3doc/tutorials/mnist/mnist.py @@ -0,0 +1,148 @@ +"""Builds the MNIST network. + +Implements the inference/loss/training pattern for model building. + +1. inference() - Builds the model as far as is required for running the network +forward to make predictions. +2. loss() - Adds to the inference model the layers required to generate loss. +3. training() - Adds to the loss model the Ops required to generate and +apply gradients. + +This file is used by the various "fully_connected_*.py" files and not meant to +be run. + +TensorFlow install instructions: +https://tensorflow.org/get_started/os_setup.html + +MNIST tutorial: +https://tensorflow.org/tutorials/mnist/tf/index.html +""" +import math + +import tensorflow.python.platform +import tensorflow as tf + +# The MNIST dataset has 10 classes, representing the digits 0 through 9. +NUM_CLASSES = 10 + +# The MNIST images are always 28x28 pixels. 
def inference(images, hidden1_units, hidden2_units):
    """Build the MNIST model up to where it may be used for inference.

    The network is two fully connected ReLU layers followed by a linear
    layer that produces one logit per class. Each layer creates its
    variables under its own name scope ('hidden1', 'hidden2',
    'softmax_linear').

    Args:
      images: Images placeholder, from inputs(). It is multiplied by a
        weight matrix of shape [IMAGE_PIXELS, hidden1_units].
      hidden1_units: Size of the first hidden layer.
      hidden2_units: Size of the second hidden layer.

    Returns:
      softmax_linear: Output tensor with the computed logits.
    """
    # Hidden 1
    with tf.name_scope('hidden1'):
        # Weights start as truncated-normal noise whose stddev shrinks with
        # the number of inputs feeding the layer; biases start at zero.
        weights = tf.Variable(
            tf.truncated_normal([IMAGE_PIXELS, hidden1_units],
                                stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),
            name='weights')
        biases = tf.Variable(tf.zeros([hidden1_units]),
                             name='biases')
        hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
    # Hidden 2
    with tf.name_scope('hidden2'):
        weights = tf.Variable(
            tf.truncated_normal([hidden1_units, hidden2_units],
                                stddev=1.0 / math.sqrt(float(hidden1_units))),
            name='weights')
        biases = tf.Variable(tf.zeros([hidden2_units]),
                             name='biases')
        hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
    # Linear
    with tf.name_scope('softmax_linear'):
        weights = tf.Variable(
            tf.truncated_normal([hidden2_units, NUM_CLASSES],
                                stddev=1.0 / math.sqrt(float(hidden2_units))),
            name='weights')
        biases = tf.Variable(tf.zeros([NUM_CLASSES]),
                             name='biases')
        # No activation here: the raw logits are returned.
        logits = tf.matmul(hidden2, weights) + biases
    return logits
def training(loss, learning_rate):
    """Build the Op that performs one gradient-descent training step.

    A scalar summary of the loss is registered so its value can be tracked
    over time in TensorBoard. A gradient descent optimizer is then asked to
    minimize the loss, which also increments a non-trainable `global_step`
    counter on every step.

    The returned Op is what must be passed to `sess.run()` to make the
    model train.

    Args:
      loss: Loss tensor, from loss().
      learning_rate: The learning rate to use for gradient descent.

    Returns:
      train_op: The Op for training.
    """
    # Snapshot the loss under the name of the op that produces it.
    tf.scalar_summary(loss.op.name, loss)
    # Plain gradient descent at the requested learning rate.
    sgd = tf.train.GradientDescentOptimizer(learning_rate)
    # Step counter; created with trainable=False and handed to minimize()
    # so it is incremented as part of the training step.
    step_counter = tf.Variable(0, name='global_step', trainable=False)
    # Computing and applying the gradients (plus the counter increment) is
    # a single Op.
    return sgd.minimize(loss, global_step=step_counter)
"""A very simple MNIST classifier.

Trains a single-layer softmax regression on MNIST and prints its accuracy on
the test set. See the beginners' tutorial in
tensorflow/g3doc/tutorials/mnist/beginners/index.md.
"""

import tensorflow as tf

import input_data

# Import data (one-hot labels, as required by the cross-entropy below).
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

sess = tf.InteractiveSession()

# Create the model: y = softmax(x W + b) over 784 pixels and 10 classes.
x = tf.placeholder("float", [None, 784])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.nn.softmax(tf.matmul(x, W) + b)

# Define loss and optimizer
y_ = tf.placeholder("float", [None, 10])
# Cross-entropy summed over every example and class in the batch.
# NOTE(review): taking tf.log of the softmax output looks numerically
# fragile if a probability reaches 0 -- confirm whether a fused
# softmax/cross-entropy op should be used instead.
cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

# Train
tf.initialize_all_variables().run()
for i in range(1000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    train_step.run({x: batch_xs, y_: batch_ys})

# Test trained model: fraction of test images whose most likely predicted
# class matches the true class.
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(accuracy.eval({x: mnist.test.images, y_: mnist.test.labels}))
+One of the tasks at which it excels is implementing and training deep neural +networks. +In this tutorial we will learn the basic building blocks of a TensorFlow model +while constructing a deep convolutional MNIST classifier. + +*This introduction assumes familiarity with neural networks and the MNIST +dataset. If you don't have +a background with them, check out the +[introduction for beginners](../beginners/index.md).* + +## Setup + +Before we create our model, we will first load the MNIST dataset, and start a +TensorFlow session. + +### Load MNIST Data + +For your convenience, we've included [a script](../input_data.py) which +automatically downloads and imports the MNIST dataset. It will create a +directory `'MNIST_data'` in which to store the data files. + +```python +import input_data +mnist = input_data.read_data_sets('MNIST_data', one_hot=True) +``` + +Here `mnist` is a lightweight class which stores the training, validation, and +testing sets as NumPy arrays. +It also provides a function for iterating through data minibatches, which we +will use below. + +### Start TensorFlow Session + +Tensorflow relies on a highly efficient C++ backend to do its computation. The +connection to this backend is called a session. We will need to create a session +before we can do any computation. + +```python +import tensorflow as tf +sess = tf.InteractiveSession() +``` + +Using an `InteractiveSession` makes TensorFlow more flexible about how you +structure your code. +It allows you to interleave operations which build a +[computation graph](../../../get_started/basic_usage.md#the-computation-graph) +with ones that run the graph. +This is particularly convenient when working in interactive contexts like +iPython. +If you are not using an `InteractiveSession`, then you should build +the entire computation graph before starting a session and [launching the +graph](../../../get_started/basic_usage.md#launching-the-graph-in-a-session). 
+ +#### Computation Graph + +To do efficient numerical computing in Python, we typically use libraries like +NumPy that do expensive operations such as matrix multiplication outside Python, +using highly efficient code implemented in another language. +Unfortunately, there can still be a lot of overhead from switching back to +Python every operation. This overhead is especially bad if you want to run +computations on GPUs or in a distributed manner, where there can be a high cost +to transferring data. + +TensorFlow also does its heavy lifting outside Python, +but it takes things a step further to avoid this overhead. +Instead of running a single expensive operation independently +from Python, TensorFlow lets us describe a graph of interacting operations that +run entirely outside Python. +This approach is similar to that used in Theano or Torch. + +The role of the Python code is therefore to build this external computation +graph, and to dictate which parts of the computation graph should be run. See +the +[Computation Graph](../../../get_started/basic_usage.md#the-computation-graph) +section of +[Basic Usage](../../../get_started/basic_usage.md) +for more detail. + +## Build a Softmax Regression Model + +In this section we will build a softmax regression model with a single linear +layer. In the next section, we will extend this to the case of softmax +regression with a multilayer convolutional network. + +### Placeholders + +We start building the computation graph by creating nodes for the +input images and target output classes. + +```python +x = tf.placeholder("float", shape=[None, 784]) +y_ = tf.placeholder("float", shape=[None, 10]) +``` + +Here `x` and `y_` aren't specific values. Rather, they are each a `placeholder` +-- a value that we'll input when we ask TensorFlow to run a computation. + +The input images `x` will consist of a 2d tensor of floating point numbers. 
+Here we assign it a `shape` of `[None, 784]`, where `784` is the dimensionality of +a single flattened MNIST image, and `None` indicates that the first dimension, +corresponding to the batch size, can be of any size. +The target output classes `y_` will also consist of a 2d tensor, +where each row is a one-hot 10-dimensional vector indicating +which digit class the corresponding MNIST image belongs to. + +The `shape` argument to `placeholder` is optional, but it allows TensorFlow +to automatically catch bugs stemming from inconsistent tensor shapes. + +### Variables + +We now define the weights `W` and biases `b` for our model. We could imagine treating +these like additional inputs, but TensorFlow has an even better way to handle +them: `Variable`. +A `Variable` is a value that lives in TensorFlow's computation graph. +It can be used and even modified by the computation. In machine +learning applications, one generally has the model parameters be `Variable`s. + +```python +W = tf.Variable(tf.zeros([784,10])) +b = tf.Variable(tf.zeros([10])) +``` + +We pass the initial value for each parameter in the call to `tf.Variable`. +In this case, we initialize both `W` and `b` as tensors full of +zeros. `W` is a 784x10 matrix (because we have 784 input features +and 10 outputs) and `b` is a 10-dimensional vector (because we have 10 classes). + +Before `Variable`s can be used within a session, they must be initialized using +that session. +This step takes the initial values (in this case tensors full of zeros) that +have already been specified, and assigns them to each `Variable`. This can be +done for all `Variables` at once. + +```python +sess.run(tf.initialize_all_variables()) +``` + +### Predicted Class and Cost Function + +We can now implement our regression model. It only takes one line! +We multiply the vectorized input images `x` by the weight matrix `W`, add +the bias `b`, and compute the softmax probabilities that are assigned to each +class.
+ +```python +y = tf.nn.softmax(tf.matmul(x,W) + b) +``` + +The cost function to be minimized during training can be specified just as +easily. Our cost function will be the cross-entropy between the target and the +model's prediction. + +```python +cross_entropy = -tf.reduce_sum(y_*tf.log(y)) +``` + +Note that `tf.reduce_sum` sums across all images in the minibatch, as well as +all classes. We are computing the cross entropy for the entire minibatch. + +## Train the Model + +Now that we have defined our model and training cost function, it is +straightforward to train using TensorFlow. +Because TensorFlow knows the entire computation graph, it +can use automatic differentiation to find the gradients of the cost with +respect to each of the variables. +TensorFlow has a variety of +[builtin optimization algorithms](../../../api_docs/python/train.md?#optimizers). +For this example, we will use steepest gradient descent, with a step length of +0.01, to descend the cross entropy. + +```python +train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy) +``` + +What TensorFlow actually did in that single line was to add new operations to +the computation graph. These operations included ones to compute gradients, +compute parameter update steps, and apply update steps to the parameters. + +The returned operation `train_step`, when run, will apply the gradient +descent updates to the parameters. Training the model can therefore be +accomplished by repeatedly running `train_step`. + +```python +for i in range(1000): + batch = mnist.train.next_batch(50) + train_step.run(feed_dict={x: batch[0], y_: batch[1]}) +``` + +In each training iteration we load 50 training examples. We then run the +`train_step` operation, using `feed_dict` to replace the `placeholder` tensors +`x` and `y_` with the training examples. +Note that you can replace any tensor in your computation graph using `feed_dict` +-- it's not restricted to just `placeholder`s.
+ +### Evaluate the Model + +How well did our model do? + +First we'll figure out where we predicted the correct label. `tf.argmax` +is an extremely useful function which gives you the index of the highest entry +in a tensor along some axis. For example, `tf.argmax(y,1)` is the label our +model thinks is most likely for each input, while `tf.argmax(y_,1)` is the +true label. We can use `tf.equal` to check if our prediction matches the +truth. + +```python +correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1)) +``` + +That gives us a list of booleans. To determine what fraction are correct, we +cast to floating point numbers and then take the mean. For example, +`[True, False, True, True]` would become `[1,0,1,1]` which would become `0.75`. + +```python +accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) +``` + +Finally, we can evaluate our accuracy on the test data. This should be about +91% correct. + +```python +print accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels}) +``` + +## Build a Multilayer Convolutional Network + +Getting 91% accuracy on MNIST is bad. It's almost embarrassingly bad. In this +section, we'll fix that, jumping from a very simple model to something moderately +sophisticated: a small convolutional neural network. This will get us to around +99.2% accuracy -- not state of the art, but respectable. + +### Weight Initialization + +To create this model, we're going to need to create a lot of weights and biases. +One should generally initialize weights with a small amount of noise for +symmetry breaking, and to prevent 0 gradients. Since we're using ReLU neurons, +it is also good practice to initialize them with a slightly positive initial +bias to avoid "dead neurons." Instead of doing this repeatedly while we build +the model, let's create two handy functions to do it for us.
+ +```python +def weight_variable(shape): + initial = tf.truncated_normal(shape, stddev=0.1) + return tf.Variable(initial) + +def bias_variable(shape): + initial = tf.constant(0.1, shape=shape) + return tf.Variable(initial) +``` + +### Convolution and Pooling + +TensorFlow also gives us a lot of flexibility in convolution and pooling +operations. How do we handle the boundaries? What is our stride size? +In this example, we're always going to choose the vanilla version. +Our convolutions use a stride of one and are zero padded so that the +output is the same size as the input. Our pooling is plain old max pooling +over 2x2 blocks. To keep our code cleaner, let's also abstract those operations +into functions. + +```python +def conv2d(x, W): + return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') + +def max_pool_2x2(x): + return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], padding='SAME') +``` + +### First Convolutional Layer + +We can now implement our first layer. It will consist of convolution, followed +by max pooling. The convolution will compute 32 features for each 5x5 patch. +Its weight tensor will have a shape of `[5, 5, 1, 32]`. The first two +dimensions are the patch size, the next is the number of input channels, and +the last is the number of output channels. We will also have a bias vector with +a component for each output channel. + +```python +W_conv1 = weight_variable([5, 5, 1, 32]) +b_conv1 = bias_variable([32]) +``` + +To apply the layer, we first reshape `x` to a 4d tensor, with the second and +third dimensions corresponding to image width and height, and the final +dimension corresponding to the number of color channels. + +```python +x_image = tf.reshape(x, [-1,28,28,1]) +``` + +We then convolve `x_image` with the weight tensor, add the +bias, apply the ReLU function, and finally max pool.
+ +```python +h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) +h_pool1 = max_pool_2x2(h_conv1) +``` + +### Second Convolutional Layer + +In order to build a deep network, we stack several layers of this type. The +second layer will have 64 features for each 5x5 patch. + +```python +W_conv2 = weight_variable([5, 5, 32, 64]) +b_conv2 = bias_variable([64]) + +h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) +h_pool2 = max_pool_2x2(h_conv2) +``` + +### Densely Connected Layer + +Now that the image size has been reduced to 7x7, we add a fully-connected layer +with 1024 neurons to allow processing on the entire image. We reshape the tensor +from the pooling layer into a batch of vectors, +multiply by a weight matrix, add a bias, and apply a ReLU. + +```python +W_fc1 = weight_variable([7 * 7 * 64, 1024]) +b_fc1 = bias_variable([1024]) + +h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64]) +h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) +``` + +#### Dropout + +To reduce overfitting, we will apply dropout before the readout layer. +We create a `placeholder` for the probability that a neuron's output is kept +during dropout. This allows us to turn dropout on during training, and turn it +off during testing. +TensorFlow's `tf.nn.dropout` op automatically handles scaling neuron outputs in +addition to masking them, so dropout just works without any additional scaling. + +```python +keep_prob = tf.placeholder("float") +h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) +``` + +### Readout Layer + +Finally, we add a softmax layer, just like for the one layer softmax regression +above. + +```python +W_fc2 = weight_variable([1024, 10]) +b_fc2 = bias_variable([10]) + +y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2) +``` + +### Train and Evaluate the Model + +How well does this model do? +To train and evaluate it we will use code that is nearly identical to that for +the simple one layer SoftMax network above. 
+The differences are that: we will replace the steepest gradient descent +optimizer with the more sophisticated ADAM optimizer; we will include the +additional parameter `keep_prob` in `feed_dict` to control the dropout rate; +and we will add logging to every 100th iteration in the training process. + +```python +cross_entropy = -tf.reduce_sum(y_*tf.log(y_conv)) +train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy) +correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1)) +accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) +sess.run(tf.initialize_all_variables()) +for i in range(20000): + batch = mnist.train.next_batch(50) + if i%100 == 0: + train_accuracy = accuracy.eval(feed_dict={ + x:batch[0], y_: batch[1], keep_prob: 1.0}) + print "step %d, training accuracy %g"%(i, train_accuracy) + train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5}) + +print "test accuracy %g"%accuracy.eval(feed_dict={ + x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}) +``` + +The final test set accuracy after running this code should be approximately 99.2%. + +We have learned how to quickly and easily build, train, and evaluate a +fairly sophisticated deep learning model using TensorFlow. diff --git a/tensorflow/g3doc/tutorials/mnist/tf/index.md b/tensorflow/g3doc/tutorials/mnist/tf/index.md new file mode 100644 index 0000000000..86f3296287 --- /dev/null +++ b/tensorflow/g3doc/tutorials/mnist/tf/index.md @@ -0,0 +1,513 @@ +# Handwritten Digit Classification + +Code: [tensorflow/g3doc/tutorials/mnist/](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/g3doc/tutorials/mnist/) + +The goal of this tutorial is to show how to use TensorFlow to train and +evaluate a simple feed-forward neural network for handwritten digit +classification using the (classic) MNIST data set. The intended audience for +this tutorial is experienced machine learning users interested in using +TensorFlow. 
+ +These tutorials are not intended for teaching Machine Learning in general. + +Please ensure you have followed the instructions to [`Install TensorFlow`](../../../get_started/os_setup.md). + +## Tutorial Files + +This tutorial references the following files: + +File | Purpose +--- | --- +[`mnist.py`](../mnist.py) | The code to build a fully-connected MNIST model. +[`fully_connected_feed.py`](../fully_connected_feed.py) | The main code, to train the built MNIST model against the downloaded dataset using a feed dictionary. + +Simply run the `fully_connected_feed.py` file directly to start training: + +`python fully_connected_feed.py` + +## Prepare the Data + +MNIST is a classic problem in machine learning. The problem is to look at +greyscale 28x28 pixel images of handwritten digits and determine which digit +the image represents, for all the digits from zero to nine. + +![MNIST Digits](./mnist_digits.png "MNIST Digits") + +For more information, refer to [Yann LeCun's MNIST page](http://yann.lecun.com/exdb/mnist/) +or [Chris Olah's visualizations of MNIST](http://colah.github.io/posts/2014-10-Visualizing-MNIST/). + +### Download + +At the top of the `run_training()` method, the `input_data.read_data_sets()` +function will ensure that the correct data has been downloaded to your local +training folder and then unpack that data to return a dictionary of `DataSet` +instances. + +```python +data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data) +``` + +**NOTE**: The `fake_data` flag is used for unit-testing purposes and may be +safely ignored by the reader. + +Dataset | Purpose +--- | --- +`data_sets.train` | 55000 images and labels, for primary training. +`data_sets.validation` | 5000 images and labels, for iterative validation of training accuracy. +`data_sets.test` | 10000 images and labels, for final testing of trained accuracy. + +For more information about the data, please read the [`Download`](../download/index.md) +tutorial. 
+ +### Inputs and Placeholders + +The `placeholder_inputs()` function creates two [`tf.placeholder`](../../../api_docs/python/io_ops.md#placeholder) +ops that define the shape of the inputs, including the `batch_size`, to the +rest of the graph and into which the actual training examples will be fed. + +```python +images_placeholder = tf.placeholder(tf.float32, shape=(batch_size, + IMAGE_PIXELS)) +labels_placeholder = tf.placeholder(tf.int32, shape=(batch_size)) +``` + +Further down, in the training loop, the full image and label datasets are +sliced to fit the `batch_size` for each step, matched with these placeholder +ops, and then passed into the `sess.run()` function using the `feed_dict` +parameter. + +## Build the Graph + +After creating placeholders for the data, the graph is built from the +`mnist.py` file according to a 3-stage pattern: `inference()`, `loss()`, and +`training()`. + +1. `inference()` - Builds the graph as far as is required for running +the network forward to make predictions. +1. `loss()` - Adds to the inference graph the ops required to generate +loss. +1. `training()` - Adds to the loss graph the ops required to compute +and apply gradients. + +<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;"> + <img style="width:100%" src="./mnist_subgraph.png"> +</div> + +### Inference + +The `inference()` function builds the graph as far as needed to +return the tensor that would contain the output predictions. + +It takes the images placeholder as input and builds on top +of it a pair of fully connected layers with ReLu activation followed by a ten +node linear layer specifying the output logits. + +Each layer is created beneath a unique [`tf.name_scope`](../../../api_docs/python/framework.md#name_scope) +that acts as a prefix to the items created within that scope. 
+ +```python +with tf.name_scope('hidden1') as scope: +``` + +Within the defined scope, the weights and biases to be used by each of these +layers are generated into [`tf.Variable`](../../../api_docs/python/state_ops.md#Variable) +instances, with their desired shapes: + +```python +weights = tf.Variable( + tf.truncated_normal([IMAGE_PIXELS, hidden1_units], + stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))), + name='weights') +biases = tf.Variable(tf.zeros([hidden1_units]), + name='biases') +``` + +When, for instance, these are created under the `hidden1` scope, the unique +name given to the weights variable would be "`hidden1/weights`". + +Each variable is given initializer ops as part of their construction. + +In this most common case, the weights are initialized with the +[`tf.truncated_normal`](../../../api_docs/python/constant_op.md#truncated_normal) +and given their shape of a 2d tensor with +the first dim representing the number of units in the layer from which the +weights connect and the second dim representing the number of +units in the layer to which the weights connect. For the first layer, named +`hidden1`, the dimensions are `[IMAGE_PIXELS, hidden1_units]` because the +weights are connecting the image inputs to the hidden1 layer. The +`tf.truncated_normal` initializer generates a random distribution with a given +mean and standard deviation. + +Then the biases are initialized with [`tf.zeros`](../../../api_docs/python/constant_op.md#zeros) +to ensure they start with all zero values, and their shape is simply the number +of units in the layer to which they connect. + +The graph's three primary ops -- two [`tf.nn.relu`](../../../api_docs/python/nn.md#relu) +ops wrapping [`tf.matmul`](../../../api_docs/python/math_ops.md#matmul) +for the hidden layers and one extra `tf.matmul` for the logits -- are then +created, each in turn, with their `tf.Variable` instances connected to the +input placeholder or the output tensor of the layer beneath each. 
+ +```python +hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases) +``` + +```python +hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases) +``` + +```python +logits = tf.matmul(hidden2, weights) + biases +``` + +Finally, the `logits` tensor that will contain the output is returned. + +### Loss + +The `loss()` function further builds the graph by adding the required loss +ops. + +First, the values from the label_placeholder are encoded as a tensor of 1-hot +values. For example, if the class identifier is '3' the value is converted to: +<br>`[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]` + +```python +batch_size = tf.size(labels) +labels = tf.expand_dims(labels, 1) +indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) +concated = tf.concat(1, [indices, labels]) +onehot_labels = tf.sparse_to_dense( + concated, tf.pack([batch_size, NUM_CLASSES]), 1.0, 0.0) +``` + +A [`tf.nn.softmax_cross_entropy_with_logits`](../../../api_docs/python/nn.md#softmax_cross_entropy_with_logits) +op is then added to compare the output logits from the `inference()` function +and the 1-hot labels. + +```python +cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, + onehot_labels, + name='xentropy') +``` + +It then uses [`tf.reduce_mean`](../../../api_docs/python/math_ops.md#reduce_mean) +to average the cross entropy values across the batch dimension (the first +dimension) as the total loss. + +```python +loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') +``` + +And the tensor that will then contain the loss value is returned. + +> Note: Cross-entropy is an idea from information theory that allows us +> to describe how bad it is to believe the predictions of the neural network, +> given what is actually true. For more information, read the blog post Visual +> Information Theory (http://colah.github.io/posts/2015-09-Visual-Information/) + +### Training + +The `training()` function adds the operations needed to minimize the loss via +gradient descent. 
+ +Firstly, it takes the loss tensor from the `loss()` function and hands it to a +[`tf.scalar_summary`](../../../api_docs/python/train.md#scalar_summary), +an op for generating summary values into the events file when used with a +`SummaryWriter` (see below). In this case, it will emit the snapshot value of +the loss every time the summaries are written out. + +```python +tf.scalar_summary(loss.op.name, loss) +``` + +Next, we instantiate a [`tf.train.GradientDescentOptimizer`](../../../api_docs/python/train.md#GradientDescentOptimizer) +responsible for applying gradients with the requested learning rate. + +```python +optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate) +``` + +We then generate a single variable to contain a counter for the global +training step and the [`minimize()`](../../../api_docs/python/train.md#Optimizer.minimize) +op is used to both update the trainable weights in the system and increment the +global step. This is, by convention, known as the `train_op` and is what must +be run by a TensorFlow session in order to induce one full step of training +(see below). + +```python +global_step = tf.Variable(0, name='global_step', trainable=False) +train_op = optimizer.minimize(loss, global_step=global_step) +``` + +The tensor containing the outputs of the training op is returned. + +## Train the Model + +Once the graph is built, it can be iteratively trained and evaluated in a loop +controlled by the user code in `fully_connected_feed.py`. + +### The Graph + +At the top of the `run_training()` function is a python `with` command that +indicates all of the built ops are to be associated with the default +global [`tf.Graph`](../../../api_docs/python/framework.md#Graph) +instance. + +```python +with tf.Graph().as_default(): +``` + +A `tf.Graph` is a collection of ops that may be executed together as a group. +Most TensorFlow uses will only need to rely on the single default graph. 
+ +More complicated uses with multiple graphs are possible, but beyond the scope of +this simple tutorial. + +### The Session + +Once all of the build preparation has been completed and all of the necessary +ops generated, a [`tf.Session`](../../../api_docs/python/client.md#Session) +is created for running the graph. + +```python +sess = tf.Session() +``` + +Alternately, a `Session` may be generated into a `with` block for scoping: + +```python +with tf.Session() as sess: +``` + +The empty parameter to session indicates that this code will attach to +(or create if not yet created) the default local session. + +Immediately after creating the session, all of the `tf.Variable` +instances are initialized by calling `sess.run()` on their initialization op. + +```python +init = tf.initialize_all_variables() +sess.run(init) +``` + +The [`sess.run()`](../../../api_docs/python/client.md#Session.run) +method will run the complete subset of the graph that +corresponds to the op(s) passed as parameters. In this first call, the `init` +op is a [`tf.group`](../../../api_docs/python/control_flow_ops.md#group) +that contains only the initializers for the variables. None of the rest of the +graph is run here, that happens in the training loop below. + +### Train Loop + +After initializing the variables with the session, training may begin. + +The user code controls the training per step, and the simplest loop that +can do useful training is: + +```python +for step in xrange(max_steps): + sess.run([train_op]) +``` + +However, this tutorial is slightly more complicated in that it must also slice +up the input data for each step to match the previously generated placeholders. + +#### Feed the Graph + +For each step, the code will generate a feed dictionary that will contain the +set of examples on which to train for the step, keyed by the placeholder +ops they represent. 
+ +In the `fill_feed_dict()` function, the given `DataSet` is queried for its next +`batch_size` set of images and labels, and tensors matching the placeholders are +filled containing the next images and labels. + +```python +images_feed, labels_feed = data_set.next_batch(FLAGS.batch_size) +``` + +A python dictionary object is then generated with the placeholders as keys and +the representative feed tensors as values. + +```python +feed_dict = { + images_placeholder: images_feed, + labels_placeholder: labels_feed, +} +``` + +This is passed into the `sess.run()` function's `feed_dict` parameter to provide +the input examples for this step of training. + +#### Check the Status + +The code specifies two op-tensors in its run call: `[train_op, loss]`: + +```python +for step in xrange(FLAGS.max_steps): + feed_dict = fill_feed_dict(data_sets.train, + images_placeholder, + labels_placeholder) + _, loss_value = sess.run([train_op, loss], + feed_dict=feed_dict) +``` + +Because there are two tensors passed as parameters, the return from +`sess.run()` is a tuple with two items. The returned items are themselves +tensors, filled with the values of the passed op-tensors during this step of +training. + +The value of the `train_op` is actually `None` and, thus, discarded. But the +value of the `loss` tensor may become NaN if the model diverges during training. + +Assuming that the training runs fine without NaNs, the training loop also +prints a simple status text every 100 steps to let the user know the state of +training. + +```python +if step % 100 == 0: + print 'Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration) +``` + +#### Visualize the Status + +In order to emit the events files used by [TensorBoard](../../../how_tos/summaries_and_tensorboard/index.md), +all of the summaries (in this case, only one) are collected into a single op +during the graph building phase. 
+ +```python +summary_op = tf.merge_all_summaries() +``` + +And then after the Session is generated, a [`tf.train.SummaryWriter`](../../../api_docs/python/train.md#SummaryWriter) +may be instantiated to output into the given directory the events files, +containing the Graph itself and the values of the summaries. + +```python +summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, + graph_def=sess.graph_def) +``` + +Lastly, the events file will be updated with new summary values every time the +`summary_op` is run and the output passed to the writer's `add_summary()` +function. + +```python +summary_str = sess.run(summary_op, feed_dict=feed_dict) +summary_writer.add_summary(summary_str, step) +``` + +When the events files are written, TensorBoard may be run against the training +folder to display the values from the summaries. + +![MNIST TensorBoard](./mnist_tensorboard.png "MNIST TensorBoard") + +**NOTE**: For more info about how to build and run TensorBoard, please see the accompanying tutorial [TensorBoard: Visualizing Your Training](../../../how_tos/summaries_and_tensorboard/index.md). + +#### Save a Checkpoint + +In order to emit a checkpoint file that may be used to later restore a model +for further training or evaluation, we instantiate a +[`tf.train.Saver`](../../../api_docs/python/state_ops.md#Saver). + +```python +saver = tf.train.Saver() +``` + +In the training loop, the [`saver.save()`](../../../api_docs/python/state_ops.md#Saver.save) +method will periodically be called to write a checkpoint file to the training +directory with the current values of all the trainable variables. + +```python +saver.save(sess, FLAGS.train_dir, global_step=step) +``` + +At some later point in the future, training might be resumed by using the +[`saver.restore()`](../../../api_docs/python/state_ops.md#Saver.restore) +method to reload the model parameters. 
+ +```python +saver.restore(sess, FLAGS.train_dir) +``` + +## Evaluate the Model + +Every thousand steps, the code will attempt to evaluate the model against both +the training and test datasets. The `do_eval()` function is called thrice, for +the training, validation, and test datasets. + +```python +print 'Training Data Eval:' +do_eval(sess, + eval_correct, + images_placeholder, + labels_placeholder, + data_sets.train) +print 'Validation Data Eval:' +do_eval(sess, + eval_correct, + images_placeholder, + labels_placeholder, + data_sets.validation) +print 'Test Data Eval:' +do_eval(sess, + eval_correct, + images_placeholder, + labels_placeholder, + data_sets.test) +``` + +> Note that more complicated usage would usually sequester the `data_sets.test` +> to only be checked after significant amounts of hyperparameter tuning. For +> the sake of a simple little MNIST problem, however, we evaluate against all of +> the data. + +### Build the Eval Graph + +Before opening the default Graph, the test data should have been fetched by +calling the `get_data(train=False)` function with the parameter set to grab +the test dataset. + +```python +test_all_images, test_all_labels = get_data(train=False) +``` + +Before entering the training loop, the Eval op should have been built +by calling the `evaluation()` function from `mnist.py` with the same +logits/labels parameters as the `loss()` function. + +```python +eval_correct = mnist.evaluation(logits, labels_placeholder) +``` + +The `evaluation()` function simply generates a [`tf.nn.in_top_k`](../../../api_docs/python/nn.md#in_top_k) +op that can automatically score each model output as correct if the true label +can be found in the K most-likely predictions. In this case, we set the value +of K to 1 to only consider a prediction correct if it is for the true label. 
+ +```python +eval_correct = tf.nn.in_top_k(logits, labels, 1) +``` + +### Eval Output + +One can then create a loop for filling a `feed_dict` and calling `sess.run()` +against the `eval_correct` op to evaluate the model on the given dataset. + +```python +for step in xrange(steps_per_epoch): + feed_dict = fill_feed_dict(data_set, + images_placeholder, + labels_placeholder) + true_count += sess.run(eval_correct, feed_dict=feed_dict) +``` + +The `true_count` variable simply accumulates all of the predictions that the +`in_top_k` op has determined to be correct. From there, the precision may be +calculated from simply dividing by the total number of examples. + +```python +precision = float(true_count) / float(num_examples) +print ' Num examples: %d Num correct: %d Precision @ 1: %0.02f' % ( + num_examples, true_count, precision) +``` diff --git a/tensorflow/g3doc/tutorials/pdes/index.md b/tensorflow/g3doc/tutorials/pdes/index.md new file mode 100755 index 0000000000..1f29e4037c --- /dev/null +++ b/tensorflow/g3doc/tutorials/pdes/index.md @@ -0,0 +1,129 @@ + +## Basic Setup + + +``` +#Import libraries for simulation +import tensorflow as tf +import numpy as np + +#Imports for visualization +import PIL.Image +from cStringIO import StringIO +from IPython.display import clear_output, Image, display +``` + + +``` +def DisplayArray(a, fmt='jpeg', rng=[0,1]): + """Display an array as a picture.""" + a = (a - rng[0])/float(rng[1] - rng[0])*255 + a = np.uint8(np.clip(a, 0, 255)) + f = StringIO() + PIL.Image.fromarray(a).save(f, fmt) + display(Image(data=f.getvalue())) +``` + + +``` +sess = tf.InteractiveSession() +``` + +## Computational Convenience Functions + + +``` +def make_kernel(a): + """Transform a 2D array into a convolution kernel""" + a = np.asarray(a) + a = a.reshape(list(a.shape) + [1,1]) + return tf.constant(a, dtype=1) + +def simple_conv(x, k): + """A simplified 2D convolution operation""" + x = tf.expand_dims(tf.expand_dims(x, 0), -1) + y = 
tf.nn.depthwise_conv2d(x, k, [1, 1, 1, 1], padding='SAME') + return y[0, :, :, 0] + +def laplace(x): + """Compute the 2D laplacian of an array""" + laplace_k = make_kernel([[0.5, 1.0, 0.5], + [1.0, -6., 1.0], + [0.5, 1.0, 0.5]]) + return simple_conv(x, laplace_k) +``` + +## Define the PDE + + +``` +N = 500 +``` + + +``` +# Initial Conditions -- some rain drops hit a pond + +# Set everything to zero +u_init = np.zeros([N, N], dtype="float32") +ut_init = np.zeros([N, N], dtype="float32") + +# Some rain drops hit a pond at random points +for n in range(40): + a,b = np.random.randint(0, N, 2) + u_init[a,b] = np.random.uniform() + +DisplayArray(u_init, rng=[-0.1, 0.1]) +``` + + +![jpeg](output_8_0.jpe) + + + +``` +# paramaters +# eps -- time resolution +# damping -- wave damping +eps = tf.placeholder('float', shape=()) +damping = tf.placeholder('float', shape=()) + +# create variables for simulation state +U = tf.Variable(u_init) +Ut = tf.Variable(ut_init) + +# discretized PDE update rules +U_ = U + eps*Ut +Ut_ = Ut + eps*(laplace(U) - damping*Ut) + +# operation to update the state +step = tf.group( + U.Assign(U_), + Ut.Assign(Ut_) ) +``` + +## Run The Simulation + + +``` +# initialize state to initial conditions +tf.InitializeAllVariables().Run() + +# Run 1000 steps of PDE +for i in range(1000): + # Step simulation + step.Run({eps: 0.03, damping: 0.04}) + # Visualize every 50 steps + if i % 50 == 0: + clear_output() + DisplayArray(U.eval(), rng=[-0.1, 0.1]) +``` + + +![jpeg](output_11_0.jpe) + + + +``` + +``` diff --git a/tensorflow/g3doc/tutorials/pdes/output_11_0.jpe b/tensorflow/g3doc/tutorials/pdes/output_11_0.jpe Binary files differnew file mode 100755 index 0000000000..8cd8cf02b5 --- /dev/null +++ b/tensorflow/g3doc/tutorials/pdes/output_11_0.jpe diff --git a/tensorflow/g3doc/tutorials/pdes/output_8_0.jpe b/tensorflow/g3doc/tutorials/pdes/output_8_0.jpe Binary files differnew file mode 100755 index 0000000000..97954effc0 --- /dev/null +++ 
b/tensorflow/g3doc/tutorials/pdes/output_8_0.jpe diff --git a/tensorflow/g3doc/tutorials/recurrent/index.md b/tensorflow/g3doc/tutorials/recurrent/index.md new file mode 100644 index 0000000000..29d058cd5d --- /dev/null +++ b/tensorflow/g3doc/tutorials/recurrent/index.md @@ -0,0 +1,209 @@ +# Recurrent Neural Networks + +## Introduction + +Take a look at +[this great article](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) +for an introduction to recurrent neural networks and LSTMs in particular. + +## Language Modeling + +In this tutorial we will show how to train a recurrent neural network on +a challenging task of language modeling. The goal of the problem is to fit a +probabilistic model which assigns probabilities to sentences. It does so by +predicting next words in a text given a history of previous words. For this +purpose we will use the Penn Tree Bank (PTB) dataset, which is a popular +benchmark for measuring quality of these models, whilst being small and +relatively fast to train. + +Language modeling is key to many interesting problems such as speech +recognition, machine translation, or image captioning. It is also fun, too -- +take a look [here](http://karpathy.github.io/2015/05/21/rnn-effectiveness/). + +For the purpose of this tutorial, we will reproduce the results from +[Zaremba et al., 2014](http://arxiv.org/abs/1409.2329), which achieves very +good results on the PTB dataset. + +## Tutorial Files + +This tutorial references the following files from `models/rnn/ptb`: + +File | Purpose +--- | --- +`ptb_word_lm.py` | The code to train a language model on the PTB dataset. +`reader.py` | The code to read the dataset. 
+ +## Download and Prepare the Data + +The data required for this tutorial is in the data/ directory of the +PTB dataset from Tomas Mikolov's webpage: +http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz + +The dataset is already preprocessed and contains overall 10000 different words, +including the end-of-sentence marker and a special symbol (\<unk\>) for rare +words. We convert all of them in the `reader.py` to unique integer identifiers +to make it easy for the neural network to process. + +## The Model + +### LSTM + +The core of the model consists of an LSTM cell that processes one word at a +time and computes probabilities of the possible continuations of the sentence. +The memory state of the network is initialized with a vector of zeros and gets +updated after reading each word. Also, for computational reasons, we will +process data in mini-batches of size `batch_size`. + +The basic pseudocode looks as follows: + +```python +lstm = rnn_cell.BasicLSTMCell(lstm_size) +# Initial state of the LSTM memory. +state = tf.zeros([batch_size, lstm.state_size]) + +loss = 0.0 +for current_batch_of_words in words_in_dataset: + # The value of state is updated after processing each batch of words. + output, state = lstm(current_batch_of_words, state) + + # The LSTM output can be used to make next word predictions + logits = tf.matmul(output, softmax_w) + softmax_b + probabilities = tf.nn.softmax(logits) + loss += loss_function(probabilities, target_words) +``` + +### Truncated Backpropagation + +In order to make the learning process tractable, it is a common practice to +truncate the gradients for backpropagation to a fixed number (`num_steps`) +of unrolled steps. +This is easy to implement by feeding inputs of length `num_steps` at a time and +doing a backward pass after each iteration. + +A simplified version of the code for the graph creation for truncated +backpropagation: + +```python +# Placeholder for the inputs in a given iteration. 
+words = tf.placeholder(tf.int32, [batch_size, num_steps]) + +lstm = rnn_cell.BasicLSTMCell(lstm_size) +# Initial state of the LSTM memory. +initial_state = state = tf.zeros([batch_size, lstm.state_size]) + +for i in range(num_steps): + # The value of state is updated after processing each batch of words. + output, state = lstm(words[:, i], state) + + # The rest of the code. + # ... + +final_state = state +``` + +And this is how to implement an iteration over the whole dataset: + +```python +# A numpy array holding the state of LSTM after each batch of words. +numpy_state = initial_state.eval() +total_loss = 0.0 +for current_batch_of_words in words_in_dataset: + numpy_state, current_loss = session.run([final_state, loss], + # Initialize the LSTM state from the previous iteration. + feed_dict={initial_state: numpy_state, words: current_batch_of_words}) + total_loss += current_loss +``` + +### Inputs + +The word IDs will be embedded into a dense representation (see the +[Vector Representations Tutorial](../word2vec/index.md)) before feeding to +the LSTM. This allows the model to efficiently represent the knowledge about +particular words. It is also easy to write: + +```python +# embedding_matrix is a tensor of shape [vocabulary_size, embedding size] +word_embeddings = tf.nn.embedding_lookup(embedding_matrix, word_ids) +``` + +The embedding matrix will be initialized randomly and the model will learn to +differentiate the meaning of words just by looking at the data. + +### Loss Function + +We want to minimize the average negative log probability of the target words: + +$$ \text{loss} = -\frac{1}{N}\sum_{i=1}^{N} \ln p_{\text{target}_i} $$ + +It is not very difficult to implement but the function +`sequence_loss_by_example` is already available, so we can just use it here. 
+ +The typical measure reported in the papers is average per-word perplexity (often +just called perplexity), which is equal to + +$$e^{-\frac{1}{N}\sum_{i=1}^{N} \ln p_{\text{target}_i}} = e^{\text{loss}} $$ + +and we will monitor its value throughout the training process. + +### Stacking multiple LSTMs + +To give the model more expressive power, we can add multiple layers of LSTMs +to process the data. The output of the first layer will become the input of +the second and so on. + +We have a class called `MultiRNNCell` that makes the implementation seamless: + +```python +lstm = rnn_cell.BasicLSTMCell(lstm_size) +stacked_lstm = rnn_cell.MultiRNNCell([lstm] * number_of_layers) + +initial_state = state = stacked_lstm.zero_state(batch_size, tf.float32) +for i in range(num_steps): + # The value of state is updated after processing each batch of words. + output, state = stacked_lstm(words[:, i], state) + + # The rest of the code. + # ... + +final_state = state +``` + +## Compile and Run the Code + +First, the library needs to be built. To compile it on CPU: + +``` +bazel build -c opt tensorflow/models/rnn/ptb:ptb_word_lm +``` + +And if you have a fast GPU, run the following: + +``` +bazel build -c opt tensorflow --config=cuda \ + tensorflow/models/rnn/ptb:ptb_word_lm +``` + +Now we can run the model: + +``` +bazel-bin/.../ptb_word_lm \ + --data_path=/tmp/simple-examples/data/ --alsologtostderr --model small +``` + +There are 3 supported model configurations in the tutorial code: "small", +"medium" and "large". The difference between them is in the size of the LSTMs and +the set of hyperparameters used for training. + +The larger the model, the better results it should get. The `small` model should +be able to reach perplexity below 120 on the test set and the `large` one below +80, though it might take several hours to train. + +## What Next? 
+ +There are several tricks that we haven't mentioned that make the model better, +including: + +* decreasing learning rate schedule, +* dropout between the LSTM layers. + +Study the code and modify it to improve the model even further. diff --git a/tensorflow/g3doc/tutorials/seq2seq/index.md b/tensorflow/g3doc/tutorials/seq2seq/index.md new file mode 100644 index 0000000000..e421c814aa --- /dev/null +++ b/tensorflow/g3doc/tutorials/seq2seq/index.md @@ -0,0 +1,331 @@ +# Sequence-to-Sequence Models: Learning to Translate + +Recurrent neural networks can learn to model language, as already discussed +in the [RNN Tutorial](../recurrent/index.md) +(if you did not read it, please go through it before proceeding with this one). +This raises an interesting question: could we condition the generated words on +some input and generate a meaningful response? For example, could we train +a neural network to translate from English to French? It turns out that +the answer is *yes*. + +This tutorial will show you how to build and train such a system end-to-end. +You can start by running this binary. + +``` +bazel run -c opt <...>/models/rnn/translate/translate.py + --data_dir [your_data_directory] +``` + +It will download English-to-French translation data from the +[WMT'15 Website](http://www.statmt.org/wmt15/translation-task.html) +prepare it for training and train. It takes about 20GB of disk space, +and a while to download and prepare (see [later](#run_it) for details), +so you can start and leave it running while reading this tutorial. + +This tutorial references the following files from `models/rnn`. + +File | What's in it? +--- | --- +`seq2seq.py` | Library for building sequence-to-sequence models. +`translate/seq2seq_model.py` | Neural translation sequence-to-sequence model. +`translate/data_utils.py` | Helper functions for preparing translation data. +`translate/translate.py` | Binary that trains and runs the translation model. 
+ + +## Sequence-to-Sequence Basics + +A basic sequence-to-sequence model, as introduced in +[Cho et al., 2014](http://arxiv.org/pdf/1406.1078v3.pdf), +consists of two recurrent neural networks (RNNs): an *encoder* that +processes the input and a *decoder* that generates the output. +This basic architecture is depicted below. + +<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="basic_seq2seq.png" /> +</div> + +Each box in the picture above represents a cell of the RNN, most commonly +a GRU cell or an LSTM cell (see the [RNN Tutorial](../recurrent/index.md) +for an explanation of those). Encoder and decoder can share weights or, +as is more common, use a different set of parameters. Multi-layer cells +have been successfully used in sequence-to-sequence models too, e.g. for +translation [Sutskever et al., 2014](http://arxiv.org/abs/1409.3215). + +In the basic model depicted above, every input has to be encoded into +a fixed-size state vector, as that is the only thing passed to the decoder. +To allow the decoder more direct access to the input, an *attention* mechanism +was introduced in [Bahdanau et al., 2014](http://arxiv.org/abs/1409.0473). +We will not go into the details of the attention mechanism (see the paper); +suffice it to say that it allows the decoder to peek into the input at every +decoding step. A multi-layer sequence-to-sequence network with LSTM cells and +attention mechanism in the decoder looks like this. + +<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="attention_seq2seq.png" /> +</div> + +## TensorFlow seq2seq Library + +As you can see above, there are many different sequence-to-sequence +models. Each of these models can use different RNN cells, but all +of them accept encoder inputs and decoder inputs. This motivates +the interfaces in the TensorFlow seq2seq library (`models/rnn/seq2seq.py`). 
+The basic RNN encoder-decoder sequence-to-sequence model works as follows. + +```python +outputs, states = basic_rnn_seq2seq(encoder_inputs, decoder_inputs, cell) +``` + +In the above call, `encoder_inputs` are a list of tensors representing inputs +to the encoder, i.e., corresponding to the letters *A, B, C* in the first +picture above. Similarly, `decoder_inputs` are tensors representing inputs +to the decoder, *GO, W, X, Y, Z* on the first picture. + +The `cell` argument is an instance of the `models.rnn.rnn_cell.RNNCell` class +that determines which cell will be used inside the model. You can use +an existing cell, such as `GRUCell` or `LSTMCell`, or you can write your own. +Moreover, `rnn_cell` provides wrappers to construct multi-layer cells, +add dropout to cell inputs or outputs, or to do other transformations, +see the [RNN Tutorial](../recurrent/index.md) for examples. + +The call to `basic_rnn_seq2seq` returns two arguments: `outputs` and `states`. +Both of them are lists of tensors of the same length as `decoder_inputs`. +Naturally, `outputs` correspond to the outputs of the decoder in each time-step, +in the first picture above that would be *W, X, Y, Z, EOS*. The returned +`states` represent the internal state of the decoder at every time-step. + +In many applications of sequence-to-sequence models, the output of the decoder +at time t is fed back and becomes the input of the decoder at time t+1. At test +time, when decoding a sequence, this is how the sequence is constructed. +During training, on the other hand, it is common to provide the correct input +to the decoder at every time-step, even if the decoder made a mistake before. +Functions in `seq2seq.py` support both modes using the `feed_previous` argument. +For example, let's analyze the following use of an embedding RNN model. 
+ +```python +outputs, states = embedding_rnn_seq2seq( + encoder_inputs, decoder_inputs, cell, + num_encoder_symbols, num_decoder_symbols, + output_projection=None, feed_previous=False) +``` + +In the `embedding_rnn_seq2seq` model, all inputs (both `encoder_inputs` and +`decoder_inputs`) are integer-tensors that represent discrete values. +They will be embedded into a dense representation (see the +[Vector Representations Tutorial](../word2vec/index.md) for more details +on embeddings), but to construct these embeddings we need to specify +the maximum number of discrete symbols that will appear: `num_encoder_symbols` +on the encoder side, and `num_decoder_symbols` on the decoder side. + +In the above invocation, we set `feed_previous` to False. This means that the +decoder will use `decoder_inputs` tensors as provided. If we set `feed_previous` +to True, the decoder would only use the first element of `decoder_inputs`. +All other tensors from this list would be ignored, and instead the previous +output of the decoder would be used. This is used for decoding translations +in our translation model, but it can also be used during training, to make +the model more robust to its own mistakes, similar +to [Bengio et al., 2015](http://arxiv.org/pdf/1506.03099v2.pdf). + +One more important argument used above is `output_projection`. If not specified, +the outputs of the embedding model will be tensors of shape batch-size by +`num_decoder_symbols` as they represent the logits for each generated symbol. +When training models with large output vocabularies, i.e., when +`num_decoder_symbols` is large, it is not practical to store these large +tensors. Instead, it is better to return smaller output tensors, which will +later be projected onto a large output tensor using `output_projection`. +This allows us to use our seq2seq models with a sampled softmax loss, as described +in [Jean et al., 2015](http://arxiv.org/pdf/1412.2007v2.pdf). 
+ +In addition to `basic_rnn_seq2seq` and `embedding_rnn_seq2seq` there are a few +more sequence-to-sequence models in `seq2seq.py`; take a look there. They all +have similar interfaces, so we will not describe them in detail. We will use +`embedding_attention_seq2seq` for our translation model below. + +## Neural Translation Model + +While the core of the sequence-to-sequence model is constructed by +the functions in `models/rnn/seq2seq.py`, there are still a few tricks +that are worth mentioning that are used in our translation model in +`models/rnn/translate/seq2seq_model.py`. + +### Sampled softmax and output projection + +For one, as already mentioned above, we want to use sampled softmax to +handle a large output vocabulary. To decode from it, we need to keep track +of the output projection. Both the sampled softmax loss and the output +projections are constructed by the following code in `seq2seq_model.py`. + +```python + if num_samples > 0 and num_samples < self.target_vocab_size: + w = tf.get_variable("proj_w", [size, self.target_vocab_size]) + w_t = tf.transpose(w) + b = tf.get_variable("proj_b", [self.target_vocab_size]) + output_projection = (w, b) + + def sampled_loss(inputs, labels): + labels = tf.reshape(labels, [-1, 1]) + return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples, + self.target_vocab_size) +``` + +First, note that we only construct a sampled softmax if the number of samples +(512 by default) is smaller than the target vocabulary size. For vocabularies +smaller than 512 it might be a better idea to just use a standard softmax loss. + +Then, as you can see, we construct an output projection. It is a pair, +consisting of a weight matrix and a bias vector. If used, the rnn cell +will return vectors of shape batch-size by `size`, rather than batch-size +by `target_vocab_size`. To recover logits, we need to multiply by the weight +matrix and add the biases, as is done in lines 124-126 in `seq2seq_model.py`. 
+ +```python +if output_projection is not None: + self.outputs[b] = [tf.matmul(output, output_projection[0]) + + output_projection[1] for ...] +``` + +### Bucketing and padding + +In addition to sampled softmax, our translation model also makes use +of *bucketing*, which is a method to efficiently handle sentences of +different lengths. Let us first clarify the problem. When translating +English to French, we will have English sentences of different lengths L1 +on input, and French sentences of different lengths L2 on output. Since +the English sentence is passed as `encoder_inputs`, and the French sentence +comes as `decoder_inputs` (prefixed by a GO symbol), we should in principle +create a seq2seq model for every pair (L1, L2+1) of lengths of an English +and French sentence. This would result in an enormous graph consisting of +many very similar subgraphs. On the other hand, we could just pad every +sentence with a special PAD symbol. Then we'd need only one seq2seq model, +for the padded lengths. But on shorter sentences our model would be inefficient, +encoding and decoding many PAD symbols that are useless. + +As a compromise between constructing a graph for every pair of lengths and +padding to a single length, we use a number of *buckets* and pad each sentence +to the length of the bucket above it. In `translate.py` we use the following +default buckets. + +```python +buckets = [(5, 10), (10, 15), (20, 25), (40, 50)] +``` + +This means that if the input is an English sentence with 3 tokens, +and the corresponding output is a French sentence with 6 tokens, +then they will be put in the first bucket and padded to length 5 for +encoder inputs, and length 10 for decoder inputs. If we have an English +sentence with 8 tokens and the corresponding French sentence has 18 tokens, +then they will not fit into the (10, 15) bucket, and so the (20, 25) bucket +will be used, i.e. the English sentence will be padded to 20, and the French +one to 25. 
+ +Remember that when constructing decoder inputs we prepend the special `GO` +symbol to the input data. This is done in the `get_batch()` function in +`seq2seq_model.py`, which also reverses the input English sentence. +Reversing the inputs was shown to improve results for the neural translation +model in [Sutskever et al., 2014](http://arxiv.org/abs/1409.3215). +To put it all together, imagine we have the sentence "I go.", tokenized +as `["I", "go", "."]` as input and the sentence "Je vais." as output, +tokenized `["Je", "vais", "."]`. It will be put in the (5, 10) bucket, +with encoder inputs representing `[PAD PAD "." "go" "I"]` and decoder +inputs `[GO "Je" "vais" "." EOS PAD PAD PAD PAD PAD]`. + + +## Let's Run It {#run_it} + +To train the model described above, we need a large English-French corpus. +We will use the *10^9-French-English corpus* from the +[WMT'15 Website](http://www.statmt.org/wmt15/translation-task.html) +for training, and the 2013 news test from the same site as the development set. +Both data-sets will be downloaded to `data_dir` and training will start, +saving checkpoints in `train_dir`, when this command is run. + +``` +bazel run -c opt <...>/models/rnn/translate:translate + --data_dir [your_data_directory] --train_dir [checkpoints_directory] + --en_vocab_size=40000 --fr_vocab_size=40000 +``` + +It takes about 18GB of disk space and several hours to prepare the training +corpus. It is unpacked, vocabulary files are created in `data_dir`, and then +the corpus is tokenized and converted to integer ids. Note the parameters +that determine vocabulary sizes. In the example above, all words outside +the 40K most common ones will be converted to an `UNK` token representing +unknown words. So if you change vocabulary size, the binary will re-map +the corpus to token-ids again. + +After the data is prepared, training starts. Default parameters in `translate` +are set to quite large values. 
Large models trained over a long time give good +results, but it might take too long or use too much memory for your GPU. +You can request to train a smaller model as in the following example. + +``` +bazel run -c opt <...>/models/rnn/translate:translate + --data_dir [your_data_directory] --train_dir [checkpoints_directory] + --size=256 --num_layers=2 --steps_per_checkpoint=50 +``` + +The above command will train a model with 2 layers (the default is 3), +each layer with 256 units (default is 1024), and will save a checkpoint +every 50 steps (the default is 200). You can play with these parameters +to find out how large a model can be to fit into the memory of your GPU. + +During training, every `steps_per_checkpoint` steps the binary will print +out statistics from recent steps. With the default parameters (3 layers +of size 1024), first messages look like this. + +``` +global step 200 learning rate 0.5000 step-time 1.39 perplexity 1720.62 + eval: bucket 0 perplexity 184.97 + eval: bucket 1 perplexity 248.81 + eval: bucket 2 perplexity 341.64 + eval: bucket 3 perplexity 469.04 +global step 400 learning rate 0.5000 step-time 1.38 perplexity 379.89 + eval: bucket 0 perplexity 151.32 + eval: bucket 1 perplexity 190.36 + eval: bucket 2 perplexity 227.46 + eval: bucket 3 perplexity 238.66 +``` + +You can see that each step takes just under 1.4 seconds, the perplexity +on the training set and the perplexities on the development set +for each bucket. After about 30K steps, we see perplexities on short +sentences (bucket 0 and 1) going into single digits. +Since the training corpus contains ~22M sentences, one epoch (going through +the training data once) takes about 340K steps with batch-size of 64. At this +point the model can be used for translating English sentences to French +using the `--decode` option. 
+ +``` +bazel run -c opt <...>/models/rnn/translate:translate --decode + --data_dir [your_data_directory] --train_dir [checkpoints_directory] + +Reading model parameters from /tmp/translate.ckpt-340000 +> Who is the president of the United States? + Qui est le président des États-Unis ? +``` + +## What Next? + +The example above shows how you can build your own English-to-French +translator, end-to-end. Run it and see how the model performs for yourself. +While it has reasonable quality, the default parameters will not give you +the best translation model. Here are a few things you can improve. + +First of all, we use a very primitive tokenizer, the `basic_tokenizer` function +in `data_utils`. A better tokenizer can be found on the +[WMT'15 Website](http://www.statmt.org/wmt15/translation-task.html). +Using that tokenizer, and a larger vocabulary, should improve your translations. + +Also, the default parameters of the translation model are not tuned. +You can try changing the learning rate, decay, or initializing the weights +of your model in a different way. You can also change the default +`GradientDescentOptimizer` in `seq2seq_model.py` to a more advanced one, such +as `AdagradOptimizer`. Try these things and see how they improve your results! + +Finally, the model presented above can be used for any sequence-to-sequence +task, not only for translation. Even if you want to transform a sequence to +a tree, for example to generate a parsing tree, the same model as above can +give state-of-the-art results, as demonstrated in +[Vinyals & Kaiser et al., 2015](http://arxiv.org/abs/1412.7449). +So you can not only build your own translator, you can also build a parser, +a chat-bot, or any program that comes to your mind. Experiment! 
diff --git a/tensorflow/g3doc/tutorials/word2vec/__init__.py b/tensorflow/g3doc/tutorials/word2vec/__init__.py new file mode 100755 index 0000000000..e69de29bb2 --- /dev/null +++ b/tensorflow/g3doc/tutorials/word2vec/__init__.py diff --git a/tensorflow/g3doc/tutorials/word2vec/index.md b/tensorflow/g3doc/tutorials/word2vec/index.md new file mode 100644 index 0000000000..8779f33ad7 --- /dev/null +++ b/tensorflow/g3doc/tutorials/word2vec/index.md @@ -0,0 +1,396 @@ +# Learning Vector Representations of Words + +In this tutorial we look at the word2vec model by +[Mikolov et al.](http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf). +This model is used for learning vector representations of words, called *word +embeddings*. + +## Highlights + +This tutorial is meant to highlight the interesting, substantive parts of +building a word2vec model in TensorFlow. + +* We start by giving the motivation for why we would want to +represent words as vectors. +* We look at the intuition behind the model and how it is trained +(with a splash of math for good measure). +* We also show a simple implementation of the model in TensorFlow. +* Finally, we look at ways to make the naive version scale better. + +We walk through the code later during the tutorial, but if you'd prefer to +dive straight in, feel free to look at the minimalistic implementation in +[tensorflow/g3doc/tutorials/word2vec/word2vec_basic.py](./word2vec_basic.py) +This basic example contains the code needed to download some data, train on it +a bit and visualize the result. 
Once you get +comfortable with reading and running the basic version, you can graduate to +[tensorflow/models/embedding/word2vec.py](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/embedding/word2vec.py) +which is a more serious implementation that showcases some more advanced +TensorFlow principles about how to efficiently use threads to move data into a +text model, how to checkpoint during training, etc. + +But first, let's look at why we would want to learn word embeddings in the first +place. Feel free to skip this section if you're an Embedding Pro and you'd just +like to get your hands dirty with the details. + +## Motivation: Why Learn Word Embeddings? + +Image and audio processing systems work with rich, high-dimensional datasets +encoded as vectors of the individual raw pixel-intensities for image data, or +e.g. power spectral density coefficients for audio data. For tasks like object +or speech recognition we know that all the information required to successfully +perform the task is encoded in the data (because humans can perform these tasks +from the raw data). However, natural language processing systems traditionally +treat words as discrete atomic symbols, and therefore 'cat' may be represented +as `Id537` and 'dog' as `Id143`. These encodings are arbitrary, and provide +no useful information to the system regarding the relationships that may exist +between the individual symbols. This means that the model can leverage +very little of what it has learned about 'cats' when it is processing data about +'dogs' (such that they are both animals, four-legged, pets, etc.). Representing +words as unique, discrete ids furthermore leads to data sparsity, and usually +means that we may need more data in order to successfully train statistical +models. Using vector representations can overcome some of these obstacles. 
+ +<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/audio-image-text.png" alt> +</div> + +[Vector space models](https://en.wikipedia.org/wiki/Vector_space_model) (VSMs) +represent (embed) words in a continuous vector space where semantically +similar words are mapped to nearby points ('are embedded nearby each other'). +VSMs have a long, rich history in NLP, but all methods depend in some way or +another on the +[Distributional Hypothesis](https://en.wikipedia.org/wiki/Distributional_semantics#Distributional_Hypothesis), +which states that words that appear in the same contexts share +semantic meaning. The different approaches that leverage this principle can be +divided into two categories: *count-based methods* (e.g. +[Latent Semantic Analysis](https://en.wikipedia.org/wiki/Latent_semantic_analysis)), +and *predictive methods* (e.g. +[neural probabilistic language models](http://www.scholarpedia.org/article/Neural_net_language_models)). + +This distinction is elaborated in much more detail by +[Baroni et al.](http://clic.cimec.unitn.it/marco/publications/acl2014/baroni-etal-countpredict-acl2014.pdf), +but in a nutshell: Count-based methods compute the statistics of +how often some word co-occurs with its neighbor words in a large text corpus, +and then map these count-statistics down to a small, dense vector for each word. +Predictive models directly try to predict a word from its neighbors in terms of +learned small, dense *embedding vectors* (considered parameters of the +model). + +Word2vec is a particularly computationally-efficient predictive model for +learning word embeddings from raw text. It comes in two flavors, the Continuous +Bag-of-Words model (CBOW) and the Skip-Gram model. Algorithmically, these +models are similar, except that CBOW predicts target words (e.g. 
'mat') from +source context words ('the cat sits on the'), while the skip-gram does the +inverse and predicts source context-words from the target words. This inversion +might seem like an arbitrary choice, but statistically it has the effect that +CBOW smoothes over a lot of the distributional information (by treating an +entire context as one observation). For the most part, this turns out to be a +useful thing for smaller datasets. However, skip-gram treats each context-target +pair as a new observation, and this tends to do better when we have larger +datasets. We will focus on the skip-gram model in the rest of this tutorial. + + +## Scaling up with Noise-Contrastive Training + +Neural probabilistic language models are traditionally trained using the +[maximum likelihood](https://en.wikipedia.org/wiki/Maximum_likelihood) (ML) +principle to maximize the probability of the next word $$w_t$$ (for 'target') +given the previous words $$h$$ (for 'history') in terms of a +[*softmax* function](https://en.wikipedia.org/wiki/Softmax_function), + +$$ +\begin{align} +P(w_t | h) &= \text{softmax}(\text{score}(w_t, h)) \\ + &= \frac{\exp \{ \text{score}(w_t, h) \} } + {\sum_\text{Word w' in Vocab} \exp \{ \text{score}(w', h) \} }. +\end{align} +$$ + +where $$\text{score}(w_t, h)$$ computes the compatibility of word $$w_t$$ with +the context $$h$$ (a dot product is commonly used). We train this model by +maximizing its log-likelihood on the training set, i.e. by maximizing + +$$ +\begin{align} + J_\text{ML} &= \log P(w_t | h) \\ + &= \text{score}(w_t, h) - + \log \left( \sum_\text{Word w' in Vocab} \exp \{ \text{score}(w', h) \} \right) +\end{align} +$$ + +This yields a properly normalized probabilistic model for language modeling. +However this is very expensive, because we need to compute and normalize each +probability using the score for all other $$V$$ words $$w'$$ in the current +context $$h$$, *at every training step*. 
+ +<div style="width:60%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/softmax-nplm.png" alt> +</div> + +On the other hand, for feature learning in word2vec we do not need a full +probabilistic model. The CBOW and skip-gram models are instead trained using a +binary classification objective (logistic regression) to discriminate the real +target words $$w_t$$ from $$k$$ imaginary (noise) words $$\tilde w$$, in the +same context. We illustrate this below for a CBOW model. For skip-gram the +direction is simply inverted. + +<div style="width:60%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/nce-nplm.png" alt> +</div> + +Mathematically, the objective (for each example) is to maximize + +$$J_\text{NEG} = \log Q_\theta(D=1 |w_t, h) + + k \mathop{\mathbb{E}}_{\tilde w \sim P_\text{noise}} + \left[ \log Q_\theta(D = 0 |\tilde w, h) \right]$$, + +where $$Q_\theta(D=1 | w, h)$$ is the binary logistic regression probability +under the model of seeing the word $$w$$ in the context $$h$$ in the dataset +$$D$$, calculated in terms of the learned embedding vectors $$\theta$$. In +practice we approximate the expectation by drawing $$k$$ contrastive words +from the noise distribution (i.e. we compute a +[Monte Carlo average](https://en.wikipedia.org/wiki/Monte_Carlo_integration)). + +This objective is maximized when the model assigns high probabilities +to the real words, and low probabilities to noise words. Technically, this is +called +[Negative Sampling](http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf), +and there is good mathematical motivation for using this loss function: +The updates it proposes approximate the updates of the softmax function in the +limit. 
But computationally it is especially appealing because computing the +loss function now scales only with the number of *noise words* that we +select ($$k$$), and not *all words* in the vocabulary ($$V$$). This makes it +much faster to train. We will actually make use of the very similar +[noise-contrastive estimation (NCE)](http://papers.nips.cc/paper/5165-learning-word-embeddings-efficiently-with-noise-contrastive-estimation.pdf) +loss, for which TensorFlow has a handy helper function `tf.nn.nce_loss()`. + +Let's get an intuitive feel for how this would work in practice! + +## The Skip-gram Model + +As an example, let's consider the dataset + +`the quick brown fox jumped over the lazy dog` + +We first form a dataset of words and the contexts in which they appear. We +could define 'context' in any way that makes sense, and in fact people have +looked at syntactic contexts (i.e. the syntactic dependents of the current +target word, see e.g. +[Levy et al.](https://levyomer.files.wordpress.com/2014/04/dependency-based-word-embeddings-acl-2014.pdf)), +words-to-the-left of the target, words-to-the-right of the target, etc. For now, +let's stick to the vanilla definition and define 'context' as the window +of words to the left and to the right of a target word. Using a window +size of 1, we then have the dataset + +`([the, brown], quick), ([quick, fox], brown), ([brown, jumped], fox), ...` + +of `(context, target)` pairs. Recall that skip-gram inverts contexts and +targets, and tries to predict each context word from its target word, so the +task becomes to predict 'the' and 'brown' from 'quick', 'quick' and 'fox' from +'brown', etc. Therefore our dataset becomes + +`(quick, the), (quick, brown), (brown, quick), (brown, fox), ...` + +of `(input, output)` pairs. 
The objective function is defined over the entire +dataset, but we typically optimize this with +[stochastic gradient descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) +(SGD) using one example at a time (or a 'minibatch' of `batch_size` examples, +where typically `16 <= batch_size <= 512`). So let's look at one step of +this process. + +Let's imagine at training step $$t$$ we observe the first training case above, +where the goal is to predict `the` from `quick`. We select `num_noise` number +of noisy (contrastive) examples by drawing from some noise distribution, +typically the unigram distribution, $$P(w)$$. For simplicity let's say +`num_noise=1` and we select `sheep` as a noisy example. Next we compute the +loss for this pair of observed and noisy examples, i.e. the objective at time +step $$t$$ becomes + +$$J^{(t)}_\text{NEG} = \log Q_\theta(D=1 | \text{the, quick}) + + \log(Q_\theta(D=0 | \text{sheep, quick}))$$. + +The goal is to make an update to the embedding parameters $$\theta$$ to improve +(in this case, maximize) this objective function. We do this by deriving the +gradient of the loss with respect to the embedding parameters $$\theta$$, i.e. +$$\frac{\partial}{\partial \theta} J_\text{NEG}$$ (luckily TensorFlow provides +easy helper functions for doing this!). We then perform an update to the +embeddings by taking a small step in the direction of the gradient. When this +process is repeated over the entire training set, this has the effect of +'moving' the embedding vectors around for each word until the model is +successful at discriminating real words from noise words. + +We can visualize the learned vectors by projecting them down to 2 dimensions +using for instance something like the +[t-SNE dimensionality reduction technique](http://lvdmaaten.github.io/tsne/). 
+When we inspect these visualizations it becomes apparent that the vectors +capture some general, and in fact quite useful, semantic information about +words and their relationships to one another. It was very interesting when we +first discovered that certain directions in the induced vector space specialize +towards certain semantic relationships, e.g. *male-female*, *gender* and +even *country-capital* relationships between words, as illustrated in the figure +below (see also for example +[Mikolov et al., 2013](http://www.aclweb.org/anthology/N13-1090)). + +<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/linear-relationships.png" alt> +</div> + +This explains why these vectors are also useful as features for many canonical +NLP prediction tasks, such as part-of-speech tagging or named entity recognition +(see for example the original work by +[Collobert et al.](http://arxiv.org/pdf/1103.0398v1.pdf), or follow-up work by +[Turian et al.](http://www.aclweb.org/anthology/P10-1040)). + +But for now, let's just use them to draw pretty pictures! + +## Building the Graph + +This is all about embeddings, so let's define our embedding matrix. +This is just a big random matrix to start. We'll initialize the values to be +uniform in the range [-1, 1). + +```python +embeddings = tf.Variable( + tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)) +``` + +The noise-contrastive estimation loss is defined in terms of a logistic regression +model. For this, we need to define the weights and biases for each word in the +vocabulary (also called the `output weights` as opposed to the `input +embeddings`). So let's define that. + +```python +nce_weights = tf.Variable( + tf.truncated_normal([vocabulary_size, embedding_size], + stddev=1.0 / math.sqrt(embedding_size))) +nce_biases = tf.Variable(tf.zeros([vocabulary_size])) +``` + +Now that we have the parameters in place, we can define our skip-gram model +graph. 
For simplicity, let's suppose we've already integerized our text corpus +with a vocabulary so that each word is represented as an integer (see +[tensorflow/g3doc/tutorials/word2vec/word2vec_basic.py](./word2vec_basic.py) for +the details). The skip-gram model takes two inputs. One is a batch full of +integers representing the source context words, the other is for the target +words. Let's create placeholder nodes for these inputs, so that we can feed in +data later. + +```python +# Placeholders for inputs +train_inputs = tf.placeholder(tf.int32, shape=[batch_size]) +train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1]) +``` + +Now what we need to do is look up the vector for each of the source words in +the batch. TensorFlow has handy helpers that make this easy. + +```python +embed = tf.nn.embedding_lookup(embeddings, train_inputs) +``` + +Ok, now that we have the embeddings for each word, we'd like to try to predict +the target word using the noise-contrastive training objective. + +```python +# Compute the NCE loss, using a sample of the negative labels each time. +loss = tf.reduce_mean( + tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels, + num_sampled, vocabulary_size)) +``` + +Now that we have a loss node, we need to add the nodes required to compute +gradients and update the parameters, etc. For this we will use stochastic +gradient descent, and TensorFlow has handy helpers to make this easy. + +```python +# We use the SGD optimizer. +optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss) +``` + +## Training the Model + +Training the model is then as simple as using a `feed_dict` to push data into +the placeholders and calling `session.run` with this new data in a loop. 
+ +```python +for inputs, labels in generate_batch(...): + feed_dict = {train_inputs: inputs, train_labels: labels} + _, cur_loss = session.run([optimizer, loss], feed_dict=feed_dict) +``` + +See the full example code in +[tensorflow/g3doc/tutorials/word2vec/word2vec_basic.py](./word2vec_basic.py). + +## Visualizing the Learned Embeddings + +After training has finished we can visualize the learned embeddings using +t-SNE. + +<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;"> +<img style="width:100%" src="img/tsne.png" alt> +</div> + +Et voila! As expected, words that are similar end up clustering nearby each +other. For a more heavyweight implementation of word2vec that showcases more of +the advanced features of TensorFlow, see the implementation in +[tensorflow/models/embedding/word2vec.py](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/embedding/word2vec.py). + +## Evaluating Embeddings: Analogical Reasoning + +Embeddings are useful for a wide variety of prediction tasks in NLP. Short of +training a full-blown part-of-speech model or named-entity model, one simple way +to evaluate embeddings is to directly use them to predict syntactic and semantic +relationships like `king is to queen as father is to ?`. This is called +*analogical reasoning* and the task was introduced by +[Mikolov and colleagues](http://msr-waypoint.com/en-us/um/people/gzweig/Pubs/NAACL2013Regularities.pdf), +and the dataset can be downloaded from here: +https://word2vec.googlecode.com/svn/trunk/questions-words.txt. + +To see how we do this evaluation, have a look at the `build_eval_graph()` and +`eval()` functions in +[tensorflow/models/embedding/word2vec.py](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/embedding/word2vec.py). + +The choice of hyperparameters can strongly influence the accuracy on this task. 
+To achieve state-of-the-art performance on this task requires training over a +very large dataset, carefully tuning the hyperparameters and making use of +tricks like subsampling the data, which is out of the scope of this tutorial. + + +## Optimizing the Implementation + +Our vanilla implementation showcases the flexibility of TensorFlow. For +example, changing the training objective is as simple as swapping out the call +to `tf.nn.nce_loss()` for an off-the-shelf alternative such as +`tf.nn.sampled_softmax_loss()`. If you have a new idea for a loss function, you +can manually write an expression for the new objective in TensorFlow and let +the optimizer compute its derivatives. This flexibility is invaluable in the +exploratory phase of machine learning model development, where we are trying +out several different ideas and iterating quickly. + +Once you have a model structure you're satisfied with, it may be worth +optimizing your implementation to run more efficiently (and cover more data in +less time). For example, the naive code we used in this tutorial would suffer +compromised speed because we use Python for reading and feeding data items -- +each of which require very little work on the TensorFlow back-end. If you find +your model is seriously bottlenecked on input data, you may want to implement a +custom data reader for your problem, as described in [New Data +Formats](../how_tos/new_data_formats/index.md). For the case of Skip-Gram +modeling, we've actually already done this for you as an example in +[tensorflow/models/embedding/word2vec.py](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/embedding/word2vec.py). + +If your model is no longer I/O bound but you want still more performance, you +can take things further by writing your own TensorFlow Ops, as described in +[Adding a New Op](../how_tos/adding_an_op/index.md). 
Again we've provided an +example of this for the Skip-Gram case +[tensorflow/models/embedding/word2vec_optimized.py](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/models/embedding/word2vec_optimized.py). +Feel free to benchmark these against each other to measure performance +improvements at each stage. + +## Conclusion + +In this tutorial we covered the word2vec model, a computationally efficient +model for learning word embeddings. We motivated why embeddings are useful, +discussed efficient training techniques and showed how to implement all of this +in TensorFlow. Overall, we hope that this has show-cased how TensorFlow affords +you the flexibility you need for early experimentation, and the control you +later need for bespoke optimized implementation. diff --git a/tensorflow/g3doc/tutorials/word2vec/word2vec_basic.py b/tensorflow/g3doc/tutorials/word2vec/word2vec_basic.py new file mode 100644 index 0000000000..0a981570fa --- /dev/null +++ b/tensorflow/g3doc/tutorials/word2vec/word2vec_basic.py @@ -0,0 +1,219 @@ +import collections +import math +import numpy as np +import os +import random +import tensorflow as tf +import urllib +import zipfile + +# Step 1: Download the data. +url = 'http://mattmahoney.net/dc/' + +def maybe_download(filename, expected_bytes): + """Download a file if not present, and make sure it's the right size.""" + if not os.path.exists(filename): + filename, _ = urllib.urlretrieve(url + filename, filename) + statinfo = os.stat(filename) + if statinfo.st_size == expected_bytes: + print 'Found and verified', filename + else: + print statinfo.st_size + raise Exception( + 'Failed to verify ' + filename + '. Can you get to it with a browser?') + return filename + +filename = maybe_download('text8.zip', 31344016) + +# Read the data into a string. 
def read_data(filename):
  """Return the whitespace-split contents of the first member of a zip file.

  Args:
    filename: Path of the zip archive to read.

  Returns:
    A list of tokens obtained by calling split() on the first archive
    member's contents, or None if the archive has no members.
  """
  # The with-block closes the archive even though we return from inside the
  # loop; a close() call placed after the loop would never be reached.
  with zipfile.ZipFile(filename) as f:
    for name in f.namelist():
      return f.read(name).split()

words = read_data(filename)
print('Data size %d' % len(words))

# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000

def build_dataset(words):
  """Map words to integer ids, sending out-of-vocabulary words to id 0.

  The vocabulary is the 'UNK' placeholder followed by the
  `vocabulary_size - 1` most common words, numbered in that order.
  `vocabulary_size` is read from module scope.

  Args:
    words: Sequence of word tokens.

  Returns:
    A tuple (data, count, dictionary, reverse_dictionary):
      data: list with one integer id per input word.
      count: list of [word, occurrences] entries, 'UNK' first; the 'UNK'
        entry holds the number of words that were mapped to id 0.
      dictionary: dict mapping word -> id.
      reverse_dictionary: dict mapping id -> word.
  """
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    # Ids are handed out in insertion order, so 'UNK' gets id 0.
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  # Replace the -1 placeholder with the real number of unknown words.
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
del words  # Hint to reduce memory.
print('Most common words (+UNK) %s' % (count[:5],))
print('Sample data %s' % (data[:10],))

# Cursor into `data`, advanced by generate_batch().
data_index = 0

# Step 4: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  """Build one skip-gram training batch from the module-level `data` list.

  Reads `data` starting at the module-level `data_index` cursor and advances
  that cursor, wrapping around at the end of `data`.

  Args:
    batch_size: Number of (input, label) pairs to produce; must be a multiple
      of `num_skips`.
    num_skips: Number of labels drawn for each center word; must be at most
      2 * skip_window.
    skip_window: Number of words considered on each side of the center word.

  Returns:
    A tuple (batch, labels). `batch` is an int32 array of shape (batch_size,)
    holding center-word ids; `labels` is an int32 array of shape
    (batch_size, 1) holding the context-word id sampled for each entry.
  """
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  # Sliding window over `data`; not called `buffer`, which would shadow the
  # Python 2 builtin of that name.
  window = collections.deque(maxlen=span)
  for _ in range(span):
    window.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  # Floor division keeps the loop bound an integer under true division too.
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the window
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      # Re-draw until we hit a window position that is neither the center
      # nor one already used for this center word.
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = window[skip_window]
      labels[i * num_skips + j, 0] = window[target]
    # Slide the window one word to the right (maxlen drops the oldest word).
    window.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print('%s -> %s' % (batch[i], labels[i, 0]))
  print('%s -> %s' % (reverse_dictionary[batch[i]],
                      reverse_dictionary[labels[i, 0]]))

# Step 5: Build and train a skip-gram model.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64    # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Construct the variables.
  embeddings = tf.Variable(
      tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  nce_weights = tf.Variable(
      tf.truncated_normal([vocabulary_size, embedding_size],
                          stddev=1.0 / math.sqrt(embedding_size)))
  nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_inputs)

  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels
  # each time we evaluate the loss.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                     num_sampled, vocabulary_size))

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

# Step 6: Begin training
num_steps = 100001

with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  tf.initialize_all_variables().run()
  print("Initialized")

  average_loss = 0
  for step in range(num_steps):
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()).
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val

    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print("Average loss at step  %s :  %s" % (step, average_loss))
      average_loss = 0

    # note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        # Sort by descending similarity and skip position 0 of the ranking.
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        log_str = "Nearest to %s:" % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = "%s %s," % (log_str, close_word)
        print(log_str)
  final_embeddings = normalized_embeddings.eval()

# Step 7: Visualize the embeddings.

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
  """Scatter-plot 2-D points, annotate each with its label, and save to a file.

  Uses the module-level name `plt`, which is bound by the
  `import matplotlib.pyplot as plt` in the try-block below, so this must only
  be called after that import has succeeded.

  Args:
    low_dim_embs: Array whose row i holds the (x, y) position for labels[i].
    labels: Sequence of label strings; must not be longer than the number of
      rows in `low_dim_embs`.
    filename: Path the figure is saved to.
  """
  assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i,:]
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')

  plt.savefig(filename)

try:
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt

  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
  plot_only = 500
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
  # Row i of the embedding matrix belongs to word id i, so the labels must be
  # looked up by id; dict key order bears no relation to the ids.
  labels = [reverse_dictionary[i] for i in range(plot_only)]
  plot_with_labels(low_dim_embs, labels)

except ImportError:
  print("Please install sklearn and matplotlib to visualize embeddings.")