105 files changed, 2585 insertions, 1153 deletions
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index 9525df9a4b..f323d23972 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -18,7 +18,10 @@ If installed from binary pip package, provide:
 1. Which pip package you installed.
 2. The output from `python -c "import tensorflow; print(tensorflow.__version__)"`.
 
-If installed from sources, provide the commit hash:
+If installed from source, provide:
+
+1. The commit hash (`git rev-parse HEAD`).
+2. The output of `bazel version`.
 
 ### Steps to reproduce
 1.
diff --git a/eigen.BUILD b/eigen.BUILD
index fa054e426a..8e96413016 100644
--- a/eigen.BUILD
+++ b/eigen.BUILD
@@ -1,9 +1,8 @@
 package(default_visibility = ["//visibility:public"])
 
-archive_dir = "eigen-eigen-b4fa9622b809"
 cc_library(
     name = "eigen",
-    hdrs = glob([archive_dir+"/**/*.h", archive_dir+"/unsupported/Eigen/*", archive_dir+"/unsupported/Eigen/CXX11/*", archive_dir+"/Eigen/*"]),
-    includes = [ archive_dir ],
+    hdrs = glob(["**/*.h", "unsupported/Eigen/*", "unsupported/Eigen/CXX11/*", "Eigen/*"]),
+    includes = [ '.' ],
     visibility = ["//visibility:public"],
 )
diff --git a/gif.BUILD b/gif.BUILD
new file mode 100644
index 0000000000..8dbea9cc41
--- /dev/null
+++ b/gif.BUILD
@@ -0,0 +1,23 @@
+# Builds the giflib 5.1.4 C sources unpacked under giflib-5.1.4/lib.
+prefix_dir = "giflib-5.1.4/lib"
+
+# Library translation units, named relative to prefix_dir.
+SOURCES = [
+    "dgif_lib.c",
+    "egif_lib.c",
+    "gif_font.c",
+    "gif_hash.c",
+    "gifalloc.c",
+    "openbsd-reallocarray.c",
+    "gif_err.c",
+    "quantize.c",
+]
+
+cc_library(
+    name = "gif",
+    srcs = [prefix_dir + "/" + src for src in SOURCES],
+    hdrs = [prefix_dir + "/gif_lib.h"],
+    includes = [prefix_dir],
+    defines = ["HAVE_CONFIG_H"],
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/cmake/external/eigen.cmake b/tensorflow/contrib/cmake/external/eigen.cmake
index 156d93affb..78c49ea96e 100644
--- a/tensorflow/contrib/cmake/external/eigen.cmake
+++ b/tensorflow/contrib/cmake/external/eigen.cmake
@@ -7,16 +7,30 @@
 
 include (ExternalProject)
 
-set(eigen_archive_hash "b4fa9622b809")
+# Parse the Eigen version and archive SHA256 from the Bazel configuration; the
+# first matching line wins, and a missing value aborts the configure step.
+file(STRINGS "${PROJECT_SOURCE_DIR}/../../workspace.bzl" workspace_contents)
+foreach(key version sha256)
+    foreach(line ${workspace_contents})
+        string(REGEX MATCH ".*eigen_${key}.*=.*\"(.*)\"" has_match "${line}")
+        if(has_match)
+            set(eigen_${key} "${CMAKE_MATCH_1}")
+            break()
+        endif()
+    endforeach()
+    if(NOT eigen_${key})
+        message(FATAL_ERROR "eigen_${key} not found in workspace.bzl")
+    endif()
+endforeach()
+set(eigen_hash "${eigen_sha256}")
 
 set(eigen_INCLUDE_DIRS
     ${CMAKE_CURRENT_BINARY_DIR}
     ${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive
-    ${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/eigen-eigen-${eigen_archive_hash}
     ${tensorflow_source_dir}/third_party/eigen3
 )
-set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
-set(eigen_HASH SHA256=2862840c2de9c0473a4ef20f8678949ae89ab25965352ee53329e63ba46cec62)
+set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_version}.tar.gz)
+set(eigen_HASH SHA256=${eigen_hash})
 set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
 set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)
 
@@ -30,5 +44,5 @@ ExternalProject_Add(eigen
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DCMAKE_INSTALL_PREFIX:STRING=${eigen_INSTALL}
-        -DINCLUDE_INSTALL_DIR:STRING=${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/eigen-eigen-${eigen_archive_hash}
+        -DINCLUDE_INSTALL_DIR:STRING=${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive
 )
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index cb5173ce2c..99227a0442 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -55,12 +55,8 @@ class KMeansClustering(estimator.Estimator,
                distance_metric=clustering_ops.SQUARED_EUCLIDEAN_DISTANCE,
                random_seed=0,
                use_mini_batch=True,
-               batch_size=128,
-               steps=10,
                kmeans_plus_plus_num_retries=2,
-               continue_training=False,
-               config=None,
-               verbose=1):
+               config=None):
     """Creates a model for running KMeans training and inference.
 
     Args:
@@ -73,25 +69,17 @@ class KMeansClustering(estimator.Estimator,
       random_seed: Python integer. Seed for PRNG used to initialize centers.
       use_mini_batch: If true, use the mini-batch k-means algorithm. Else assume
         full batch.
-      batch_size: See TensorFlowEstimator
-      steps: See TensorFlowEstimator
       kmeans_plus_plus_num_retries: For each point that is sampled during
         kmeans++ initialization, this parameter specifies the number of
         additional points to draw from the current distribution before selecting
         the best. If a negative value is specified, a heuristic is used to
         sample O(log(num_to_sample)) additional points.
-      continue_training: See TensorFlowEstimator
-      config: See TensorFlowEstimator
-      verbose: See TensorFlowEstimator
+      config: See Estimator
     """
     super(KMeansClustering, self).__init__(
         model_dir=model_dir,
         config=config)
-    self.batch_size = batch_size
-    self.steps = steps
     self.kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
-    self.continue_training = continue_training
-    self.verbose = verbose
     self._num_clusters = num_clusters
     self._training_initial_clusters = initial_clusters
     self._training_graph = None
@@ -135,11 +123,11 @@ class KMeansClustering(estimator.Estimator,
       return relative_change < self._tolerance
 # pylint: enable=protected-access
 
-  def fit(self, x, y=None, monitors=None, logdir=None, steps=None,
+  def fit(self, x, y=None, monitors=None, logdir=None, steps=None, batch_size=128,
           relative_tolerance=None):
     """Trains a k-means clustering on x.
 
-    Note: See TensorFlowEstimator for logic for continuous training and graph
+    Note: See Estimator for logic for continuous training and graph
       construction across multiple calls to fit.
 
     Args:
@@ -151,6 +139,7 @@ class KMeansClustering(estimator.Estimator,
         visualization.
       steps: number of training steps. If not None, overrides the value passed
         in constructor.
+      batch_size: mini-batch size to use. Only used when `use_mini_batch=True`.
       relative_tolerance: A relative tolerance of change in the loss between
         iterations.  Stops learning if the loss changes less than this amount.
         Note that this may not work correctly if use_mini_batch=True.
@@ -162,7 +151,7 @@ class KMeansClustering(estimator.Estimator,
     if logdir is not None:
       self._model_dir = logdir
     self._data_feeder = data_feeder.setup_train_data_feeder(
-        x, None, self._num_clusters, self.batch_size)
+        x, None, self._num_clusters, batch_size if self._use_mini_batch else None)
     if relative_tolerance is not None:
       if monitors is not None:
         monitors += [self._StopWhenConverged(relative_tolerance)]
@@ -173,7 +162,7 @@ class KMeansClustering(estimator.Estimator,
             or (self.steps is not None))
     self._train_model(input_fn=self._data_feeder.input_builder,
                       feed_fn=self._data_feeder.get_feed_dict_fn(),
-                      steps=steps or self.steps,
+                      steps=steps,
                       monitors=monitors,
                       init_feed_fn=self._data_feeder.get_feed_dict_fn())
     return self
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans_test.py b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
index 1b9c533f58..bc706453c1 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
@@ -53,13 +53,14 @@ class KMeansTest(tf.test.TestCase):
 
     self.kmeans = KMeans(self.num_centers,
                          initial_clusters=kmeans_ops.RANDOM_INIT,
-                         batch_size=self.batch_size,
                          use_mini_batch=self.use_mini_batch,
-                         steps=30,
-                         continue_training=True,
-                         config=run_config.RunConfig(tf_random_seed=14),
+                         config=self.config(14),
                          random_seed=12)
 
+  @staticmethod
+  def config(tf_random_seed):
+    return run_config.RunConfig(tf_random_seed=tf_random_seed)
+
   @property
   def batch_size(self):
     return self.num_points
@@ -86,7 +87,7 @@ class KMeansTest(tf.test.TestCase):
 
   def test_clusters(self):
     kmeans = self.kmeans
-    kmeans.fit(x=self.points, steps=0)
+    kmeans.fit(x=self.points, steps=1, batch_size=8)
     clusters = kmeans.clusters()
     self.assertAllEqual(list(clusters.shape),
                         [self.num_centers, self.num_dims])
@@ -97,10 +98,11 @@ class KMeansTest(tf.test.TestCase):
       return
     kmeans = self.kmeans
     kmeans.fit(x=self.points,
-               steps=1)
+               steps=1, batch_size=self.batch_size)
     score1 = kmeans.score(x=self.points)
     kmeans.fit(x=self.points,
-               steps=15 * self.num_points // self.batch_size)
+               steps=15 * self.num_points // self.batch_size,
+               batch_size=self.batch_size)
     score2 = kmeans.score(x=self.points)
     self.assertTrue(score1 > score2)
     self.assertNear(self.true_score, score2, self.true_score * 0.05)
@@ -111,39 +113,36 @@ class KMeansTest(tf.test.TestCase):
       return
     kmeans = KMeans(self.num_centers,
                     initial_clusters=kmeans_ops.RANDOM_INIT,
-                    batch_size=self.batch_size,
                     use_mini_batch=self.use_mini_batch,
-                    # Force it to train forever until the monitor stops it.
-                    steps=None,
-                    continue_training=True,
                     config=run_config.RunConfig(tf_random_seed=14),
                     random_seed=12)
 
     kmeans.fit(x=self.points,
                # Force it to train forever until the monitor stops it.
                steps=None,
+               batch_size=self.batch_size,
                relative_tolerance=1e-4)
     score = kmeans.score(x=self.points)
     self.assertNear(self.true_score, score, self.true_score * 0.005)
 
   def test_infer(self):
     kmeans = self.kmeans
-    kmeans.fit(x=self.points)
+    kmeans.fit(x=self.points, steps=10, batch_size=128)
     clusters = kmeans.clusters()
 
     # Make a small test set
     points, true_assignments, true_offsets = self.make_random_points(clusters,
                                                                      10)
     # Test predict
-    assignments = kmeans.predict(points)
+    assignments = kmeans.predict(points, batch_size=self.batch_size)
     self.assertAllEqual(assignments, true_assignments)
 
     # Test score
-    score = kmeans.score(points)
+    score = kmeans.score(points, batch_size=128)
     self.assertNear(score, np.sum(true_offsets), 0.01 * score)
 
     # Test transform
-    transform = kmeans.transform(points)
+    transform = kmeans.transform(points, batch_size=128)
     true_transform = np.maximum(
         0,
         np.sum(np.square(points), axis=1, keepdims=True) -
@@ -161,12 +160,9 @@ class KMeansTest(tf.test.TestCase):
                     initial_clusters=kmeans_ops.RANDOM_INIT,
                     distance_metric=kmeans_ops.COSINE_DISTANCE,
                     use_mini_batch=self.use_mini_batch,
-                    batch_size=4,
-                    steps=30,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=2),
+                    config=self.config(2),
                     random_seed=12)
-    kmeans.fit(x=points)
+    kmeans.fit(x=points, steps=10, batch_size=4)
     centers = normalize(kmeans.clusters())
     self.assertAllClose(np.sort(centers, axis=0),
                         np.sort(true_centers, axis=0))
@@ -184,10 +180,8 @@ class KMeansTest(tf.test.TestCase):
                     initial_clusters=kmeans_ops.RANDOM_INIT,
                     distance_metric=kmeans_ops.COSINE_DISTANCE,
                     use_mini_batch=self.use_mini_batch,
-                    batch_size=8,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=3))
-    kmeans.fit(x=points, steps=30)
+                    config=self.config(3))
+    kmeans.fit(x=points, steps=30, batch_size=8)
 
     centers = normalize(kmeans.clusters())
     self.assertAllClose(np.sort(centers, axis=0),
@@ -195,7 +189,7 @@ class KMeansTest(tf.test.TestCase):
                         atol=1e-2)
 
     true_transform = 1 - cosine_similarity(points, centers)
-    transform = kmeans.transform(points)
+    transform = kmeans.transform(points, batch_size=8)
     self.assertAllClose(transform, true_transform, atol=1e-3)
 
   def test_predict_with_cosine_distance(self):
@@ -217,20 +211,18 @@ class KMeansTest(tf.test.TestCase):
                     initial_clusters=kmeans_ops.RANDOM_INIT,
                     distance_metric=kmeans_ops.COSINE_DISTANCE,
                     use_mini_batch=self.use_mini_batch,
-                    batch_size=8,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=3))
-    kmeans.fit(x=points, steps=30)
+                    config=self.config(3))
+    kmeans.fit(x=points, steps=30, batch_size=8)
 
     centers = normalize(kmeans.clusters())
     self.assertAllClose(np.sort(centers, axis=0),
                         np.sort(true_centers, axis=0), atol=1e-2)
 
-    assignments = kmeans.predict(points)
+    assignments = kmeans.predict(points, batch_size=8)
     self.assertAllClose(centers[assignments],
                         true_centers[true_assignments], atol=1e-2)
 
-    score = kmeans.score(points)
+    score = kmeans.score(points, batch_size=8)
     self.assertAllClose(score, true_score, atol=1e-2)
 
   def test_predict_with_cosine_distance_and_kmeans_plus_plus(self):
@@ -254,21 +246,19 @@ class KMeansTest(tf.test.TestCase):
                     initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT,
                     distance_metric=kmeans_ops.COSINE_DISTANCE,
                     use_mini_batch=self.use_mini_batch,
-                    batch_size=12,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=3))
-    kmeans.fit(x=points, steps=30)
+                    config=self.config(3))
+    kmeans.fit(x=points, steps=30, batch_size=12)
 
     centers = normalize(kmeans.clusters())
     self.assertAllClose(sorted(centers.tolist()),
                         sorted(true_centers.tolist()),
                         atol=1e-2)
 
-    assignments = kmeans.predict(points)
+    assignments = kmeans.predict(points, batch_size=12)
     self.assertAllClose(centers[assignments],
                         true_centers[true_assignments], atol=1e-2)
 
-    score = kmeans.score(points)
+    score = kmeans.score(points, batch_size=12)
     self.assertAllClose(score, true_score, atol=1e-2)
 
   def test_fit_raise_if_num_clusters_larger_than_num_points_random_init(self):
@@ -276,7 +266,7 @@ class KMeansTest(tf.test.TestCase):
 
     with self.assertRaisesOpError('less'):
       kmeans = KMeans(num_clusters=3, initial_clusters=kmeans_ops.RANDOM_INIT)
-      kmeans.fit(x=points)
+      kmeans.fit(x=points, steps=10, batch_size=8)
 
   def test_fit_raise_if_num_clusters_larger_than_num_points_kmeans_plus_plus(
       self):
@@ -285,7 +275,7 @@ class KMeansTest(tf.test.TestCase):
     with self.assertRaisesOpError(AssertionError):
       kmeans = KMeans(num_clusters=3,
                       initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT)
-      kmeans.fit(x=points)
+      kmeans.fit(x=points, steps=10, batch_size=8)
 
 
 class MiniBatchKMeansTest(KMeansTest):
diff --git a/tensorflow/contrib/ios_examples/README.md b/tensorflow/contrib/ios_examples/README.md
index 88c395395c..1c29c74e51 100644
--- a/tensorflow/contrib/ios_examples/README.md
+++ b/tensorflow/contrib/ios_examples/README.md
@@ -72,5 +72,14 @@ rundown:
    unused because no other code references the variables, but in fact their
    constructors have the important side effect of registering the class.
  
+ - C++11 support (or later) should be enabled by setting `C++ Language Dialect` to
+   `GNU++11` (or `GNU++14`), and `C++ Standard Library` to `libc++`.
+ 
  - The library doesn't currently support bitcode, so you'll need to disable that
    in your project settings.
+
+ - Remove any use of the `-all_load` flag in your project. The protocol buffers
+   libraries (full and lite versions) contain duplicate symbols, and the `-all_load`
+   flag will cause these duplicates to become link errors. If you were using
+   `-all_load` to avoid issues with Objective-C categories in static libraries,
+   you may be able to replace it with the `-ObjC` flag.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/classifier.py b/tensorflow/contrib/learn/python/learn/estimators/classifier.py
index bd23e61988..5def0b6e45 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/classifier.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/classifier.py
@@ -47,7 +47,9 @@ class Classifier(estimator.Estimator):
     Args:
       model_fn: (targets, predictions, mode) -> logits, loss, train_op
       n_classes: Number of classes
-      model_dir: Base directory for output data
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
       config: Configuration object (optional)
     """
     self._n_classes = n_classes
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index cb1fd39e03..91eb3a57ac 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -119,7 +119,9 @@ class DNNClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
       feature_columns: An iterable containing all the feature columns used by
         the model. All items in the set should be instances of classes derived
         from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
       n_classes: number of target classes. Default is binary classification.
         It must be greater than 1.
       weight_column_name: A string defining feature column name representing
@@ -277,7 +279,9 @@ class DNNRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
       feature_columns: An iterable containing all the feature columns used by
         the model. All items in the set should be instances of classes derived
         from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
         will be multiplied by the loss of the example.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
index d04ec0ecf3..d43de0d8b8 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
@@ -72,7 +72,9 @@ class _DNNLinearCombinedBaseEstimator(estimator.BaseEstimator):
 
     Args:
       target_column: A _TargetColumn object.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
       linear_feature_columns: An iterable containing all the feature columns
         used by linear part of the model. All items in the set should be
         instances of classes derived from `FeatureColumn`.
@@ -354,7 +356,9 @@ class DNNLinearCombinedClassifier(_DNNLinearCombinedBaseEstimator):
     """Constructs a DNNLinearCombinedClassifier instance.
 
     Args:
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
       n_classes: number of target classes. Default is binary classification.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training.
@@ -537,7 +541,9 @@ class DNNLinearCombinedRegressor(_DNNLinearCombinedBaseEstimator):
     """Initializes a DNNLinearCombinedRegressor instance.
 
     Args:
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
         will be multiplied by the loss of the example.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 280bafaf4c..b1f6130f8c 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -158,7 +158,9 @@ class BaseEstimator(sklearn.BaseEstimator):
     """Initializes a BaseEstimator instance.
 
     Args:
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
       config: A RunConfig instance.
     """
     # Model directory.
@@ -766,7 +768,9 @@ class Estimator(BaseEstimator):
                  is passed to Estimator in `params` parameter. This allows
                  to configure Estimators from hyper parameter tunning.
 
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
       config: Configuration object.
       params: `dict` of hyper parameters that will be passed into `model_fn`.
               Keys are names of parameters, values are basic python types.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index 4eb387eea1..f025fc0941 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -122,7 +122,9 @@ class LinearClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
       feature_columns: An iterable containing all the feature columns used by
         the model. All items in the set should be instances of classes derived
         from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
       n_classes: number of target classes. Default is binary classification.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
@@ -280,7 +282,9 @@ class LinearRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
       feature_columns: An iterable containing all the feature columns used by
         the model. All items in the set should be instances of classes derived
         from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph, etc.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
         will be multiplied by the loss of the example.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
index d4e0bb6283..cafdb980c5 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
@@ -57,7 +57,9 @@ class LogisticRegressor(estimator.Estimator):
         expects the returned predictions to be probabilities in [0.0, 1.0].
       thresholds: List of floating point thresholds to use for accuracy,
         precision, and recall metrics. If None, defaults to [0.5].
-      model_dir: Directory to save model parameters, graphs, etc.
+      model_dir: Directory to save model parameters, graphs, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
       config: A RunConfig configuration object.
     """
     if thresholds is None:
diff --git a/tensorflow/contrib/learn/python/learn/estimators/random_forest.py b/tensorflow/contrib/learn/python/learn/estimators/random_forest.py
index 4aa090ee4d..ec70453163 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/random_forest.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/random_forest.py
@@ -69,8 +69,7 @@ class TensorForestEstimator(estimator.BaseEstimator):
   def __init__(self, params, device_assigner=None, model_dir=None,
                graph_builder_class=tensor_forest.RandomForestGraphs,
                master='', accuracy_metric=None,
-               tf_random_seed=None, continue_training=False, verbose=1,
-               max_to_keep=5, save_checkpoint_secs=300):
+               tf_random_seed=None, config=None):
     self.params = params.fill()
     self.accuracy_metric = (accuracy_metric or
                             ('r2' if self.params.regression else 'accuracy'))
@@ -81,12 +80,6 @@ class TensorForestEstimator(estimator.BaseEstimator):
     self.training_args = {}
     self.construction_args = {}
 
-    config = run_config.RunConfig(
-        master=master,
-        tf_random_seed=(tf_random_seed or int((time.time() * 1000) % 1000)),
-        save_checkpoints_secs=save_checkpoint_secs,
-        keep_checkpoint_max=max_to_keep)
-
     super(TensorForestEstimator, self).__init__(model_dir=model_dir,
                                                 config=config)
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/svm.py b/tensorflow/contrib/learn/python/learn/estimators/svm.py
index b8cac88e48..f646cdf477 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/svm.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/svm.py
@@ -74,7 +74,9 @@ class SVM(linear.LinearClassifier):
     weight_column_name: A string defining feature column name representing
       weights. It is used to down weight or boost examples during training. It
       will be multiplied by the loss of the example.
-    model_dir: Directory to save model parameters, graph and etc.
+    model_dir: Directory to save model parameters, graph, etc. This can also
+      be used to load checkpoints from the directory into an estimator to
+      continue training a previously saved model.
     l1_regularization: L1-regularization parameter
     l2_regularization: L2-regularization parameter
     kernels: A list of kernels for the SVM. Currently, no kernels are supported.
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 6bb1655186..d1e81f73ac 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -38,29 +38,29 @@ HOST_OBJDIR := $(MAKEFILE_DIR)/gen/host_obj/
 HOST_BINDIR := $(MAKEFILE_DIR)/gen/host_bin/
 HOST_GENDIR := $(MAKEFILE_DIR)/gen/host_obj/
 
-# Find the current Eigen version name from the Bazel build file
-EIGEN_HASH := $(shell cat eigen.BUILD | grep archive_dir | head -1 | cut -f3 -d- | cut -f1 -d\")
+# Find the current Eigen version from the Bazel configuration
+EIGEN_VERSION := $(shell grep eigen_version tensorflow/workspace.bzl | head -1 | sed -e 's/.*eigen_version.*=.*"\(.*\)"/\1/')
 
 # Settings for the host compiler.
 HOST_CXX := $(CC_PREFIX) gcc
 HOST_CXXFLAGS := --std=c++11
-HOST_LDOPTS := \
--L/usr/local/lib
-
+HOST_LDOPTS := 
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_LDOPTS += -L$(MAKEFILE_DIR)/gen/protobuf-host/lib
 endif
+HOST_LDOPTS += -L/usr/local/lib
 
 HOST_INCLUDES := \
--I/usr/local/include \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
--I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_HASH) \
+-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
 -I$(HOST_GENDIR)
-
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
 endif
+# This is at the end so any globally-installed frameworks like protobuf don't
+# override local versions in the source tree.
+HOST_INCLUDES += -I/usr/local/include
 
 HOST_LIBS := \
 -lstdc++ \
@@ -120,21 +120,18 @@ CXXFLAGS := --std=c++11 -DIS_SLIM_BUILD $(OPTFLAGS)
 LDFLAGS := \
 -L/usr/local/lib
 
-ifeq ($(HAS_GEN_HOST_PROTOC),true)
-	HOST_LDOPTS += -L$(MAKEFILE_DIR)/gen/protobuf-host/lib
-endif
-
 INCLUDES := \
--I/usr/local/include \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
--I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_HASH) \
+-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
-
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
 endif
+# This is at the end so any globally-installed frameworks like protobuf don't
+# override local versions in the source tree.
+INCLUDES += -I/usr/local/include
 
 LIBS := \
 -lstdc++ \
@@ -211,7 +208,7 @@ ifeq ($(TARGET),ANDROID)
 -I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi/include \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
--I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_HASH) \
+-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
 -I$(MAKEFILE_DIR)/gen/protobuf/include \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
@@ -364,7 +361,52 @@ BENCHMARK_NAME := $(BINDIR)benchmark
 
 # What sources we want to compile, derived from the main Bazel build using the
 # gen_file_lists.sh script.
-TF_CC_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_cc_files.txt)
+
+CORE_CC_ALL_SRCS := \
+$(wildcard tensorflow/core/*.cc) \
+$(wildcard tensorflow/core/common_runtime/*.cc) \
+$(wildcard tensorflow/core/debug/*.cc) \
+$(wildcard tensorflow/core/framework/*.cc) \
+$(wildcard tensorflow/core/graph/*.cc) \
+$(wildcard tensorflow/core/lib/*/*.cc) \
+$(wildcard tensorflow/core/platform/*.cc) \
+$(wildcard tensorflow/core/platform/*/*.cc) \
+$(wildcard tensorflow/core/util/*.cc) \
+$(wildcard tensorflow/core/util/*/*.cc)
+CORE_CC_EXCLUDE_SRCS := \
+$(wildcard tensorflow/core/*/*test.cc) \
+$(wildcard tensorflow/core/*/*testutil*) \
+$(wildcard tensorflow/core/*/*testlib*) \
+$(wildcard tensorflow/core/*/*main.cc) \
+$(wildcard tensorflow/core/*/*/*test.cc) \
+$(wildcard tensorflow/core/*/*/*testutil*) \
+$(wildcard tensorflow/core/*/*/*testlib*) \
+$(wildcard tensorflow/core/*/*/*main.cc) \
+$(wildcard tensorflow/core/graph/dot.*) \
+$(wildcard tensorflow/core/lib/gif/*) \
+$(wildcard tensorflow/core/lib/jpeg/*) \
+$(wildcard tensorflow/core/lib/png/*) \
+$(wildcard tensorflow/core/util/checkpoint_reader.*) \
+$(wildcard tensorflow/core/util/events_writer.*) \
+$(wildcard tensorflow/core/util/reporter.*) \
+$(wildcard tensorflow/core/util/tf_status_helper.*) \
+$(wildcard tensorflow/core/platform/default/stream_executor.*) \
+$(wildcard tensorflow/core/platform/default/test_benchmark.*) \
+$(wildcard tensorflow/core/platform/cuda.h) \
+$(wildcard tensorflow/core/platform/cloud/*) \
+$(wildcard tensorflow/core/platform/google/*) \
+$(wildcard tensorflow/core/platform/jpeg.*) \
+$(wildcard tensorflow/core/platform/png.*) \
+$(wildcard tensorflow/core/platform/stream_executor.*) \
+$(wildcard tensorflow/core/user_ops/*.cu.cc) \
+$(wildcard tensorflow/core/common_runtime/gpu/*) \
+$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*)
+# Filter out all the excluded files.
+TF_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
+# Add in extra files from subdirectories the single-level wildcards above miss.
+TF_CC_SRCS += tensorflow/core/common_runtime/gpu/gpu_tracer.cc
+# Also include the op and kernel definitions.
+TF_CC_SRCS += $(shell cat $(MAKEFILE_DIR)/tf_op_files.txt)
 PBT_CC_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_pb_text_files.txt)
 PROTO_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_proto_files.txt)
 BENCHMARK_SRCS := \
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index dff9373c10..52bb158c2a 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -16,15 +16,15 @@ This static library will not contain:
  - Python or other language bindings
  - GPU support
  
- You can target:
- - iOS
- - OS X (macOS)
- - Android
- - Raspberry-PI
+You can target:
+- iOS
+- OS X (macOS)
+- Android
+- Raspberry-PI
  
- You will compile tensorflow and protobuf libraries that you can link into other
- applications.  You will also compile the [benchmark](../../tools/benchmark/)
- application that will let you check your application.
+You will compile tensorflow and protobuf libraries that you can link into other
+applications.  You will also compile the [benchmark](../../tools/benchmark/)
+application that will let you check your application.
  
 ## Before you start (all platforms)
 
@@ -176,15 +176,16 @@ curl -o ~/graphs/inception.zip \
 
 ### Building all at once
 
-If you just want to get the libraries compiled in a hurry, you can run:
+If you just want to get the libraries compiled in a hurry, you can run this
+from the root of your TensorFlow source folder:
 
 ```bash
-build_all_ios.sh
+tensorflow/contrib/makefile/build_all_ios.sh
 ```
 
-and wait a long time.
+This process will take around twenty minutes on a modern MacBook Pro.
 
-When this completes, you will have a library for a single architecture and the
+When it completes, you will have a library for a single architecture and the
 benchmark program. Although successfully compiling the benchmark program is a
 sign of success, the program is not a complete iOS app.
 
@@ -284,6 +285,17 @@ make -f tensorflow/contrib/makefile/Makefile HOST_OS=PI TARGET=PI \
  OPTFLAGS="-Os -mfpu=neon-vfpv4 -funsafe-math-optimizations -ftree-vectorize"
 ```
 
+If you hit compilation errors mentioning `__atomic_compare_exchange` and you're
+using gcc 4.9, you should try installing gcc 4.8 and using that instead:
+
+```bash
+sudo apt-get install -y gcc-4.8 g++-4.8
+make -f tensorflow/contrib/makefile/Makefile HOST_OS=PI TARGET=PI \
+OPTFLAGS="-Os -mfpu=neon-vfpv4 -funsafe-math-optimizations -ftree-vectorize" \
+CXX=g++-4.8
+```
+
+
 # Other notes
 
 ## Supported Systems
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 0596b7ddf4..8ab7aad270 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -1,4 +1,4 @@
-#!/bin/bash -x
+#!/bin/bash -ex
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,11 +15,22 @@
 # ==============================================================================
 
 DOWNLOADS_DIR=tensorflow/contrib/makefile/downloads
+BZL_FILE_PATH=tensorflow/workspace.bzl
 
-mkdir ${DOWNLOADS_DIR}
+mkdir -p ${DOWNLOADS_DIR}
 
 # Grab the current Eigen version name from the Bazel build file
-EIGEN_HASH=$(cat eigen.BUILD | grep archive_dir | head -1 | cut -f3 -d- | cut -f1 -d\")
+EIGEN_HASH=$(grep -E "eigen_version.*=.*\".*\"" "${BZL_FILE_PATH}" | head -1 | awk '{ print $3 }')
+# awk's third field is still wrapped in double quotes; strip one from each end.
+EIGEN_HASH="${EIGEN_HASH%\"}"
+EIGEN_HASH="${EIGEN_HASH#\"}"
+
+if [[ -z "${EIGEN_HASH}" ]]; then
+    echo >&2 "Eigen hash does not exist."
+    exit 1
+else
+    echo "Eigen hash = ${EIGEN_HASH}"
+fi
 
 curl "https://bitbucket.org/eigen/eigen/get/${EIGEN_HASH}.tar.gz" \
 -o /tmp/eigen-${EIGEN_HASH}.tar.gz
@@ -34,3 +45,5 @@ git clone https://github.com/google/protobuf.git ${DOWNLOADS_DIR}/protobuf
 cd ${DOWNLOADS_DIR}
 rm -rf eigen-latest
 ln -s eigen-eigen-${EIGEN_HASH} eigen-latest
+
+echo "download_dependencies.sh completed successfully."
diff --git a/tensorflow/contrib/makefile/gen_file_lists.sh b/tensorflow/contrib/makefile/gen_file_lists.sh
index 2bbc6bfcae..68a6fdf909 100755
--- a/tensorflow/contrib/makefile/gen_file_lists.sh
+++ b/tensorflow/contrib/makefile/gen_file_lists.sh
@@ -17,16 +17,6 @@
 # the master Bazel build configuration.
 
 bazel query 'kind("source file", deps(//tensorflow/core:android_tensorflow_lib))' | \
-grep "//tensorflow/.*\.cc$" | \
-grep -v "gen_proto_text" | \
-grep -E -v "jpeg" | \
-grep -E -v "png" | \
-grep -E -v "zlib" | \
-sed -E 's#^//##g' | \
-sed -E 's#:#/#g' \
-> tensorflow/contrib/makefile/tf_cc_files.txt
-
-bazel query 'kind("source file", deps(//tensorflow/core:android_tensorflow_lib))' | \
 grep "//tensorflow/.*\.proto$" | \
 sed -E 's#^//##g' | \
 sed -E 's#:#/#g' \
diff --git a/tensorflow/contrib/makefile/tf_cc_files.txt b/tensorflow/contrib/makefile/tf_cc_files.txt
deleted file mode 100644
index 1642074484..0000000000
--- a/tensorflow/contrib/makefile/tf_cc_files.txt
+++ /dev/null
@@ -1,264 +0,0 @@
-tensorflow/core/kernels/xent_op.cc
-tensorflow/core/kernels/where_op.cc
-tensorflow/core/kernels/variable_ops.cc
-tensorflow/core/kernels/unpack_op.cc
-tensorflow/core/kernels/transpose_op.cc
-tensorflow/core/kernels/transpose_functor_cpu.cc
-tensorflow/core/kernels/training_ops.cc
-tensorflow/core/kernels/topk_op.cc
-tensorflow/core/kernels/tile_ops.cc
-tensorflow/core/kernels/strided_slice_op.cc
-tensorflow/core/kernels/stack_ops.cc
-tensorflow/core/kernels/split_op.cc
-tensorflow/core/kernels/split_lib_cpu.cc
-tensorflow/core/kernels/sparse_to_dense_op.cc
-tensorflow/core/kernels/softsign_op.cc
-tensorflow/core/kernels/softplus_op.cc
-tensorflow/core/kernels/softmax_op.cc
-tensorflow/core/kernels/slice_op.cc
-tensorflow/core/kernels/shape_ops.cc
-tensorflow/core/kernels/session_ops.cc
-tensorflow/core/kernels/sequence_ops.cc
-tensorflow/core/kernels/sendrecv_ops.cc
-tensorflow/core/kernels/save_restore_tensor.cc
-tensorflow/core/kernels/save_op.cc
-tensorflow/core/kernels/reverse_sequence_op.cc
-tensorflow/core/kernels/reverse_op.cc
-tensorflow/core/kernels/restore_op.cc
-tensorflow/core/kernels/resize_nearest_neighbor_op.cc
-tensorflow/core/kernels/resize_bilinear_op.cc
-tensorflow/core/kernels/reshape_op.cc
-tensorflow/core/kernels/relu_op.cc
-tensorflow/core/kernels/reduction_ops_sum.cc
-tensorflow/core/kernels/reduction_ops_prod.cc
-tensorflow/core/kernels/reduction_ops_min.cc
-tensorflow/core/kernels/reduction_ops_mean.cc
-tensorflow/core/kernels/reduction_ops_max.cc
-tensorflow/core/kernels/reduction_ops_common.cc
-tensorflow/core/kernels/pooling_ops_common.cc
-tensorflow/core/kernels/pad_op.cc
-tensorflow/core/kernels/pack_op.cc
-tensorflow/core/kernels/ops_util.cc
-tensorflow/core/kernels/no_op.cc
-tensorflow/core/kernels/maxpooling_op.cc
-tensorflow/core/kernels/matmul_op.cc
-tensorflow/core/kernels/lrn_op.cc
-tensorflow/core/kernels/in_topk_op.cc
-tensorflow/core/kernels/immutable_constant_op.cc
-tensorflow/core/kernels/identity_op.cc
-tensorflow/core/kernels/gather_op.cc
-tensorflow/core/kernels/fill_functor.cc
-tensorflow/core/kernels/example_parsing_ops.cc
-tensorflow/core/kernels/dynamic_stitch_op.cc
-tensorflow/core/kernels/dynamic_partition_op.cc
-tensorflow/core/kernels/dense_update_ops.cc
-tensorflow/core/kernels/cwise_ops_common.cc
-tensorflow/core/kernels/cwise_op_tanh.cc
-tensorflow/core/kernels/cwise_op_sub.cc
-tensorflow/core/kernels/cwise_op_squared_difference.cc
-tensorflow/core/kernels/cwise_op_square.cc
-tensorflow/core/kernels/cwise_op_sqrt.cc
-tensorflow/core/kernels/cwise_op_sigmoid.cc
-tensorflow/core/kernels/cwise_op_select.cc
-tensorflow/core/kernels/cwise_op_rsqrt.cc
-tensorflow/core/kernels/cwise_op_neg.cc
-tensorflow/core/kernels/cwise_op_mul.cc
-tensorflow/core/kernels/cwise_op_minimum.cc
-tensorflow/core/kernels/cwise_op_maximum.cc
-tensorflow/core/kernels/cwise_op_log.cc
-tensorflow/core/kernels/cwise_op_less.cc
-tensorflow/core/kernels/cwise_op_isfinite.cc
-tensorflow/core/kernels/cwise_op_inverse.cc
-tensorflow/core/kernels/cwise_op_greater.cc
-tensorflow/core/kernels/cwise_op_exp.cc
-tensorflow/core/kernels/cwise_op_equal_to.cc
-tensorflow/core/kernels/cwise_op_div.cc
-tensorflow/core/kernels/cwise_op_add.cc
-tensorflow/core/kernels/ctc_decoder_ops.cc
-tensorflow/core/kernels/conv_ops.cc
-tensorflow/core/kernels/conv_grad_ops.cc
-tensorflow/core/kernels/control_flow_ops.cc
-tensorflow/core/kernels/constant_op.cc
-tensorflow/core/kernels/concat_op.cc
-tensorflow/core/kernels/concat_lib_cpu.cc
-tensorflow/core/kernels/check_numerics_op.cc
-tensorflow/core/kernels/cast_op.cc
-tensorflow/core/kernels/bias_op.cc
-tensorflow/core/kernels/bcast_ops.cc
-tensorflow/core/kernels/batch_norm_op.cc
-tensorflow/core/kernels/avgpooling_op.cc
-tensorflow/core/kernels/argmax_op.cc
-tensorflow/core/kernels/aggregate_ops.cc
-tensorflow/core/util/work_sharder.cc
-tensorflow/core/util/util.cc
-tensorflow/core/util/use_cudnn.cc
-tensorflow/core/util/tensor_slice_writer.cc
-tensorflow/core/util/tensor_slice_set.cc
-tensorflow/core/util/tensor_slice_reader_cache.cc
-tensorflow/core/util/tensor_slice_reader.cc
-tensorflow/core/util/tensor_format.cc
-tensorflow/core/util/stat_summarizer.cc
-tensorflow/core/util/sparse/group_iterator.cc
-tensorflow/core/util/saved_tensor_slice_util.cc
-tensorflow/core/util/port.cc
-tensorflow/core/util/padding.cc
-tensorflow/core/util/mirror_pad_mode.cc
-tensorflow/core/util/memmapped_file_system_writer.cc
-tensorflow/core/util/memmapped_file_system.cc
-tensorflow/core/util/guarded_philox_random.cc
-tensorflow/core/util/example_proto_helper.cc
-tensorflow/core/util/device_name_utils.cc
-tensorflow/core/util/command_line_flags.cc
-tensorflow/core/util/bcast.cc
-tensorflow/core/platform/tracing.cc
-tensorflow/core/platform/tensor_coding.cc
-tensorflow/core/platform/protobuf_util.cc
-tensorflow/core/platform/posix/posix_file_system.cc
-tensorflow/core/platform/posix/port.cc
-tensorflow/core/platform/posix/env.cc
-tensorflow/core/platform/load_library.cc
-tensorflow/core/platform/file_system.cc
-tensorflow/core/platform/env.cc
-tensorflow/core/platform/denormal.cc
-tensorflow/core/platform/default/tracing.cc
-tensorflow/core/platform/default/logging.cc
-tensorflow/core/ops/training_ops.cc
-tensorflow/core/ops/string_ops.cc
-tensorflow/core/ops/state_ops.cc
-tensorflow/core/ops/sparse_ops.cc
-tensorflow/core/ops/sendrecv_ops.cc
-tensorflow/core/ops/script_ops.cc
-tensorflow/core/ops/random_ops.cc
-tensorflow/core/ops/random_grad.cc
-tensorflow/core/ops/parsing_ops.cc
-tensorflow/core/ops/no_op.cc
-tensorflow/core/ops/nn_ops.cc
-tensorflow/core/ops/nn_grad.cc
-tensorflow/core/ops/math_ops.cc
-tensorflow/core/ops/math_grad.cc
-tensorflow/core/ops/logging_ops.cc
-tensorflow/core/ops/linalg_ops.cc
-tensorflow/core/ops/io_ops.cc
-tensorflow/core/ops/image_ops.cc
-tensorflow/core/ops/functional_ops.cc
-tensorflow/core/ops/functional_grad.cc
-tensorflow/core/ops/function_ops.cc
-tensorflow/core/ops/data_flow_ops.cc
-tensorflow/core/ops/ctc_ops.cc
-tensorflow/core/ops/control_flow_ops.cc
-tensorflow/core/ops/candidate_sampling_ops.cc
-tensorflow/core/ops/array_ops.cc
-tensorflow/core/ops/array_grad.cc
-tensorflow/core/lib/wav/wav_io.cc
-tensorflow/core/lib/strings/stringprintf.cc
-tensorflow/core/lib/strings/strcat.cc
-tensorflow/core/lib/strings/str_util.cc
-tensorflow/core/lib/strings/scanner.cc
-tensorflow/core/lib/strings/proto_text_util.cc
-tensorflow/core/lib/strings/ordered_code.cc
-tensorflow/core/lib/strings/numbers.cc
-tensorflow/core/lib/random/weighted_picker.cc
-tensorflow/core/lib/random/simple_philox.cc
-tensorflow/core/lib/random/random.cc
-tensorflow/core/lib/random/distribution_sampler.cc
-tensorflow/core/lib/io/two_level_iterator.cc
-tensorflow/core/lib/io/table_builder.cc
-tensorflow/core/lib/io/table.cc
-tensorflow/core/lib/io/record_writer.cc
-tensorflow/core/lib/io/record_reader.cc
-tensorflow/core/lib/io/path.cc
-tensorflow/core/lib/io/match.cc
-tensorflow/core/lib/io/iterator.cc
-tensorflow/core/lib/io/inputbuffer.cc
-tensorflow/core/lib/io/format.cc
-tensorflow/core/lib/io/block_builder.cc
-tensorflow/core/lib/io/block.cc
-tensorflow/core/lib/histogram/histogram.cc
-tensorflow/core/lib/hash/hash.cc
-tensorflow/core/lib/hash/crc32c.cc
-tensorflow/core/lib/core/threadpool.cc
-tensorflow/core/lib/core/stringpiece.cc
-tensorflow/core/lib/core/status.cc
-tensorflow/core/lib/core/coding.cc
-tensorflow/core/lib/core/arena.cc
-tensorflow/core/graph/validate.cc
-tensorflow/core/graph/tensor_id.cc
-tensorflow/core/graph/subgraph.cc
-tensorflow/core/graph/quantize_training.cc
-tensorflow/core/graph/optimizer_cse.cc
-tensorflow/core/graph/node_builder.cc
-tensorflow/core/graph/graph_partition.cc
-tensorflow/core/graph/graph_def_builder.cc
-tensorflow/core/graph/graph_constructor.cc
-tensorflow/core/graph/graph.cc
-tensorflow/core/graph/gradients.cc
-tensorflow/core/graph/equal_graph_def.cc
-tensorflow/core/graph/edgeset.cc
-tensorflow/core/graph/costmodel.cc
-tensorflow/core/graph/colors.cc
-tensorflow/core/graph/algorithm.cc
-tensorflow/core/framework/versions.cc
-tensorflow/core/framework/unique_tensor_references.cc
-tensorflow/core/framework/types.cc
-tensorflow/core/framework/tracking_allocator.cc
-tensorflow/core/framework/tensor_util.cc
-tensorflow/core/framework/tensor_slice.cc
-tensorflow/core/framework/tensor_shape.cc
-tensorflow/core/framework/tensor_reference.cc
-tensorflow/core/framework/tensor.cc
-tensorflow/core/framework/shape_inference.cc
-tensorflow/core/framework/resource_mgr.cc
-tensorflow/core/framework/rendezvous.cc
-tensorflow/core/framework/reader_op_kernel.cc
-tensorflow/core/framework/partial_tensor_shape.cc
-tensorflow/core/framework/op_segment.cc
-tensorflow/core/framework/op_kernel.cc
-tensorflow/core/framework/op_gen_lib.cc
-tensorflow/core/framework/op_def_util.cc
-tensorflow/core/framework/op_def_builder.cc
-tensorflow/core/framework/op.cc
-tensorflow/core/framework/node_def_util.cc
-tensorflow/core/framework/node_def_builder.cc
-tensorflow/core/framework/memory_types.cc
-tensorflow/core/framework/lookup_interface.cc
-tensorflow/core/framework/log_memory.cc
-tensorflow/core/framework/load_library.cc
-tensorflow/core/framework/kernel_def_builder.cc
-tensorflow/core/framework/graph_def_util.cc
-tensorflow/core/framework/function.cc
-tensorflow/core/framework/fake_input.cc
-tensorflow/core/framework/device_base.cc
-tensorflow/core/framework/common_shape_fns.cc
-tensorflow/core/framework/cancellation.cc
-tensorflow/core/framework/bfloat16.cc
-tensorflow/core/framework/attr_value_util.cc
-tensorflow/core/framework/allocator.cc
-tensorflow/core/common_runtime/threadpool_device_factory.cc
-tensorflow/core/common_runtime/threadpool_device.cc
-tensorflow/core/common_runtime/step_stats_collector.cc
-tensorflow/core/common_runtime/simple_placer.cc
-tensorflow/core/common_runtime/simple_graph_execution_state.cc
-tensorflow/core/common_runtime/session_state.cc
-tensorflow/core/common_runtime/session_options.cc
-tensorflow/core/common_runtime/session_factory.cc
-tensorflow/core/common_runtime/session.cc
-tensorflow/core/common_runtime/rendezvous_mgr.cc
-tensorflow/core/common_runtime/process_util.cc
-tensorflow/core/common_runtime/memory_types.cc
-tensorflow/core/common_runtime/local_device.cc
-tensorflow/core/common_runtime/graph_optimizer.cc
-tensorflow/core/common_runtime/gpu/gpu_tracer.cc
-tensorflow/core/common_runtime/function.cc
-tensorflow/core/common_runtime/executor.cc
-tensorflow/core/common_runtime/direct_session.cc
-tensorflow/core/common_runtime/device_set.cc
-tensorflow/core/common_runtime/device_mgr.cc
-tensorflow/core/common_runtime/device_factory.cc
-tensorflow/core/common_runtime/device.cc
-tensorflow/core/common_runtime/costmodel_manager.cc
-tensorflow/core/common_runtime/copy_tensor.cc
-tensorflow/core/common_runtime/constant_folding.cc
-tensorflow/core/common_runtime/build_graph_options.cc
-tensorflow/core/common_runtime/bfc_allocator.cc
-tensorflow/core/common_runtime/allocator_retry.cc
-tensorflow/core/client/tensor_c_api.cc
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
new file mode 100644
index 0000000000..098007b907
--- /dev/null
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -0,0 +1,124 @@
+tensorflow/core/kernels/xent_op.cc
+tensorflow/core/kernels/where_op.cc
+tensorflow/core/kernels/variable_ops.cc
+tensorflow/core/kernels/unpack_op.cc
+tensorflow/core/kernels/transpose_op.cc
+tensorflow/core/kernels/transpose_functor_cpu.cc
+tensorflow/core/kernels/training_ops.cc
+tensorflow/core/kernels/topk_op.cc
+tensorflow/core/kernels/tile_ops.cc
+tensorflow/core/kernels/strided_slice_op_inst_6.cc
+tensorflow/core/kernels/strided_slice_op_inst_5.cc
+tensorflow/core/kernels/strided_slice_op_inst_4.cc
+tensorflow/core/kernels/strided_slice_op_inst_3.cc
+tensorflow/core/kernels/strided_slice_op_inst_2.cc
+tensorflow/core/kernels/strided_slice_op_inst_1.cc
+tensorflow/core/kernels/strided_slice_op.cc
+tensorflow/core/kernels/stack_ops.cc
+tensorflow/core/kernels/split_op.cc
+tensorflow/core/kernels/split_lib_cpu.cc
+tensorflow/core/kernels/sparse_to_dense_op.cc
+tensorflow/core/kernels/softsign_op.cc
+tensorflow/core/kernels/softplus_op.cc
+tensorflow/core/kernels/softmax_op.cc
+tensorflow/core/kernels/slice_op.cc
+tensorflow/core/kernels/shape_ops.cc
+tensorflow/core/kernels/session_ops.cc
+tensorflow/core/kernels/sequence_ops.cc
+tensorflow/core/kernels/sendrecv_ops.cc
+tensorflow/core/kernels/save_restore_tensor.cc
+tensorflow/core/kernels/save_op.cc
+tensorflow/core/kernels/reverse_sequence_op.cc
+tensorflow/core/kernels/reverse_op.cc
+tensorflow/core/kernels/restore_op.cc
+tensorflow/core/kernels/resize_nearest_neighbor_op.cc
+tensorflow/core/kernels/resize_bilinear_op.cc
+tensorflow/core/kernels/reshape_op.cc
+tensorflow/core/kernels/relu_op.cc
+tensorflow/core/kernels/reduction_ops_sum.cc
+tensorflow/core/kernels/reduction_ops_prod.cc
+tensorflow/core/kernels/reduction_ops_min.cc
+tensorflow/core/kernels/reduction_ops_mean.cc
+tensorflow/core/kernels/reduction_ops_max.cc
+tensorflow/core/kernels/reduction_ops_common.cc
+tensorflow/core/kernels/pooling_ops_common.cc
+tensorflow/core/kernels/pad_op.cc
+tensorflow/core/kernels/pack_op.cc
+tensorflow/core/kernels/ops_util.cc
+tensorflow/core/kernels/no_op.cc
+tensorflow/core/kernels/maxpooling_op.cc
+tensorflow/core/kernels/matmul_op.cc
+tensorflow/core/kernels/lrn_op.cc
+tensorflow/core/kernels/in_topk_op.cc
+tensorflow/core/kernels/immutable_constant_op.cc
+tensorflow/core/kernels/identity_op.cc
+tensorflow/core/kernels/gather_op.cc
+tensorflow/core/kernels/fill_functor.cc
+tensorflow/core/kernels/example_parsing_ops.cc
+tensorflow/core/kernels/dynamic_stitch_op.cc
+tensorflow/core/kernels/dynamic_partition_op.cc
+tensorflow/core/kernels/dense_update_ops.cc
+tensorflow/core/kernels/cwise_ops_common.cc
+tensorflow/core/kernels/cwise_op_tanh.cc
+tensorflow/core/kernels/cwise_op_sub.cc
+tensorflow/core/kernels/cwise_op_squared_difference.cc
+tensorflow/core/kernels/cwise_op_square.cc
+tensorflow/core/kernels/cwise_op_sqrt.cc
+tensorflow/core/kernels/cwise_op_sigmoid.cc
+tensorflow/core/kernels/cwise_op_select.cc
+tensorflow/core/kernels/cwise_op_rsqrt.cc
+tensorflow/core/kernels/cwise_op_neg.cc
+tensorflow/core/kernels/cwise_op_mul.cc
+tensorflow/core/kernels/cwise_op_minimum.cc
+tensorflow/core/kernels/cwise_op_maximum.cc
+tensorflow/core/kernels/cwise_op_log.cc
+tensorflow/core/kernels/cwise_op_less.cc
+tensorflow/core/kernels/cwise_op_isfinite.cc
+tensorflow/core/kernels/cwise_op_inverse.cc
+tensorflow/core/kernels/cwise_op_greater.cc
+tensorflow/core/kernels/cwise_op_exp.cc
+tensorflow/core/kernels/cwise_op_equal_to.cc
+tensorflow/core/kernels/cwise_op_div.cc
+tensorflow/core/kernels/cwise_op_add.cc
+tensorflow/core/kernels/ctc_decoder_ops.cc
+tensorflow/core/kernels/conv_ops.cc
+tensorflow/core/kernels/conv_grad_ops.cc
+tensorflow/core/kernels/control_flow_ops.cc
+tensorflow/core/kernels/constant_op.cc
+tensorflow/core/kernels/concat_op.cc
+tensorflow/core/kernels/concat_lib_cpu.cc
+tensorflow/core/kernels/check_numerics_op.cc
+tensorflow/core/kernels/cast_op.cc
+tensorflow/core/kernels/bias_op.cc
+tensorflow/core/kernels/bcast_ops.cc
+tensorflow/core/kernels/batch_norm_op.cc
+tensorflow/core/kernels/avgpooling_op.cc
+tensorflow/core/kernels/argmax_op.cc
+tensorflow/core/kernels/aggregate_ops.cc
+tensorflow/core/ops/training_ops.cc
+tensorflow/core/ops/string_ops.cc
+tensorflow/core/ops/state_ops.cc
+tensorflow/core/ops/sparse_ops.cc
+tensorflow/core/ops/sendrecv_ops.cc
+tensorflow/core/ops/script_ops.cc
+tensorflow/core/ops/random_ops.cc
+tensorflow/core/ops/random_grad.cc
+tensorflow/core/ops/parsing_ops.cc
+tensorflow/core/ops/no_op.cc
+tensorflow/core/ops/nn_ops.cc
+tensorflow/core/ops/nn_grad.cc
+tensorflow/core/ops/math_ops.cc
+tensorflow/core/ops/math_grad.cc
+tensorflow/core/ops/logging_ops.cc
+tensorflow/core/ops/linalg_ops.cc
+tensorflow/core/ops/io_ops.cc
+tensorflow/core/ops/image_ops.cc
+tensorflow/core/ops/functional_ops.cc
+tensorflow/core/ops/functional_grad.cc
+tensorflow/core/ops/function_ops.cc
+tensorflow/core/ops/data_flow_ops.cc
+tensorflow/core/ops/ctc_ops.cc
+tensorflow/core/ops/control_flow_ops.cc
+tensorflow/core/ops/candidate_sampling_ops.cc
+tensorflow/core/ops/array_ops.cc
+tensorflow/core/ops/array_grad.cc
diff --git a/tensorflow/contrib/quantization/BUILD b/tensorflow/contrib/quantization/BUILD
index 2b5f55ada7..881349fda7 100644
--- a/tensorflow/contrib/quantization/BUILD
+++ b/tensorflow/contrib/quantization/BUILD
@@ -69,6 +69,8 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":ops",
+        "//tensorflow/contrib/quantization:quantized_ops_py",
+        "//tensorflow/contrib/quantization/kernels:quantized_kernels_py",
     ],
 )
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index acaaa47069..0920379403 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -603,6 +603,7 @@ filegroup(
             "graph/dot.*",
             "lib/jpeg/**/*",
             "lib/png/**/*",
+            "lib/gif/**/*",
             "util/checkpoint_reader.*",
             "util/events_writer.*",
             "util/reporter.*",
@@ -613,6 +614,7 @@ filegroup(
             "platform/google/**/*",
             "platform/jpeg.*",
             "platform/png.*",
+            "platform/gif.*",
             "platform/stream_executor.*",
             "user_ops/**/*.cu.cc",
             "common_runtime/gpu/**/*",
@@ -843,6 +845,7 @@ cc_library(
     hdrs = [
         "lib/core/blocking_counter.h",
         "lib/core/refcount.h",
+        "lib/gif/gif_io.h",
         "lib/gtl/edit_distance.h",
         "lib/gtl/int_type.h",
         "lib/gtl/iterator_range.h",
@@ -1967,6 +1970,10 @@ filegroup(
         "lib/jpeg/testdata/corrupt34_3.jpg",
         # -- hand-edited variant: stops after a restart marker
         "lib/jpeg/testdata/corrupt34_4.jpg",
+        # GIF data
+        "lib/gif/testdata/scan.gif",
+        # GIF data with optimization
+        "lib/gif/testdata/optimized.gif",
     ],
 )
 
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index b3341e3dde..d52f389518 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -859,6 +859,7 @@ tf_kernel_libraries(
         "crop_and_resize_op",
         "decode_jpeg_op",
         "decode_png_op",
+        "decode_gif_op",
         "draw_bounding_box_op",
         "encode_jpeg_op",
         "attention_ops",
@@ -1108,6 +1109,7 @@ tf_kernel_libraries(
         "matmul_op",
         "reduction_ops",
         "segment_reduction_ops",
+        "scan_ops",
         "sequence_ops",
         "sparse_matmul_op",
     ],
@@ -2040,6 +2042,7 @@ filegroup(
             "decode_png_op.*",
             "encode_jpeg_op.*",
             "decode_jpeg_op.*",
+            "decode_gif_op.*",
             "identity_reader_op.*",
             "reader_base.*",
             "fixed_length_record_reader_op.*",
diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc
index 26f616f9b9..d65a34fd73 100644
--- a/tensorflow/core/kernels/colorspace_op.cc
+++ b/tensorflow/core/kernels/colorspace_op.cc
@@ -36,7 +36,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename Device>
+template <typename Device, typename T>
 class RGBToHSVOp : public OpKernel {
  public:
   explicit RGBToHSVOp(OpKernelConstruction* context) : OpKernel(context) {}
@@ -59,23 +59,23 @@ class RGBToHSVOp : public OpKernel {
 
     // Make a canonical image, maintaining the last (channel) dimension, while
     // flattening all others do give the functor easy to work with data.
-    TTypes<float, 2>::ConstTensor input_data = input.flat_inner_dims<float>();
-    TTypes<float, 2>::Tensor output_data = output->flat_inner_dims<float>();
+    typename TTypes<T, 2>::ConstTensor input_data = input.flat_inner_dims<T>();
+    typename TTypes<T, 2>::Tensor output_data = output->flat_inner_dims<T>();
 
     Tensor trange;
     OP_REQUIRES_OK(
-        context, context->allocate_temp(DataTypeToEnum<float>::value,
+        context, context->allocate_temp(DataTypeToEnum<T>::value,
                                         TensorShape({input_data.dimension(0)}),
                                         &trange));
 
-    TTypes<float, 1>::Tensor range = trange.tensor<float, 1>();
+    typename TTypes<T, 1>::Tensor range = trange.tensor<T, 1>();
 
-    functor::RGBToHSV<Device>()(context->eigen_device<Device>(), input_data,
-                                range, output_data);
+    functor::RGBToHSV<Device, T>()(context->eigen_device<Device>(), input_data,
+                                   range, output_data);
   }
 };
 
-template <typename Device>
+template <typename Device, typename T>
 class HSVToRGBOp : public OpKernel {
  public:
   explicit HSVToRGBOp(OpKernelConstruction* context) : OpKernel(context) {}
@@ -96,41 +96,54 @@ class HSVToRGBOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
 
-    TTypes<float, 2>::ConstTensor input_data = input.flat_inner_dims<float>();
-    TTypes<float, 2>::Tensor output_data = output->flat_inner_dims<float>();
+    typename TTypes<T, 2>::ConstTensor input_data = input.flat_inner_dims<T>();
+    typename TTypes<T, 2>::Tensor output_data = output->flat_inner_dims<T>();
 
-    functor::HSVToRGB<Device>()(context->eigen_device<Device>(), input_data,
-                                output_data);
+    functor::HSVToRGB<Device, T>()(context->eigen_device<Device>(), input_data,
+                                   output_data);
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU),
-                        RGBToHSVOp<CPUDevice>);
-template class RGBToHSVOp<CPUDevice>;
-REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU),
-                        HSVToRGBOp<CPUDevice>);
-template class HSVToRGBOp<CPUDevice>;
+#define REGISTER_CPU(T)                                       \
+  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \
+                              .TypeConstraint<T>("T"),        \
+                          RGBToHSVOp<CPUDevice, T>);          \
+  template class RGBToHSVOp<CPUDevice, T>;                    \
+  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \
+                              .TypeConstraint<T>("T"),        \
+                          HSVToRGBOp<CPUDevice, T>);          \
+  template class HSVToRGBOp<CPUDevice, T>;
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
 
 #if GOOGLE_CUDA
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
-template <>
-void RGBToHSV<GPUDevice>::operator()(const GPUDevice& d,
-                                     TTypes<float, 2>::ConstTensor input_data,
-                                     TTypes<float, 1>::Tensor range,
-                                     TTypes<float, 2>::Tensor output_data);
-extern template struct RGBToHSV<GPUDevice>;
-template <>
-void HSVToRGB<GPUDevice>::operator()(const GPUDevice& d,
-                                     TTypes<float, 2>::ConstTensor input_data,
-                                     TTypes<float, 2>::Tensor output_data);
-extern template struct HSVToRGB<GPUDevice>;
+#define DECLARE_GPU(T)                                        \
+  template <>                                                 \
+  void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d, \
+      TTypes<T, 2>::ConstTensor input_data,                   \
+      TTypes<T, 1>::Tensor range,                             \
+      TTypes<T, 2>::Tensor output_data);                      \
+  extern template struct RGBToHSV<GPUDevice, T>;              \
+  template <>                                                 \
+  void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d, \
+      TTypes<T, 2>::ConstTensor input_data,                   \
+      TTypes<T, 2>::Tensor output_data);                      \
+  extern template struct HSVToRGB<GPUDevice, T>;
+TF_CALL_float(DECLARE_GPU);
+TF_CALL_double(DECLARE_GPU);
 }  // namespace functor
-REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU),
-                        RGBToHSVOp<GPUDevice>);
-REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU),
-                        HSVToRGBOp<GPUDevice>);
+#define REGISTER_GPU(T)                                       \
+  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \
+                              .TypeConstraint<T>("T"),        \
+                          RGBToHSVOp<GPUDevice, T>);          \
+  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \
+                              .TypeConstraint<T>("T"),        \
+                          HSVToRGBOp<GPUDevice, T>);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
 #endif
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/colorspace_op.h b/tensorflow/core/kernels/colorspace_op.h
index 6362b6fd90..c5721ef6dd 100644
--- a/tensorflow/core/kernels/colorspace_op.h
+++ b/tensorflow/core/kernels/colorspace_op.h
@@ -24,18 +24,19 @@ namespace tensorflow {
 
 namespace functor {
 
-template <typename Device>
+template <typename Device, typename T>
 struct RGBToHSV {
-  void operator()(const Device &d, TTypes<float, 2>::ConstTensor input_data,
-                  TTypes<float, 1>::Tensor range,
-                  TTypes<float, 2>::Tensor output_data) {
-    auto H = output_data.chip<1>(0);
-    auto S = output_data.chip<1>(1);
-    auto V = output_data.chip<1>(2);
-
-    auto R = input_data.chip<1>(0);
-    auto G = input_data.chip<1>(1);
-    auto B = input_data.chip<1>(2);
+  void operator()(const Device &d,
+                  typename TTypes<T, 2>::ConstTensor input_data,
+                  typename TTypes<T, 1>::Tensor range,
+                  typename TTypes<T, 2>::Tensor output_data) {
+    auto H = output_data.template chip<1>(0);
+    auto S = output_data.template chip<1>(1);
+    auto V = output_data.template chip<1>(2);
+
+    auto R = input_data.template chip<1>(0);
+    auto G = input_data.template chip<1>(1);
+    auto B = input_data.template chip<1>(2);
 
 #if !defined(EIGEN_HAS_INDEX_LIST)
     Eigen::array<int, 1> channel_axis{{1}};
@@ -47,38 +48,40 @@ struct RGBToHSV {
 
     range.device(d) = V - input_data.minimum(channel_axis);
 
-    S.device(d) = (V > 0.f).select(range / V, V.constant(0.f));
+    S.device(d) = (V > T(0)).select(range / V, V.constant(T(0)));
 
-    auto norm = range.inverse() * (1.f / 6.f);
+    auto norm = range.inverse() * (T(1) / T(6));
     // TODO(wicke): all these assignments are only necessary because a combined
     // expression is larger than kernel parameter space. A custom kernel is
     // probably in order.
     H.device(d) = (R == V).select(norm * (G - B),
-                                  (G == V).select(norm * (B - R) + 2.f / 6.f,
-                                                  norm * (R - G) + 4.f / 6.f));
-    H.device(d) = (range > 0.f).select(H, H.constant(0.f));
-    H.device(d) = (H < 0.f).select(H + 1.f, H);
+                                  (G == V).select(
+                                      norm * (B - R) + T(2) / T(6),
+                                      norm * (R - G) + T(4) / T(6)));
+    H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
+    H.device(d) = (H < T(0)).select(H + T(1), H);
   }
 };
 
-template <typename Device>
+template <typename Device, typename T>
 struct HSVToRGB {
-  void operator()(const Device &d, TTypes<float, 2>::ConstTensor input_data,
-                  TTypes<float, 2>::Tensor output_data) {
-    auto H = input_data.chip<1>(0);
-    auto S = input_data.chip<1>(1);
-    auto V = input_data.chip<1>(2);
+  void operator()(const Device &d,
+                  typename TTypes<T, 2>::ConstTensor input_data,
+                  typename TTypes<T, 2>::Tensor output_data) {
+    auto H = input_data.template chip<1>(0);
+    auto S = input_data.template chip<1>(1);
+    auto V = input_data.template chip<1>(2);
 
     // TODO(wicke): compute only the fractional part of H for robustness
-    auto dh = H * 6.f;
-    auto dr = ((dh - 3.f).abs() - 1.f).cwiseMax(0.f).cwiseMin(1.f);
-    auto dg = (-(dh - 2.f).abs() + 2.f).cwiseMax(0.f).cwiseMin(1.f);
-    auto db = (-(dh - 4.f).abs() + 2.f).cwiseMax(0.f).cwiseMin(1.f);
-    auto one_s = -S + 1.f;
-
-    auto R = output_data.chip<1>(0);
-    auto G = output_data.chip<1>(1);
-    auto B = output_data.chip<1>(2);
+    auto dh = H * T(6);
+    auto dr = ((dh - T(3)).abs() - T(1)).cwiseMax(T(0)).cwiseMin(T(1));
+    auto dg = (-(dh - T(2)).abs() + T(2)).cwiseMax(T(0)).cwiseMin(T(1));
+    auto db = (-(dh - T(4)).abs() + T(2)).cwiseMax(T(0)).cwiseMin(T(1));
+    auto one_s = -S + T(1);
+
+    auto R = output_data.template chip<1>(0);
+    auto G = output_data.template chip<1>(1);
+    auto B = output_data.template chip<1>(2);
 
     R.device(d) = (one_s + S * dr) * V;
     G.device(d) = (one_s + S * dg) * V;
diff --git a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
index defd491dbc..e19d0b14d5 100644
--- a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
@@ -24,8 +24,11 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-template class functor::RGBToHSV<GPUDevice>;
-template class functor::HSVToRGB<GPUDevice>;
+#define INSTANTIATE_GPU(T)                        \
+  template class functor::RGBToHSV<GPUDevice, T>; \
+  template class functor::HSVToRGB<GPUDevice, T>;
+TF_CALL_float(INSTANTIATE_GPU);
+TF_CALL_double(INSTANTIATE_GPU);
 }
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/colorspace_op_test.cc b/tensorflow/core/kernels/colorspace_op_test.cc
index 5130a5d398..4719a59b63 100644
--- a/tensorflow/core/kernels/colorspace_op_test.cc
+++ b/tensorflow/core/kernels/colorspace_op_test.cc
@@ -29,183 +29,241 @@ limitations under the License.
 
 namespace tensorflow {
 
+template <typename T>
 class RGBToHSVOpTest : public OpsTestBase {
  protected:
-  RGBToHSVOpTest() {
+  void MakeOp(DataType data_type) {
     TF_EXPECT_OK(NodeDefBuilder("rgb_to_hsv_op", "RGBToHSV")
-                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(data_type))
                      .Finalize(node_def()));
     TF_EXPECT_OK(InitOp());
   }
-};
 
-TEST_F(RGBToHSVOpTest, CheckBlack) {
-  // Black pixel should map to hsv = [0,0,0]
-  AddInputFromArray<float>(TensorShape({3}), {0, 0, 0});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {0.0, 0.0, 0.0});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
-
-TEST_F(RGBToHSVOpTest, CheckGray) {
-  // Gray pixel should have hue = saturation = 0.0, value = r/255
-  AddInputFromArray<float>(TensorShape({3}), {.5, .5, .5});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {0.0, 0.0, .5});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
-
-TEST_F(RGBToHSVOpTest, CheckWhite) {
-  // Gray pixel should have hue = saturation = 0.0, value = 1.0
-  AddInputFromArray<float>(TensorShape({3}), {1, 1, 1});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {0.0, 0.0, 1.0});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
-
-TEST_F(RGBToHSVOpTest, CheckRedMax) {
-  // Test case where red channel dominates
-  AddInputFromArray<float>(TensorShape({3}), {.8, .4, .2});
-  TF_ASSERT_OK(RunOpKernel());
-
-  float expected_h = 1. / 6. * .2 / .6;
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.;
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {expected_h, expected_s, expected_v});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
-
-TEST_F(RGBToHSVOpTest, CheckGreenMax) {
-  // Test case where green channel dominates
-  AddInputFromArray<float>(TensorShape({3}), {.2, .8, .4});
-  TF_ASSERT_OK(RunOpKernel());
-
-  float expected_h = 1. / 6. * (2.0 + (.2 / .6));
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.;
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {expected_h, expected_s, expected_v});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
-
-TEST_F(RGBToHSVOpTest, CheckBlueMax) {
-  // Test case where blue channel dominates
-  AddInputFromArray<float>(TensorShape({3}), {.4, .2, .8});
-  TF_ASSERT_OK(RunOpKernel());
-
-  float expected_h = 1. / 6. * (4.0 + (.2 / .6));
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.;
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {expected_h, expected_s, expected_v});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
-
-TEST_F(RGBToHSVOpTest, CheckNegativeDifference) {
-  AddInputFromArray<float>(TensorShape({3}), {0, .1, .2});
-  TF_ASSERT_OK(RunOpKernel());
-
-  float expected_h = 1. / 6. * (4.0 + (-.1 / .2));
-  float expected_s = .2 / .2;
-  float expected_v = .2 / 1.;
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {expected_h, expected_s, expected_v});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
+  void CheckBlack(DataType data_type) {
+    // Black pixel should map to hsv = [0,0,0]
+    AddInputFromArray<T>(TensorShape({3}), {0, 0, 0});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {0.0, 0.0, 0.0});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckGray(DataType data_type) {
+    // Gray pixel should have hue = saturation = 0.0, value = r/255
+    AddInputFromArray<T>(TensorShape({3}), {.5, .5, .5});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {0.0, 0.0, .5});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckWhite(DataType data_type) {
+    // White pixel should have hue = saturation = 0.0, value = 1.0
+    AddInputFromArray<T>(TensorShape({3}), {1, 1, 1});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {0.0, 0.0, 1.0});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckRedMax(DataType data_type) {
+    // Test case where red channel dominates
+    AddInputFromArray<T>(TensorShape({3}), {.8, .4, .2});
+    TF_ASSERT_OK(RunOpKernel());
+
+    T expected_h = 1. / 6. * .2 / .6;
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.;
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckGreenMax(DataType data_type) {
+    // Test case where green channel dominates
+    AddInputFromArray<T>(TensorShape({3}), {.2, .8, .4});
+    TF_ASSERT_OK(RunOpKernel());
 
+    T expected_h = 1. / 6. * (2.0 + (.2 / .6));
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.;
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckBlueMax(DataType data_type) {
+    // Test case where blue channel dominates
+    AddInputFromArray<T>(TensorShape({3}), {.4, .2, .8});
+    TF_ASSERT_OK(RunOpKernel());
+
+    T expected_h = 1. / 6. * (4.0 + (.2 / .6));
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.;
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckNegativeDifference(DataType data_type) {
+    AddInputFromArray<T>(TensorShape({3}), {0, .1, .2});
+    TF_ASSERT_OK(RunOpKernel());
+
+    T expected_h = 1. / 6. * (4.0 + (-.1 / .2));
+    T expected_s = .2 / .2;
+    T expected_v = .2 / 1.;
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+};
+
+template <typename T>
 class HSVToRGBOpTest : public OpsTestBase {
  protected:
-  HSVToRGBOpTest() {
+  void MakeOp(DataType data_type) {
     TF_EXPECT_OK(NodeDefBuilder("hsv_to_rgb_op", "HSVToRGB")
-                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(data_type))
                      .Finalize(node_def()));
     TF_EXPECT_OK(InitOp());
   }
+
+  void CheckBlack(DataType data_type) {
+    // Black pixel should map to rgb = [0,0,0]
+    AddInputFromArray<T>(TensorShape({3}), {0.0, 0.0, 0.0});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {0, 0, 0});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckGray(DataType data_type) {
+    // Gray pixel should have hue = saturation = 0.0, value = r/255
+    AddInputFromArray<T>(TensorShape({3}), {0.0, 0.0, .5});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {.5, .5, .5});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckWhite(DataType data_type) {
+    // White pixel should have hue = saturation = 0.0, value = 1.0
+    AddInputFromArray<T>(TensorShape({3}), {0.0, 0.0, 1.0});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {1, 1, 1});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckRedMax(DataType data_type) {
+    // Test case where red channel dominates
+    T expected_h = 1. / 6. * .2 / .6;
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.;
+
+    AddInputFromArray<T>(TensorShape({3}),
+                         {expected_h, expected_s, expected_v});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {.8, .4, .2});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckGreenMax(DataType data_type) {
+    // Test case where green channel dominates
+    T expected_h = 1. / 6. * (2.0 + (.2 / .6));
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.;
+
+    AddInputFromArray<T>(TensorShape({3}),
+                         {expected_h, expected_s, expected_v});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {.2, .8, .4});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckBlueMax(DataType data_type) {
+    // Test case where blue channel dominates
+    T expected_h = 1. / 6. * (4.0 + (.2 / .6));
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.0;
+
+    AddInputFromArray<T>(TensorShape({3}),
+                         {expected_h, expected_s, expected_v});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {.4, .2, .8});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckNegativeDifference(DataType data_type) {
+    T expected_h = 1. / 6. * (4.0 + (-.1 / .2));
+    T expected_s = .2 / .2;
+    T expected_v = .2 / 1.;
+
+    AddInputFromArray<T>(TensorShape({3}),
+                         {expected_h, expected_s, expected_v});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {0, .1, .2});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
 };
 
-TEST_F(HSVToRGBOpTest, CheckBlack) {
-  // Black pixel should map to rgb = [0,0,0]
-  AddInputFromArray<float>(TensorShape({3}), {0.0, 0.0, 0.0});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {0, 0, 0});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
-
-TEST_F(HSVToRGBOpTest, CheckGray) {
-  // Gray pixel should have hue = saturation = 0.0, value = r/255
-  AddInputFromArray<float>(TensorShape({3}), {0.0, 0.0, .5});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {.5, .5, .5});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
-
-TEST_F(HSVToRGBOpTest, CheckWhite) {
-  // Gray pixel should have hue = saturation = 0.0, value = 1.0
-  AddInputFromArray<float>(TensorShape({3}), {0.0, 0.0, 1.0});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {1, 1, 1});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
-
-TEST_F(HSVToRGBOpTest, CheckRedMax) {
-  // Test case where red channel dominates
-  float expected_h = 1. / 6. * .2 / .6;
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.;
-
-  AddInputFromArray<float>(TensorShape({3}),
-                           {expected_h, expected_s, expected_v});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {.8, .4, .2});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
-
-TEST_F(HSVToRGBOpTest, CheckGreenMax) {
-  // Test case where green channel dominates
-  float expected_h = 1. / 6. * (2.0 + (.2 / .6));
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.;
-
-  AddInputFromArray<float>(TensorShape({3}),
-                           {expected_h, expected_s, expected_v});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {.2, .8, .4});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
-
-TEST_F(HSVToRGBOpTest, CheckBlueMax) {
-  // Test case where blue channel dominates
-  float expected_h = 1. / 6. * (4.0 + (.2 / .6));
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.0;
-
-  AddInputFromArray<float>(TensorShape({3}),
-                           {expected_h, expected_s, expected_v});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {.4, .2, .8});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
+#define TEST_COLORSPACE(test, dt)                               \
+  TEST_F(test, CheckBlack) {                                    \
+    MakeOp(dt);                                                 \
+    CheckBlack(dt);                                             \
+  }                                                             \
+  TEST_F(test, CheckGray) {                                     \
+    MakeOp(dt);                                                 \
+    CheckGray(dt);                                              \
+  }                                                             \
+  TEST_F(test, CheckWhite) {                                    \
+    MakeOp(dt);                                                 \
+    CheckWhite(dt);                                             \
+  }                                                             \
+  TEST_F(test, CheckRedMax) {                                   \
+    MakeOp(dt);                                                 \
+    CheckRedMax(dt);                                            \
+  }                                                             \
+  TEST_F(test, CheckGreenMax) {                                 \
+    MakeOp(dt);                                                 \
+    CheckGreenMax(dt);                                          \
+  }                                                             \
+  TEST_F(test, CheckBlueMax) {                                  \
+    MakeOp(dt);                                                 \
+    CheckBlueMax(dt);                                           \
+  }                                                             \
+  TEST_F(test, CheckNegativeDifference) {                       \
+    MakeOp(dt);                                                 \
+    CheckNegativeDifference(dt);                                \
+  }
+
+typedef RGBToHSVOpTest<float> rgb_to_hsv_float;
+typedef RGBToHSVOpTest<double> rgb_to_hsv_double;
+
+TEST_COLORSPACE(rgb_to_hsv_float, DT_FLOAT);
+TEST_COLORSPACE(rgb_to_hsv_double, DT_DOUBLE);
+
+typedef HSVToRGBOpTest<float> hsv_to_rgb_float;
+typedef HSVToRGBOpTest<double> hsv_to_rgb_double;
+
+TEST_COLORSPACE(hsv_to_rgb_float, DT_FLOAT);
+TEST_COLORSPACE(hsv_to_rgb_double, DT_DOUBLE);
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h
index 6ccbe46c7f..aa9c55a2ca 100644
--- a/tensorflow/core/kernels/cwise_ops_common.h
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@@ -65,7 +65,7 @@ class BinaryOpShared : public OpKernel {
 
 // Coefficient-wise binary operations:
 //   Device: E.g., CPUDevice, GPUDevice.
-//   Functor: defined in cwise_functors.h. E.g., functor::add2.
+//   Functor: defined in cwise_ops.h. E.g., functor::add.
 template <typename Device, typename Functor>
 class BinaryOp : public BinaryOpShared {
  public:
@@ -162,7 +162,7 @@ class SimpleBinaryOp : public OpKernel {
 
 // Coefficient-wise unary operations:
 //   Device: E.g., CPUDevice, GPUDevice.
-//   Functor: defined in cwise_functors.h. E.g., functor::sqrt.
+//   Functor: defined in cwise_ops.h. E.g., functor::sqrt.
 template <typename Device, typename Functor>
 class UnaryOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/decode_gif_op.cc b/tensorflow/core/kernels/decode_gif_op.cc
new file mode 100644
index 0000000000..29596b15f4
--- /dev/null
+++ b/tensorflow/core/kernels/decode_gif_op.cc
@@ -0,0 +1,66 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gif/gif_io.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+// Decodes a scalar string holding GIF data into a 4-D uint8 output tensor.
+class DecodeGifOp : public OpKernel {
+ public:
+  explicit DecodeGifOp(OpKernelConstruction* context) : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {
+    const Tensor& contents = context->input(0);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
+                errors::InvalidArgument("contents must be scalar, got shape ",
+                                        contents.shape().DebugString()));
+
+    // Fetch the encoded GIF bytes from the scalar string input
+    const StringPiece input = contents.scalar<string>()();
+
+    // Decode image, allocating output 0 once the image size is known
+    Tensor* output = nullptr;
+    OP_REQUIRES(
+        context,
+        gif::Decode(input.data(), input.size(),
+                    [=, &output](int num_frames, int width, int height,
+                                 int channels) -> uint8* {
+                      Status status(context->allocate_output(
+                          0, TensorShape({num_frames, height, width, channels}),
+                          &output));
+                      if (!status.ok()) {
+                        VLOG(1) << status;
+                        context->SetStatus(status);
+                        return nullptr;  // allocation failed, status set
+                      }
+                      return output->flat<uint8>().data();
+                    }),
+        errors::InvalidArgument("Invalid GIF data, size ", input.size()));
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("DecodeGif").Device(DEVICE_CPU), DecodeGifOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 18fb480515..30e90f2206 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -97,13 +97,7 @@ class ReverseOp : public OpKernel {
                               .HostMemory("dims"),    \
                           ReverseOp<CPUDevice, T>)
 
-TF_CALL_uint8(REGISTER_KERNEL);
-TF_CALL_int8(REGISTER_KERNEL);
-TF_CALL_int32(REGISTER_KERNEL);
-TF_CALL_bool(REGISTER_KERNEL);
-TF_CALL_half(REGISTER_KERNEL);
-TF_CALL_float(REGISTER_KERNEL);
-TF_CALL_double(REGISTER_KERNEL);
+TF_CALL_POD_TYPES(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -136,6 +130,8 @@ TF_CALL_bool(DECLARE_GPU_SPEC);
 TF_CALL_half(DECLARE_GPU_SPEC);
 TF_CALL_float(DECLARE_GPU_SPEC);
 TF_CALL_double(DECLARE_GPU_SPEC);
+TF_CALL_complex64(DECLARE_GPU_SPEC);
+TF_CALL_complex128(DECLARE_GPU_SPEC);
 #undef DECLARE_GPU_SPEC
 #undef DECLARE_GPU_SPEC_DIM
 }  // namespace functor
@@ -149,9 +145,15 @@ TF_CALL_double(DECLARE_GPU_SPEC);
                           ReverseOp<GPUDevice, T>)
 TF_CALL_uint8(REGISTER_GPU_KERNEL);
 TF_CALL_int8(REGISTER_GPU_KERNEL);
+// TODO: Find out why the int32 GPU kernel doesn't work, and decide whether
+// to enable the bool kernel. Until then both registrations stay disabled:
+// TF_CALL_int32(REGISTER_GPU_KERNEL);
+// TF_CALL_bool(REGISTER_GPU_KERNEL);
 TF_CALL_half(REGISTER_GPU_KERNEL);
 TF_CALL_float(REGISTER_GPU_KERNEL);
 TF_CALL_double(REGISTER_GPU_KERNEL);
+TF_CALL_complex64(REGISTER_GPU_KERNEL);
+TF_CALL_complex128(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
index e5f5f2fc51..39ab010627 100644
--- a/tensorflow/core/kernels/reverse_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
@@ -25,24 +25,30 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define DEFINE_REVERSE(DIM)                                      \
-  template struct functor::Reverse<GPUDevice, uint8, DIM>;       \
-  template struct functor::Reverse<GPUDevice, int8, DIM>;        \
-  template struct functor::Reverse<GPUDevice, int32, DIM>;       \
-  template struct functor::Reverse<GPUDevice, bool, DIM>;        \
-  template struct functor::Reverse<GPUDevice, Eigen::half, DIM>; \
-  template struct functor::Reverse<GPUDevice, float, DIM>;       \
-  template struct functor::Reverse<GPUDevice, double, DIM>;
-DEFINE_REVERSE(0)
-DEFINE_REVERSE(1)
-DEFINE_REVERSE(2)
-DEFINE_REVERSE(3)
-DEFINE_REVERSE(4)
-DEFINE_REVERSE(5)
-DEFINE_REVERSE(6)
-DEFINE_REVERSE(7)
-DEFINE_REVERSE(8)
+#define DEFINE_REVERSE(T, DIM) \
+  template struct functor::Reverse<GPUDevice, T, DIM>;
+#define DEFINE_REVERSE_ALL_DIMS(T) \
+  DEFINE_REVERSE(T, 0) \
+  DEFINE_REVERSE(T, 1) \
+  DEFINE_REVERSE(T, 2) \
+  DEFINE_REVERSE(T, 3) \
+  DEFINE_REVERSE(T, 4) \
+  DEFINE_REVERSE(T, 5) \
+  DEFINE_REVERSE(T, 6) \
+  DEFINE_REVERSE(T, 7) \
+  DEFINE_REVERSE(T, 8)
+
+TF_CALL_uint8(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_int8(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_int32(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_bool(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_half(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_float(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_double(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_complex64(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_complex128(DEFINE_REVERSE_ALL_DIMS);
 #undef DEFINE_REVERSE
+#undef DEFINE_REVERSE_ALL_DIMS
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/scan_ops.cc b/tensorflow/core/kernels/scan_ops.cc
new file mode 100644
index 0000000000..e6505dff6e
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops.cc
@@ -0,0 +1,177 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/kernels/scan_ops.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// Kernel for scan ops (registered below as Cumsum/Cumprod): applies Reducer
+// cumulatively along `axis`, honoring the `reverse` and `exclusive` attrs.
+template <typename Device, class T, typename Reducer>
+class ScanOp : public OpKernel {
+public:
+  explicit ScanOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("reverse", &reverse_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("exclusive", &exclusive_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& input = ctx->input(0);
+    const Tensor& tensor_axis = ctx->input(1);
+    // The axis must be a scalar that indexes one of the input's dimensions.
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_axis.shape()),
+                errors::InvalidArgument("ScanOp: axis must be a scalar, not ",
+                                        tensor_axis.shape().DebugString()));
+
+    const int axis = internal::SubtleMustCopy(tensor_axis.scalar<int>()());
+
+    OP_REQUIRES(
+        ctx, FastBoundsCheck(axis, input.dims()),
+        errors::InvalidArgument("ScanOp: Expected scan axis in the range [", 0,
+                                ", ", input.dims(), "), but got ", axis));
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
+
+    const Device& d = ctx->eigen_device<Device>();
+    Reducer reducer;
+
+#define HANDLE_SCAN(NDIMS)                                                \
+  case NDIMS:                                                             \
+    functor::Scan<Device, Reducer, T, NDIMS>()(                           \
+        d, input.tensor<T, NDIMS>(), output->tensor<T, NDIMS>(), reducer, \
+        axis, reverse_, exclusive_);                                      \
+    return;
+
+    switch (input.dims()) {
+      // input.dims() == 0 can't occur: no axis passes the bounds check above
+      HANDLE_SCAN(1);
+      HANDLE_SCAN(2);
+      HANDLE_SCAN(3);
+      HANDLE_SCAN(4);
+      HANDLE_SCAN(5);
+      HANDLE_SCAN(6);
+      HANDLE_SCAN(7);
+      HANDLE_SCAN(8);
+      default:
+        OP_REQUIRES(ctx, false, errors::InvalidArgument(
+                                    "Scan does not support tensors with "
+                                    "more than 8 dimensions, got ",
+                                    input.dims()));
+    }
+#undef HANDLE_SCAN
+  }
+
+private:
+  bool reverse_;
+  bool exclusive_;
+};
+
+#if GOOGLE_CUDA
+namespace functor {
+
+// Forward declarations of GPU functors (one per reducer, type and rank 1-8)
+#define DECLARE(REDUCER, T, D)                                             \
+  template <>                                                              \
+  void Scan<GPUDevice, REDUCER, T, D>::operator()(                         \
+      const GPUDevice& d, TTypes<T, D>::ConstTensor in,                    \
+      TTypes<T, D>::Tensor out, const REDUCER& reducer,                    \
+      const Eigen::Index& axis, const bool reverse, const bool exclusive); \
+  extern template struct Scan<GPUDevice, REDUCER, T, D>;
+
+#define DECLARE_FOR_ALL_DIMS(REDUCER, T) \
+  DECLARE(REDUCER, T, 1);                \
+  DECLARE(REDUCER, T, 2);                \
+  DECLARE(REDUCER, T, 3);                \
+  DECLARE(REDUCER, T, 4);                \
+  DECLARE(REDUCER, T, 5);                \
+  DECLARE(REDUCER, T, 6);                \
+  DECLARE(REDUCER, T, 7);                \
+  DECLARE(REDUCER, T, 8);
+
+#define DECLARE_FOR_ALL_REDUCERS(T)                        \
+  DECLARE_FOR_ALL_DIMS(Eigen::internal::SumReducer<T>, T); \
+  DECLARE_FOR_ALL_DIMS(Eigen::internal::ProdReducer<T>, T);
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_ALL_REDUCERS);
+
+#undef DECLARE_FOR_ALL_REDUCERS
+#undef DECLARE_FOR_ALL_DIMS
+#undef DECLARE
+
+}  // namespace functor
+#endif  // GOOGLE_CUDA
+
+
+// Register Cumsum kernels (ScanOp + SumReducer); GPU keeps `axis` on the host
+#define REGISTER_CPU_KERNELS(type)                                 \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("Cumsum").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      ScanOp<CPUDevice, type, Eigen::internal::SumReducer<type>>)
+TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+#define REGISTER_GPU_KERNELS(type)   \
+  REGISTER_KERNEL_BUILDER(           \
+      Name("Cumsum")                 \
+          .Device(DEVICE_GPU)        \
+          .TypeConstraint<type>("T") \
+          .HostMemory("axis"),       \
+      ScanOp<GPUDevice, type, Eigen::internal::SumReducer<type>>)
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS)
+#undef REGISTER_GPU_KERNELS
+#endif // GOOGLE_CUDA
+
+
+// Register Cumprod kernels (ScanOp + ProdReducer); GPU keeps `axis` on host
+#define REGISTER_CPU_KERNELS(type)                                  \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("Cumprod").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      ScanOp<CPUDevice, type, Eigen::internal::ProdReducer<type>>)
+TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+#define REGISTER_GPU_KERNELS(type)   \
+  REGISTER_KERNEL_BUILDER(           \
+      Name("Cumprod")                \
+          .Device(DEVICE_GPU)        \
+          .TypeConstraint<type>("T") \
+          .HostMemory("axis"),       \
+      ScanOp<GPUDevice, type, Eigen::internal::ProdReducer<type>>)
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS)
+#undef REGISTER_GPU_KERNELS
+#endif // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scan_ops.h b/tensorflow/core/kernels/scan_ops.h
new file mode 100644
index 0000000000..38f71b1474
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops.h
@@ -0,0 +1,47 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_SCAN_OPS_H_
+#define TENSORFLOW_KERNELS_SCAN_OPS_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::Index Index;
+
+template <typename Device, typename Reducer, typename T, int Dims>
+struct Scan {
+  void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor in,
+                  typename TTypes<T, Dims>::Tensor out, const Reducer& reducer,
+                  const Index& axis, const bool reverse, const bool exclusive) {
+    // Reverse along `axis` (only if `reverse`), scan, and reverse back in one
+    // Eigen expression; the rank is the compile-time constant `Dims`.
+    Eigen::array<bool, Dims> dims;
+    for (int i = 0; i < Dims; i++) {
+      dims[i] = reverse && (i == axis);
+    }
+    To32Bit(out).device(d) = To32Bit(in).reverse(dims)
+                                        .scan(axis, reducer, exclusive)
+                                        .reverse(dims);
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_SCAN_OPS_H_
diff --git a/tensorflow/core/kernels/scan_ops_gpu.cu.cc b/tensorflow/core/kernels/scan_ops_gpu.cu.cc
new file mode 100644
index 0000000000..d1cd9aa478
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_gpu.cu.cc
@@ -0,0 +1,54 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/framework/register_types.h"
+
+#include "tensorflow/core/kernels/scan_ops.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+typedef Eigen::Index Index;
+
+#define DEFINE(REDUCER, T, D) \
+  template struct functor::Scan<GPUDevice, REDUCER, T, D>;
+// Explicitly instantiate the GPU Scan functor for tensor ranks 1 through 8.
+#define DEFINE_FOR_ALL_DIMS(REDUCER, T) \
+  DEFINE(REDUCER, T, 1);                \
+  DEFINE(REDUCER, T, 2);                \
+  DEFINE(REDUCER, T, 3);                \
+  DEFINE(REDUCER, T, 4);                \
+  DEFINE(REDUCER, T, 5);                \
+  DEFINE(REDUCER, T, 6);                \
+  DEFINE(REDUCER, T, 7);                \
+  DEFINE(REDUCER, T, 8)
+// Instantiate for both the sum reducer and the product reducer of type T.
+#define DEFINE_FOR_ALL_REDUCERS(T)                        \
+  DEFINE_FOR_ALL_DIMS(Eigen::internal::SumReducer<T>, T); \
+  DEFINE_FOR_ALL_DIMS(Eigen::internal::ProdReducer<T>, T);
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_FOR_ALL_REDUCERS);
+#undef DEFINE_FOR_ALL_REDUCERS
+#undef DEFINE_FOR_ALL_DIMS
+#undef DEFINE
+
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 2f9714a37a..867045eb1f 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -58,9 +58,8 @@ struct ApplyAdadelta<CPUDevice, T> {
                   typename TTypes<T>::ConstFlat grad) {
     accum.device(d) =
         accum * rho() + grad.square() * (static_cast<T>(1) - rho());
-    const auto update = 
-	(accum_update + epsilon()).sqrt() *
-	(accum + epsilon()).rsqrt() * grad;
+    const auto update =
+        (accum_update + epsilon()).sqrt() * (accum + epsilon()).rsqrt() * grad;
     accum_update.device(d) =
         accum_update * rho() + update.square() * (static_cast<T>(1) - rho());
     var.device(d) -= update * lr();
@@ -176,9 +175,13 @@ struct ApplyMomentum<CPUDevice, T> {
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
                   typename TTypes<T>::ConstFlat grad,
-                  typename TTypes<T>::ConstScalar momentum) {
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
     accum.device(d) = accum * momentum() + grad;
-    var.device(d) -= accum * lr();
+    if (use_nesterov) {
+      var.device(d) -= grad * lr() + accum * momentum() * lr();
+    } else {
+      var.device(d) -= accum * lr();
+    }
   }
 };
 
@@ -1515,6 +1518,7 @@ class ApplyMomentumOp : public OpKernel {
  public:
   explicit ApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -1554,12 +1558,13 @@ class ApplyMomentumOp : public OpKernel {
     const Device& device = ctx->template eigen_device<Device>();
     functor::ApplyMomentum<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
                                         lr.scalar<T>(), grad.flat<T>(),
-                                        momentum.scalar<T>());
+                                        momentum.scalar<T>(), use_nesterov_);
     ctx->forward_ref_input_to_ref_output(0, 0);
   }
 
  private:
   bool use_exclusive_lock_;
+  bool use_nesterov_;
 };
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -1584,7 +1589,7 @@ namespace functor {
       const GPUDevice& d, typename TTypes<T>::Flat var,                   \
       typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
       typename TTypes<T>::ConstFlat grad,                                 \
-      typename TTypes<T>::ConstScalar momentum);                          \
+      typename TTypes<T>::ConstScalar momentum, bool use_nesterov);       \
   extern template struct ApplyMomentum<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -1605,6 +1610,7 @@ class SparseApplyMomentumOp : public OpKernel {
  public:
   explicit SparseApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
@@ -1672,7 +1678,12 @@ class SparseApplyMomentumOp : public OpKernel {
         auto g = grad_flat.template chip<0>(i);
         auto v = var_flat.template chip<0>(index);
         a = a * a.constant(momentum_scalar) + g;
-        v -= a.constant(lr_scalar) * a;
+        if (use_nesterov_) {
+          v -= g.constant(lr_scalar) * g +
+               a.constant(lr_scalar) * a.constant(momentum_scalar) * a;
+        } else {
+          v -= a.constant(lr_scalar) * a;
+        }
       }
     }
 
@@ -1681,6 +1692,7 @@ class SparseApplyMomentumOp : public OpKernel {
 
  private:
   bool use_exclusive_lock_;
+  bool use_nesterov_;
 };
 
 #define REGISTER_KERNELS(T, Tindices)                                \
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index b9946cd922..017cae6c7c 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_KERNELS_TRAINING_OPS_H_
 #define TENSORFLOW_KERNELS_TRAINING_OPS_H_
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace functor {
@@ -98,7 +98,7 @@ struct ApplyMomentum {
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
                   typename TTypes<T>::ConstFlat grad,
-                  typename TTypes<T>::ConstScalar momentum);
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov);
 };
 
 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index ab56880cfb..589e70e76d 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/training_ops.h"
+#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
@@ -84,12 +84,18 @@ struct ApplyMomentum<GPUDevice, T> {
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
                   typename TTypes<T>::ConstFlat grad,
-                  typename TTypes<T>::ConstScalar momentum) {
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
     Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
     bcast[0] = grad.dimension(0);
     Eigen::Sizes<1> single;
     accum.device(d) = accum * momentum.reshape(single).broadcast(bcast) + grad;
-    var.device(d) -= lr.reshape(single).broadcast(bcast) * accum;
+    if (use_nesterov) {
+      var.device(d) -= grad * lr.reshape(single).broadcast(bcast) +
+                       accum * momentum.reshape(single).broadcast(bcast) *
+                           lr.reshape(single).broadcast(bcast);
+    } else {
+      var.device(d) -= lr.reshape(single).broadcast(bcast) * accum;
+    }
   }
 };
 
diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
new file mode 100644
index 0000000000..91a8d6343d
--- /dev/null
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -0,0 +1,158 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Functions to read images in GIF format.
+
+#include "tensorflow/core/lib/gif/gif_io.h"
+
+#include <cstring>
+#include <memory>
+
+#include "tensorflow/core/platform/gif.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace gif {
+
+namespace {
+
+// Read cursor over the in-memory GIF data. Decode() hands a pointer to one of
+// these to giflib as user data so that input_callback() can bound its reads.
+struct InputBufferInfo {
+  const uint8* buf;  // Next unread byte.
+  int bytes_left;    // Number of unread bytes starting at `buf`.
+};
+
+// Deleter that closes a giflib handle, so that every return path of Decode()
+// releases the decoder state instead of only the success path.
+struct GifFileCloser {
+  void operator()(GifFileType* gif_file) const {
+    int error_code = D_GIF_SUCCEEDED;
+    if (DGifCloseFile(gif_file, &error_code) != GIF_OK) {
+      LOG(WARNING) << "Fail to close gif file, reason: "
+                   << GifErrorString(error_code);
+    }
+  }
+};
+
+}  // namespace
+
+// giflib read callback. Copies at most `size` bytes from the InputBufferInfo
+// stored in gif_file->UserData into `buf`, advances the cursor, and returns
+// the number of bytes copied (0 when nothing is left). Never reads past the
+// end of the source buffer.
+int input_callback(GifFileType* gif_file, GifByteType* buf, int size) {
+  auto* info = static_cast<InputBufferInfo*>(gif_file->UserData);
+  if (info == nullptr || info->bytes_left <= 0 || size <= 0) {
+    return 0;
+  }
+  if (size > info->bytes_left) {
+    size = info->bytes_left;
+  }
+  memcpy(buf, info->buf, size);
+  info->buf += size;
+  info->bytes_left -= size;
+  return size;
+}
+
+// Decodes all frames of the GIF held in `srcdata` (`datasize` bytes) to RGB.
+//
+// Once the frame count and canvas size are known, calls
+// allocate_output(num_frames, width, height, 3) and writes the pixels into the
+// returned buffer, frame after frame, row after row, three bytes per pixel.
+//
+// Returns the buffer obtained from `allocate_output`, or nullptr (after
+// logging the reason) if the data cannot be decoded, a frame does not cover
+// the whole canvas, or a pixel refers to a color outside the color map.
+uint8* Decode(const void* srcdata, int datasize,
+              std::function<uint8*(int, int, int, int)> allocate_output) {
+  InputBufferInfo info = {static_cast<const uint8*>(srcdata),
+                          srcdata != nullptr ? datasize : 0};
+  int error_code = D_GIF_SUCCEEDED;
+  std::unique_ptr<GifFileType, GifFileCloser> gif_file(
+      DGifOpen(&info, &input_callback, &error_code));
+  if (gif_file == nullptr || error_code != D_GIF_SUCCEEDED) {
+    LOG(ERROR) << "Fail to open gif file, reason: "
+               << GifErrorString(error_code);
+    return nullptr;
+  }
+  if (DGifSlurp(gif_file.get()) != GIF_OK) {
+    LOG(ERROR) << "Fail to slurp gif file, reason: "
+               << GifErrorString(gif_file->Error);
+    return nullptr;
+  }
+  if (gif_file->ImageCount <= 0) {
+    LOG(ERROR) << "Gif file does not contain any image";
+    return nullptr;
+  }
+
+  const int num_frames = gif_file->ImageCount;
+  const int width = gif_file->SWidth;
+  const int height = gif_file->SHeight;
+  const int channel = 3;
+
+  uint8* const dstdata = allocate_output(num_frames, width, height, channel);
+  if (dstdata == nullptr) {
+    LOG(ERROR) << "Fail to allocate output for gif file";
+    return nullptr;
+  }
+
+  // Computed in size_t so that large animations cannot overflow int offsets.
+  const size_t row_bytes = static_cast<size_t>(width) * channel;
+  const size_t frame_bytes = row_bytes * height;
+  for (int k = 0; k < num_frames; k++) {
+    const SavedImage* this_image = &gif_file->SavedImages[k];
+    const GifImageDesc* img_desc = &this_image->ImageDesc;
+    if (img_desc->Left != 0 || img_desc->Top != 0 || img_desc->Width != width ||
+        img_desc->Height != height) {
+      LOG(ERROR) << "Can't process optimized gif.";
+      return nullptr;
+    }
+
+    // A frame-local color map takes precedence over the global one.
+    const ColorMapObject* color_map =
+        img_desc->ColorMap ? img_desc->ColorMap : gif_file->SColorMap;
+    if (color_map == nullptr) {
+      LOG(ERROR) << "Gif frame " << k << " has no color map";
+      return nullptr;
+    }
+
+    uint8* this_dst = dstdata + k * frame_bytes;
+    for (int i = 0; i < height; ++i) {
+      uint8* p_dst = this_dst + i * row_bytes;
+      for (int j = 0; j < width; ++j) {
+        const GifByteType color_index =
+            this_image->RasterBits[static_cast<size_t>(i) * width + j];
+        if (color_index >= color_map->ColorCount) {
+          LOG(ERROR) << "Gif color index " << static_cast<int>(color_index)
+                     << " is outside of the color map";
+          return nullptr;
+        }
+        const GifColorType& gif_color = color_map->Colors[color_index];
+        p_dst[j * channel + 0] = gif_color.Red;
+        p_dst[j * channel + 1] = gif_color.Green;
+        p_dst[j * channel + 2] = gif_color.Blue;
+      }
+    }
+  }
+
+  // gif_file is closed by GifFileCloser when it goes out of scope.
+  return dstdata;
+}
+
+}  // namespace gif
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/gif/gif_io.h b/tensorflow/core/lib/gif/gif_io.h
new file mode 100644
index 0000000000..d7aa2845cf
--- /dev/null
+++ b/tensorflow/core/lib/gif/gif_io.h
@@ -0,0 +1,51 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Functions to read images in GIF format.
+//
+// Decode() reads a GIF held in memory at `srcdata` (`datasize` bytes) and
+// converts every frame to 8-bit RGB.
+//
+// The caller supplies `allocate_output`, which Decode() invokes as
+// allocate_output(num_frames, width, height, channels) once the image
+// dimensions are known; it must return the buffer that receives the pixels.
+// `channels` is always 3.
+//
+// Decode() returns that buffer on success and nullptr on failure. GIFs whose
+// frames do not each cover the full canvas ("optimized" GIFs) are rejected.
+// Such files can be expanded beforehand with:
+//   convert $src.gif -coalesce $dst.gif
+
+#ifndef TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_
+#define TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace gif {
+
+uint8* Decode(const void* srcdata, int datasize,
+              std::function<uint8*(int, int, int, int)> allocate_output);
+
+}  // namespace gif
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_
diff --git a/tensorflow/core/lib/gif/testdata/optimized.gif b/tensorflow/core/lib/gif/testdata/optimized.gif
new file mode 100644
index 0000000000..137a0ad0da
--- /dev/null
+++ b/tensorflow/core/lib/gif/testdata/optimized.gif
diff --git a/tensorflow/core/lib/gif/testdata/scan.gif b/tensorflow/core/lib/gif/testdata/scan.gif
new file mode 100644
index 0000000000..7ba430a145
--- /dev/null
+++ b/tensorflow/core/lib/gif/testdata/scan.gif
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 75252f0dad..ec0bfa3284 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -739,7 +739,7 @@ REGISTER_OP("Reverse")
     .Input("tensor: T")
     .Input("dims: bool")
     .Output("output: T")
-    .Attr("T: {uint8, int8, int32, bool, half, float, double}")
+    .Attr("T: {uint8, int8, int32, bool, half, float, double, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       const Shape* input = c->input(0);
       const Shape* dims;
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 81e7c5e1d2..5098c44437 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -441,9 +441,26 @@ contents: 0-D. PNG-encoded image.
 )doc");
 
 // --------------------------------------------------------------------------
+REGISTER_OP("DecodeGif")
+    .Input("contents: string")
+    .Output("image: uint8")
+    .Doc(R"doc(
+Decode all frames of a GIF-encoded image to a uint8 tensor.
+
+GIFs with frame or transparency compression are not supported. Convert an
+animated GIF from compressed to uncompressed with:
+
+    convert $src.gif -coalesce $dst.gif
+
+contents: 0-D.  The GIF-encoded image.
+image: 4-D with shape `[num_frames, height, width, 3]`. RGB order
+)doc");
+
+// --------------------------------------------------------------------------
 REGISTER_OP("RGBToHSV")
-    .Input("images: float")
-    .Output("output: float")
+    .Input("images: T")
+    .Output("output: T")
+    .Attr("T: {float, double} = DT_FLOAT")
     .SetShapeFn(ColorspaceShapeFn)
     .Doc(R"doc(
 Converts one or more images from RGB to HSV.
@@ -462,8 +479,9 @@ output: `images` converted to HSV.
 
 // --------------------------------------------------------------------------
 REGISTER_OP("HSVToRGB")
-    .Input("images: float")
-    .Output("output: float")
+    .Input("images: T")
+    .Output("output: T")
+    .Attr("T: {float, double} = DT_FLOAT")
     .SetShapeFn(ColorspaceShapeFn)
     .Doc(R"doc(
 Convert one or more images from HSV to RGB.
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 01e2fd134c..b8a7efc2c5 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1831,4 +1831,76 @@ b: Another tensor, of same type and shape as `a`.
 product: Pairwise cross product of the vectors in `a` and `b`.
 )doc");
 
+// --------------------------------------------------------------------------
+// Cumsum: running sum of `x` along `axis` (optionally exclusive / reversed).
+REGISTER_OP("Cumsum")
+    .Input("x: T")
+    .Input("axis: int32")
+    .Attr("exclusive: bool = false")
+    .Attr("reverse: bool = false")
+    .Output("out: T")
+    .Attr("T: numbertype")
+    .Doc(R"doc(
+Compute the cumulative sum of the tensor `x` along `axis`.
+
+By default, this op performs an inclusive cumsum, which means that the first
+element of the input is identical to the first element of the output:
+```prettyprint
+tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c]
+```
+
+By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+performed instead:
+```prettyprint
+tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b]
+```
+
+By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+opposite direction:
+```prettyprint
+tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c]
+```
+This is more efficient than using separate `tf.reverse` ops.
+
+The `reverse` and `exclusive` kwargs can also be combined:
+```prettyprint
+tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
+```
+)doc");
+
+REGISTER_OP("Cumprod")
+    .Input("x: T")
+    .Input("axis: int32")
+    .Attr("exclusive: bool = false")
+    .Attr("reverse: bool = false")
+    .Output("out: T")
+    .Attr("T: numbertype")
+    .Doc(R"doc(
+Compute the cumulative product of the tensor `x` along `axis`.
+
+By default, this op performs an inclusive cumprod, which means that the first
+element of the input is identical to the first element of the output:
+```prettyprint
+tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
+```
+
+By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+performed instead:
+```prettyprint
+tf.cumprod([a, b, c], exclusive=True) ==> [1, a, a * b]
+```
+
+By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+opposite direction:
+```prettyprint
+tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c]
+```
+This is more efficient than using separate `tf.reverse` ops.
+
+The `reverse` and `exclusive` kwargs can also be combined:
+```prettyprint
+tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 1]
+```
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 8b53833aec..1569336a19 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4343,6 +4343,21 @@ op {
   description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the PNG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n*   4: output an RGBA image.\n\nIf needed, the PNG-encoded image is transformed to match the requested number\nof color channels."
 }
 op {
+  name: "DecodeGif"
+  input_arg {
+    name: "contents"
+    description: "0-D.  The GIF-encoded image."
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    description: "4-D with shape `[num_frames, height, width, 3]`. RGB order"
+    type: DT_UINT8
+  }
+  summary: "Decode the first frame of a GIF-encoded image to a uint8 tensor."
+  description: "GIF with frame or transparency compression are not supported\nconvert animated GIF from compressed to uncompressed by:\n\nconvert $src.gif -coalesce $dst.gif"
+}
+op {
   name: "DecodeRaw"
   input_arg {
     name: "bytes"
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 28af0bbfe2..b260a588e8 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -488,11 +488,13 @@ REGISTER_OP("ApplyMomentum")
     .Output("out: Ref(T)")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyMomentumShapeFn(c, false /* sparse */);
     })
     .Doc(R"doc(
-Update '*var' according to the momentum scheme.
+Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+want to use Nesterov momentum.
 
 accum = accum * momentum + grad
 var -= lr * accum
@@ -506,6 +508,9 @@ out: Same as "var".
 use_locking: If `True`, updating of the var and accum tensors will be protected
   by a lock; otherwise the behavior is undefined, but may exhibit less
   contention.
+use_nesterov: If `True`, the tensor passed to compute grad will be 
+var - lr * momentum * accum, so in the end, the var you get is actually
+var - lr * momentum * accum.
 )doc");
 
 REGISTER_OP("SparseApplyMomentum")
@@ -519,11 +524,13 @@ REGISTER_OP("SparseApplyMomentum")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyMomentumShapeFn(c, true /* sparse */);
     })
     .Doc(R"doc(
 Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+Set use_nesterov = True if you want to use Nesterov momentum.
 
 That is for rows we have grad for, we update var and accum as follows:
 
@@ -540,6 +547,9 @@ out: Same as "var".
 use_locking: If `True`, updating of the var and accum tensors will be protected
   by a lock; otherwise the behavior is undefined, but may exhibit less
   contention.
+use_nesterov: If `True`, the tensor passed to compute grad will be 
+var - lr * momentum * accum, so in the end, the var you get is actually
+var - lr * momentum * accum.
 )doc");
 
 static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) {
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 1619be9201..f372d2ef0d 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -57,12 +57,13 @@ cc_library(
     name = "platformlib",
     copts = tf_copts(),
     deps = [
+        "//tensorflow/core:protos_cc",
         "@farmhash_archive//:farmhash",
+        "@gif_archive//:gif",
+        "@highwayhash//:sip_hash",
         "@jpeg_archive//:jpeg",
         "@png_archive//:png",
-        "@highwayhash//:sip_hash",
         "@re2//:re2",
-        "//tensorflow/core:protos_cc",
     ],
 )
 
diff --git a/tensorflow/core/platform/gif.h b/tensorflow/core/platform/gif.h
new file mode 100644
index 0000000000..d5567abeea
--- /dev/null
+++ b/tensorflow/core/platform/gif.h
@@ -0,0 +1,29 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_GIF_H_
+#define TENSORFLOW_CORE_PLATFORM_GIF_H_
+
+#include "tensorflow/core/platform/platform.h"
+
+#if defined(PLATFORM_GOOGLE)
+#include "tensorflow/core/platform/google/build_config/gif.h"
+#elif defined(PLATFORM_POSIX) && !defined(IS_MOBILE_PLATFORM)
+#include "giflib-5.1.4/lib/gif_lib.h"
+#else
+#error Define the appropriate PLATFORM_<foo> macro for this platform
+#endif
+
+#endif  // TENSORFLOW_CORE_PLATFORM_GIF_H_
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py
index 8c696e1c78..8a43158062 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py
@@ -94,8 +94,8 @@ def run_training():
     saver = tf.train.Saver()
 
     # Create the op for initializing variables.
-    init_op = tf.initialize_all_variables()
-
+    init_op = tf.group(tf.initialize_all_variables(),
+                       tf.initialize_local_variables())
     # Create a session for running Ops on the Graph.
     sess = tf.Session()
 
diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc
index e4cc11dbe0..7faf1cef61 100644
--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@@ -99,8 +99,10 @@ Status ReadTensorFromImageFile(string file_name, const int input_height,
   if (tensorflow::StringPiece(file_name).ends_with(".png")) {
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
+  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
+    image_reader = DecodeGif(root.WithOpName("gif_reader"), file_reader);
   } else {
-    // Assume if it's not a PNG then it must be a JPEG.
+    // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
     image_reader = DecodeJpeg(root.WithOpName("jpeg_reader"), file_reader,
                               DecodeJpeg::Channels(wanted_channels));
   }
diff --git a/tensorflow/examples/skflow/resnet.py b/tensorflow/examples/skflow/resnet.py
index 03a5d5e519..d67022d457 100644..100755
--- a/tensorflow/examples/skflow/resnet.py
+++ b/tensorflow/examples/skflow/resnet.py
@@ -52,13 +52,13 @@ def res_net(x, y, activation=tf.nn.relu):
     Predictions and loss tensors.
   """
 
-  # Configurations for each bottleneck block.
-  BottleneckBlock = namedtuple(
-      'BottleneckBlock', ['num_layers', 'num_filters', 'bottleneck_size'])
-  blocks = [BottleneckBlock(3, 128, 32),
-            BottleneckBlock(3, 256, 64),
-            BottleneckBlock(3, 512, 128),
-            BottleneckBlock(3, 1024, 256)]
+  # Configurations for each bottleneck group.
+  BottleneckGroup = namedtuple(
+      'BottleneckGroup', ['num_blocks', 'num_filters', 'bottleneck_size'])
+  groups = [BottleneckGroup(3, 128, 32),
+            BottleneckGroup(3, 256, 64),
+            BottleneckGroup(3, 512, 128),
+            BottleneckGroup(3, 1024, 256)]
 
   input_shape = x.get_shape().as_list()
 
@@ -78,19 +78,19 @@ def res_net(x, y, activation=tf.nn.relu):
 
   # First chain of resnets
   with tf.variable_scope('conv_layer2'):
-    net = learn.ops.conv2d(net, blocks[0].num_filters,
+    net = learn.ops.conv2d(net, groups[0].num_filters,
                            [1, 1], [1, 1, 1, 1],
                            padding='VALID', bias=True)
 
-  # Create each bottleneck building block for each layer
-  for block_i, block in enumerate(blocks):
-    for layer_i in range(block.num_layers):
-
-      name = 'block_%d/layer_%d' % (block_i, layer_i)
+  # Create the bottleneck groups, each of which contains `num_blocks`
+  # bottleneck blocks.
+  for group_i, group in enumerate(groups):
+    for block_i in range(group.num_blocks):
+      name = 'group_%d/block_%d' % (group_i, block_i)
 
       # 1x1 convolution responsible for reducing dimension
       with tf.variable_scope(name + '/conv_in'):
-        conv = learn.ops.conv2d(net, block.bottleneck_size,
+        conv = learn.ops.conv2d(net, group.bottleneck_size,
                                 [1, 1], [1, 1, 1, 1],
                                 padding='VALID',
                                 activation=activation,
@@ -98,7 +98,7 @@ def res_net(x, y, activation=tf.nn.relu):
                                 bias=False)
 
       with tf.variable_scope(name + '/conv_bottleneck'):
-        conv = learn.ops.conv2d(conv, block.bottleneck_size,
+        conv = learn.ops.conv2d(conv, group.bottleneck_size,
                                 [3, 3], [1, 1, 1, 1],
                                 padding='SAME',
                                 activation=activation,
@@ -107,7 +107,8 @@ def res_net(x, y, activation=tf.nn.relu):
 
       # 1x1 convolution responsible for restoring dimension
       with tf.variable_scope(name + '/conv_out'):
-        conv = learn.ops.conv2d(conv, block.num_filters,
+        input_dim = net.get_shape()[-1].value
+        conv = learn.ops.conv2d(conv, input_dim,
                                 [1, 1], [1, 1, 1, 1],
                                 padding='VALID',
                                 activation=activation,
@@ -118,16 +119,16 @@ def res_net(x, y, activation=tf.nn.relu):
       # residual function (identity shortcut)
       net = conv + net
 
-      try:
-        # upscale to the next block size
-        next_block = blocks[block_i + 1]
-        with tf.variable_scope('block_%d/conv_upscale' % block_i):
-          net = learn.ops.conv2d(net, next_block.num_filters,
-                                 [1, 1], [1, 1, 1, 1],
-                                 bias=False,
-                                 padding='SAME')
-      except IndexError:
-        pass
+    try:
+      # upscale to the next group size
+      next_group = groups[group_i + 1]
+      with tf.variable_scope('block_%d/conv_upscale' % group_i):
+        net = learn.ops.conv2d(net, next_group.num_filters,
+                               [1, 1], [1, 1, 1, 1],
+                               bias=False,
+                               padding='SAME')
+    except IndexError:
+      pass
 
   net_shape = net.get_shape().as_list()
   net = tf.nn.avg_pool(net,
@@ -139,18 +140,12 @@ def res_net(x, y, activation=tf.nn.relu):
 
   return learn.models.logistic_regression(net, y)
 
-
 # Download and load MNIST data.
 mnist = input_data.read_data_sets('MNIST_data')
 
 # Restore model if graph is saved into a folder.
 if os.path.exists('models/resnet/graph.pbtxt'):
   classifier = learn.TensorFlowEstimator.restore('models/resnet/')
-else:
-  # Create a new resnet classifier.
-  classifier = learn.TensorFlowEstimator(
-      model_fn=res_net, n_classes=10, batch_size=100, steps=100,
-      learning_rate=0.001, continue_training=True)
 
 while True:
   # Train model and save summaries into logdir.
@@ -161,6 +156,3 @@ while True:
   score = metrics.accuracy_score(
       mnist.test.labels, classifier.predict(mnist.test.images, batch_size=64))
   print('Accuracy: {0:f}'.format(score))
-
-  # Save model graph and checkpoints.
-  classifier.save('models/resnet/')
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index 2936deb9e3..c83239217a 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -49,7 +49,7 @@ def train():
 
   # Create a multilayer model.
 
-  # Input placehoolders
+  # Input placeholders
   with tf.name_scope('input'):
     x = tf.placeholder(tf.float32, [None, 784], name='x-input')
     y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md
index 4743ab557b..1b0e5df5ee 100644
--- a/tensorflow/examples/udacity/README.md
+++ b/tensorflow/examples/udacity/README.md
@@ -6,7 +6,11 @@ Course information can be found at https://www.udacity.com/course/deep-learning-
 Running the Docker container from the Google Cloud repository
 -------------------------------------------------------------
 
-    docker run -p 8888:8888 -it b.gcr.io/tensorflow-udacity/assignments:0.5.0
+    docker run -p 8888:8888 --name tensorflow-udacity -it b.gcr.io/tensorflow-udacity/assignments:0.5.0
+
+Note that if you ever exit the container, you can return to it using:
+
+    docker start -ai tensorflow-udacity
 
 Accessing the Notebooks
 -----------------------
@@ -19,21 +23,6 @@ On mac, find the virtual machine's IP using:
 
 Then go to: http://IP:8888 (likely http://192.168.99.100:8888)
 
-Saving Your Progress
---------------------
-
-Because of the `--rm` flag above, stopping the docker container removes it, so any changes you've made will disappear. One way around this is to remove the `--rm` flag, and name the container for easy restarting:
-```sh
-# you only need to "run" the container the first time:
-docker run -p 8888:8888 -it --name tensorflow-udacity b.gcr.io/tensorflow-udacity/assignments:0.5.0
-# …do various things…
-# when you're done, control-C to kill jupyter and stop the container
-# when you're ready to do more things, you can now just "start" the container:
-docker start -ai tensorflow-udacity
-# …do more things…
-# …repeat…
-```
-
 FAQ
 ---
 
diff --git a/tensorflow/g3doc/get_started/os_setup.md b/tensorflow/g3doc/get_started/os_setup.md
index e1cece4faa..92f77b27b0 100644
--- a/tensorflow/g3doc/get_started/os_setup.md
+++ b/tensorflow/g3doc/get_started/os_setup.md
@@ -44,7 +44,7 @@ management system used to install and manage software packages written in
 Python.
 
 The packages that will be installed or upgraded during the pip install are listed in the
-[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py)
+[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py).
 
 Install pip (or pip3 for python3) if it is not already installed:
 
@@ -231,7 +231,7 @@ packages needed by TensorFlow.
 
 Install Anaconda:
 
-Follow the instructions on the [Anaconda download site](https://www.continuum.io/downloads)
+Follow the instructions on the [Anaconda download site](https://www.continuum.io/downloads).
 
 Create a conda environment called `tensorflow`:
 
@@ -377,6 +377,8 @@ The option `-p 8888:8888` is used to publish the Docker container᾿s internal p
 
 The format of the port mapping is `hostPort:containerPort`. You can specify any valid port number for the host port but have to use `8888` for the container port portion.
 
+If you're using a container with GPU support, some additional flags must be passed to expose the GPU device to the container.
+
 For NVidia GPU support install latest NVidia drivers and
 [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
 Run with
@@ -385,7 +387,15 @@ Run with
 $ nvidia-docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu
 ```
 
-For more details see (TensorFlow docker readme)[https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/docker].
+If you have a problem running `nvidia-docker`, you can use the default config instead; we include a
+[script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/docker_run_gpu.sh)
+in the repo with these flags, so the command-line would look like
+
+```bash
+$ path/to/repo/tensorflow/tools/docker/docker_run_gpu.sh -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu
+```
+
+For more details see [TensorFlow docker readme](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/docker).
 
 You can now [test your installation](#test-the-tensorflow-installation) within the Docker container.
 
@@ -479,7 +489,7 @@ of tensorflow. If you want to install a specific branch (such as a release branc
 pass `-b <branchname>` to the `git clone` command and `--recurse-submodules` for
 r0.8 and earlier to fetch the protobuf library that TensorFlow depends on.
 
-### Installation for Linux
+### Prepare environment for Linux
 
 #### Install Bazel
 
@@ -508,19 +518,6 @@ $ sudo apt-get install python-numpy swig python-dev python-wheel
 $ sudo apt-get install python3-numpy swig python3-dev python3-wheel
 ```
 
-#### Configure the installation
-
-Run the `configure` script at the root of the tree.  The configure script
-asks you for the path to your python interpreter and allows (optional)
-configuration of the CUDA libraries (see [below](#configure-tensorflows-canonical-view-of-cuda-libraries)).
-
-This step is used to locate the python and numpy header files.
-
-```bash
-$ ./configure
-Please specify the location of python. [Default is /usr/bin/python]:
-```
-
 #### Optional: Install CUDA (GPUs on Linux)
 
 In order to build or run TensorFlow with GPU support, both NVIDIA's Cuda Toolkit (>= 7.0) and
@@ -564,83 +561,7 @@ sudo cp cuda/lib64/libcudnn* /usr/local/cuda/lib64
 sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
 ```
 
-##### Configure TensorFlow's canonical view of Cuda libraries
-
-When running the `configure` script from the root of your source tree, select
-the option `Y` when asked to build TensorFlow with GPU support. If you have
-several versions of Cuda or cuDNN installed, you should definitely select
-one explicitly instead of relying on the system default. You should see
-prompts like the following:
-
-``` bash
-$ ./configure
-Please specify the location of python. [Default is /usr/bin/python]:
-Do you wish to build TensorFlow with GPU support? [y/N] y
-GPU support will be enabled for TensorFlow
-
-Please specify which gcc nvcc should use as the host compiler. [Default is
-/usr/bin/gcc]: /usr/bin/gcc-4.9
-
-Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave
-empty to use system default]: 7.5
-
-Please specify the location where CUDA 7.5 toolkit is installed. Refer to
-README.md for more details. [default is: /usr/local/cuda]: /usr/local/cuda
-
-Please specify the cuDNN version you want to use. [Leave empty to use system
-default]: 4.0.4
-
-Please specify the location where the cuDNN 4.0.4 library is installed. Refer to
-README.md for more details. [default is: /usr/local/cuda]: /usr/local/cudnn-r4-rc/
-
-Please specify a list of comma-separated Cuda compute capabilities you want to
-build with. You can find the compute capability of your device at:
-https://developer.nvidia.com/cuda-gpus.
-Please note that each additional compute capability significantly increases your
-build time and binary size. [Default is: \"3.5,5.2\"]: 3.5
-
-Setting up Cuda include
-Setting up Cuda lib64
-Setting up Cuda bin
-Setting up Cuda nvvm
-Setting up CUPTI include
-Setting up CUPTI lib64
-Configuration finished
-```
-
-This creates a canonical set of symbolic links to the Cuda libraries on your system.
-Every time you change the Cuda library paths you need to run this step again before
-you invoke the bazel build command. For the cuDNN libraries, use '6.5' for R2, '7.0'
-for R3, and '4.0.4' for R4-RC.
-
-
-##### Build your target with GPU support
-From the root of your source tree, run:
-
-```bash
-$ bazel build -c opt --config=cuda //tensorflow/cc:tutorials_example_trainer
-
-$ bazel-bin/tensorflow/cc/tutorials_example_trainer --use_gpu
-# Lots of output. This tutorial iteratively calculates the major eigenvalue of
-# a 2x2 matrix, on GPU. The last few lines look like this.
-000009/000005 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
-000006/000001 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
-000009/000009 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
-```
-
-Note that "--config=cuda" is needed to enable the GPU support.
-
-##### Known issues
-
-* Although it is possible to build both Cuda and non-Cuda configs under the same
-source tree, we recommend to run `bazel clean` when switching between these two
-configs in the same source tree.
-
-* You have to run configure before running bazel build. Otherwise, the build
-will fail with a clear error message. In the future, we might consider making
-this more convenient by including the configure step in our build process.
-
-### Installation for Mac OS X
+### Prepare environment for Mac OS X
 
 We recommend using [homebrew](http://brew.sh) to install the bazel and SWIG
 dependencies, and installing python dependencies using easy_install or pip.
@@ -713,15 +634,20 @@ $ sudo mv lib/libcudnn* /Developer/NVIDIA/CUDA-7.5/lib
 $ sudo ln -s /Developer/NVIDIA/CUDA-7.5/lib/libcudnn* /usr/local/cuda/lib/
 ```
 
-#### Configure the installation
+### Configure the installation
 
 Run the `configure` script at the root of the tree.  The configure script
-asks you for the path to your python interpreter.
+asks you for the path to your python interpreter and allows (optional)
+configuration of the CUDA libraries.
 
 This step is used to locate the python and numpy header files as well as
-enabling GPU support if you have a CUDA enabled GPU and Toolkit installed. For
-example:
+enabling GPU support if you have a CUDA enabled GPU and Toolkit installed.
+Select the option `Y` when asked to build TensorFlow with GPU support.
+
+If you have several versions of Cuda or cuDNN installed, you should definitely
+select one explicitly instead of relying on the system default.
 
+For example:
 
 ```bash
 $ ./configure
@@ -748,6 +674,38 @@ Setting up CUPTI lib64
 Configuration finished
 ```
 
+This creates a canonical set of symbolic links to the Cuda libraries on your system.
+Every time you change the Cuda library paths you need to run this step again before
+you invoke the bazel build command. For the cuDNN libraries, use '6.5' for R2, '7.0'
+for R3, and '4.0.4' for R4-RC.
+
+#### Build your target with GPU support
+From the root of your source tree, run:
+
+```bash
+$ bazel build -c opt --config=cuda //tensorflow/cc:tutorials_example_trainer
+
+$ bazel-bin/tensorflow/cc/tutorials_example_trainer --use_gpu
+# Lots of output. This tutorial iteratively calculates the major eigenvalue of
+# a 2x2 matrix, on GPU. The last few lines look like this.
+000009/000005 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
+000006/000001 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
+000009/000009 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
+```
+
+Note that "--config=cuda" is needed to enable the GPU support.
+
+#### Known issues
+
+* Although it is possible to build both Cuda and non-Cuda configs under the same
+source tree, we recommend running `bazel clean` when switching between these two
+configs in the same source tree.
+
+* You have to run configure before running bazel build. Otherwise, the build
+will fail with a clear error message. In the future, we might consider making
+this more convenient by including the configure step in our build process.
+
+
 ### Create the pip package and install
 
 When building from source, you will still build a pip package and install that.
diff --git a/tensorflow/g3doc/how_tos/image_retraining/index.md b/tensorflow/g3doc/how_tos/image_retraining/index.md
index 60de27d36b..278398f2c0 100644
--- a/tensorflow/g3doc/how_tos/image_retraining/index.md
+++ b/tensorflow/g3doc/how_tos/image_retraining/index.md
@@ -131,7 +131,7 @@ Once TensorBoard is running, navigate your web browser to `localhost:6006` to vi
 
 The script will log TensorBoard summaries to `/tmp/retrain_logs` by default. You can change the directory with the `--summaries_dir` flag.
 
-The [TensorBoard README](../../../tensorboard/README.md) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.
+The [TensorBoard README](https://www.tensorflow.org/code/tensorflow/tensorboard/README.md) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.
 
 ## Using the Retrained Model
 
diff --git a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md
index bf412774e8..0d733ce994 100644
--- a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md
+++ b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md
@@ -8,7 +8,7 @@ your TensorFlow graph, plot quantitative metrics about the execution of your
 graph, and show additional data like images that pass through it. When
 TensorBoard is fully configured, it looks like this:
 
-[![MNIST TensorBoard](../../images/mnist_tensorboard.png "MNIST TensorBoard")](http://tensorflow.org/tensorboard)
+[![MNIST TensorBoard](../../images/mnist_tensorboard.png "MNIST TensorBoard")](http://tensorflow.org/tensorboard)  
 [*Click try a TensorBoard with data from this tutorial!*](http://tensorflow.org/tensorboard)
 
 This tutorial is intended to get you started with simple TensorBoard usage.
diff --git a/tensorflow/g3doc/resources/index.md b/tensorflow/g3doc/resources/index.md
index 2c5d06946c..f00bfd31b4 100644
--- a/tensorflow/g3doc/resources/index.md
+++ b/tensorflow/g3doc/resources/index.md
@@ -37,6 +37,7 @@ The TensorFlow community has created many great projects around TensorFlow, incl
 * [TensorFlow tutorials](https://github.com/pkmital/tensorflow_tutorials)
 * [Scikit Flow - Simplified Interface for TensorFlow](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/learn/python/learn)
 * [Caffe to TensorFlow model converter](https://github.com/ethereon/caffe-tensorflow)
+* [Bitfusion's GPU-enabled AWS EC2 TensorFlow AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-tensorflow) ([Launch AMI](https://aws.amazon.com/marketplace/pp/B01EYKBEQ0))
 
 ### Development
 
diff --git a/tensorflow/g3doc/tutorials/mnist/pros/index.md b/tensorflow/g3doc/tutorials/mnist/pros/index.md
index aa3ae13fa6..f0bb36220a 100644
--- a/tensorflow/g3doc/tutorials/mnist/pros/index.md
+++ b/tensorflow/g3doc/tutorials/mnist/pros/index.md
@@ -190,11 +190,11 @@ accomplished by repeatedly running `train_step`.
 
 ```python
 for i in range(1000):
-  batch = mnist.train.next_batch(50)
+  batch = mnist.train.next_batch(100)
   train_step.run(feed_dict={x: batch[0], y_: batch[1]})
 ```
 
-Each training iteration we load 50 training examples. We then run the
+Each training iteration we load 100 training examples. We then run the
 `train_step` operation, using `feed_dict` to replace the `placeholder` tensors
 `x` and `y_` with the training examples.
 Note that you can replace any tensor in your computation graph using `feed_dict`
diff --git a/tensorflow/g3doc/tutorials/recurrent/index.md b/tensorflow/g3doc/tutorials/recurrent/index.md
index 5ed26a5e04..5215563332 100644
--- a/tensorflow/g3doc/tutorials/recurrent/index.md
+++ b/tensorflow/g3doc/tutorials/recurrent/index.md
@@ -178,6 +178,7 @@ https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/get_starte
 [bazel](https://github.com/bazelbuild/bazel)).
 
 Next:
+
 ```bash
 cd tensorflow/models/rnn/ptb
 python ptb_word_lm.py --data_path=/tmp/simple-examples/data/ --model small
diff --git a/tensorflow/g3doc/tutorials/tflearn/index.md b/tensorflow/g3doc/tutorials/tflearn/index.md
index 9480e899d8..88f7da506a 100644
--- a/tensorflow/g3doc/tutorials/tflearn/index.md
+++ b/tensorflow/g3doc/tutorials/tflearn/index.md
@@ -240,10 +240,11 @@ second sample is *Iris virginica*.
 * For further reference materials on tf.contrib.learn, see the official
 [API docs](../../api_docs/python/contrib.learn.md).
 
-<!-- David, will the below be live when this tutorial is released? -->
 * To learn more about using tf.contrib.learn to create linear models, see 
 [Large-scale Linear Models with TensorFlow](../linear/).
 
+* To build your own Estimator using tf.contrib.learn APIs, check out [Building Machine Learning Estimator in TensorFlow](http://terrytangyuan.github.io/2016/07/08/understand-and-build-tensorflow-estimator/).
+
 * To experiment with neural network modeling and visualization in the browser,
 check out [Deep Playground](http://playground.tensorflow.org/).
 
diff --git a/tensorflow/models/embedding/word2vec.py b/tensorflow/models/embedding/word2vec.py
index b8afbd2ced..97b38b7f47 100644
--- a/tensorflow/models/embedding/word2vec.py
+++ b/tensorflow/models/embedding/word2vec.py
@@ -378,7 +378,8 @@ class Word2Vec(object):
     opts = self._options
     with open(os.path.join(opts.save_path, "vocab.txt"), "w") as f:
       for i in xrange(opts.vocab_size):
-        f.write("%s %d\n" % (tf.compat.as_text(opts.vocab_words[i]),
+        vocab_word = tf.compat.as_text(opts.vocab_words[i]).encode("utf-8")
+        f.write("%s %d\n" % (vocab_word,
                              opts.vocab_counts[i]))
 
   def _train_thread_body(self):
diff --git a/tensorflow/models/image/mnist/convolutional.py b/tensorflow/models/image/mnist/convolutional.py
index 1893e68121..26e4a6ac8f 100644
--- a/tensorflow/models/image/mnist/convolutional.py
+++ b/tensorflow/models/image/mnist/convolutional.py
@@ -82,10 +82,10 @@ def extract_data(filename, num_images):
   print('Extracting', filename)
   with gzip.open(filename) as bytestream:
     bytestream.read(16)
-    buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images)
+    buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS)
     data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32)
     data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH
-    data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, 1)
+    data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)
     return data
 
 
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 3d119799ab..678a745299 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -146,6 +146,7 @@ cuda_py_tests(
         "reverse_sequence_op_test.py",
         "rnn_cell_test.py",
         "scalar_strict_test.py",
+        "scan_ops_test.py",
         "session_ops_test.py",
         "shape_ops_test.py",
         "softmax_op_test.py",
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 4332a0facc..d908f9a7f6 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -198,14 +198,19 @@ class ReverseTest(test_util.TensorFlowTestCase):
         x_tf = array_ops.reverse(x_np, []).eval()
         self.assertAllEqual(x_tf, x_np)
 
-  def testReverse1DimAuto(self):
-    x_np = [1, 4, 9]
+  def _reverse1DimAuto(self, np_dtype):
+    x_np = np.array([1, 2, 3, 4, 5], dtype=np_dtype)
 
     for use_gpu in [False, True]:
       with self.test_session(use_gpu=use_gpu):
         x_tf = array_ops.reverse(x_np, [True]).eval()
         self.assertAllEqual(x_tf, np.asarray(x_np)[::-1])
 
+  def testReverse1DimAuto(self):
+    for dtype in [np.uint8, np.int8, np.int32, np.bool, np.float16,
+                  np.float32, np.float64, np.complex64, np.complex128]:
+      self._reverse1DimAuto(dtype)
+
   def testUnknownDims(self):
     data_t = tf.placeholder(tf.float32)
     dims_known_t = tf.placeholder(tf.bool, shape=[3])
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index dd96446662..cb013dee00 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -432,16 +432,13 @@ class ProdReductionTest(tf.test.TestCase):
     self._compareAll(np_arr, [0, 2])
     self._compareAll(np_arr, [0, 1, 2])
 
-  def testGradient(self):
-    s = [2, 3, 4, 2]
-    # NOTE(kearnes): divide by 20 so product is a reasonable size
-    x = np.arange(1.0, 49.0).reshape(s).astype(np.float32) / 20.
+  def _compareGradient(self, x):
     with self.test_session():
       t = tf.convert_to_tensor(x)
 
       su = tf.reduce_prod(t, [])
       jacob_t, jacob_n = tf.test.compute_gradient(t,
-                                                  s,
+                                                  x.shape,
                                                   su,
                                                   [2, 3, 4, 2],
                                                   x_init_value=x,
@@ -450,7 +447,7 @@ class ProdReductionTest(tf.test.TestCase):
 
       su = tf.reduce_prod(t, [1, 2])
       jacob_t, jacob_n = tf.test.compute_gradient(t,
-                                                  s,
+                                                  x.shape,
                                                   su,
                                                   [2, 2],
                                                   x_init_value=x,
@@ -459,26 +456,34 @@ class ProdReductionTest(tf.test.TestCase):
 
       su = tf.reduce_prod(t, [0, 1, 2, 3])
       jacob_t, jacob_n = tf.test.compute_gradient(t,
-                                                  s,
+                                                  x.shape,
                                                   su,
                                                   [1],
                                                   x_init_value=x,
                                                   delta=1)
       self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
 
-    # NOTE(kearnes): the current gradient calculation gives NaNs for 0 inputs
-    x = np.arange(0.0, 48.0).reshape(s).astype(np.float32) / 20.
-    with self.test_session():
-      t = tf.convert_to_tensor(x)
-      su = tf.reduce_prod(t, [])
-      jacob_t, _ = tf.test.compute_gradient(t,
-                                            s,
-                                            su,
-                                            [2, 3, 4, 2],
-                                            x_init_value=x,
-                                            delta=1)
-      with self.assertRaisesOpError("Tensor had NaN values"):
-        tf.check_numerics(jacob_t, message="_ProdGrad NaN test").op.run()
+  def testGradientWithZeros(self):
+    s = [2, 3, 4, 2]
+    x = np.arange(1.0, 49.0).reshape(s).astype(np.float32) / 20.
+    # No zeros in input
+    self._compareGradient(x)
+    # Zero at beginning
+    x1 = x.copy()
+    x1[:,:,0,:] = 0
+    self._compareGradient(x1)
+    # Zero at end
+    x2 = x.copy()
+    x2[:,:,-1,:] = 0
+    self._compareGradient(x2)
+    # Zero in middle
+    x3 = x.copy()
+    x3[:,:,2,:] = 0
+    self._compareGradient(x3)
+    # All zeros
+    x4 = x.copy()
+    x4[:,:,:,:] = 0
+    self._compareGradient(x4)
 
   def testEmptyGradients(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/scan_ops_test.py b/tensorflow/python/kernel_tests/scan_ops_test.py
new file mode 100644
index 0000000000..1197b49a5f
--- /dev/null
+++ b/tensorflow/python/kernel_tests/scan_ops_test.py
@@ -0,0 +1,229 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functional tests for scan ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from itertools import combinations
+
+import numpy as np
+import tensorflow as tf
+
+
+def numpy_reverse(x, axis):
+  ix = [slice(None, None, -1)
+        if i == axis else slice(None) for i in range(len(x.shape))]
+  return x[ix]
+
+def handle_options(func, x, axis, exclusive, reverse):
+  """Emulates tf's `exclusive` and `reverse` scan options on top of np.cumsum/np.cumprod."""
+  if reverse:
+    x = numpy_reverse(x, axis)
+
+  if exclusive:
+    ix_head = [slice(0, 1) if i == axis else slice(None)
+                 for i in range(len(x.shape))]
+    ix_init = [slice(0, -1) if i == axis else slice(None)
+                 for i in range(len(x.shape))]
+    if func == np.cumsum:
+      init = np.zeros_like(x[ix_head])
+    elif func == np.cumprod:
+      init = np.ones_like(x[ix_head])
+    else:
+      raise ValueError("Unknown scan function")
+    x = np.concatenate([init, func(x[ix_init], axis)], axis=axis)
+  else:
+    x = func(x, axis=axis)
+
+  if reverse:
+    x = numpy_reverse(x, axis)
+  return x
+
+class CumsumTest(tf.test.TestCase):
+
+  valid_dtypes = [np.int32, np.int64, np.float16, np.float32,
+                  np.float64, np.complex64, np.complex128]
+
+  def _compare(self, x, axis, exclusive, reverse, use_gpu=False):
+    np_out = handle_options(np.cumsum, x, axis, exclusive, reverse)
+    with self.test_session(use_gpu=use_gpu):
+      tf_out = tf.cumsum(x, axis, exclusive, reverse).eval()
+
+    self.assertAllClose(np_out, tf_out)
+
+  def _compareAll(self, x, axis):
+    for exclusive in [True, False]:
+      for reverse in [True, False]:
+        for use_gpu in [True, False]:
+          self._compare(x, axis, exclusive, reverse, use_gpu)
+
+  def test1D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      self._compareAll(x, 0)
+
+  def test2D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(0, 10).reshape([2, 5]).astype(dtype)
+      self._compareAll(x, 0)
+      self._compareAll(x, 1)
+
+  def test3D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(0, 20).reshape([2, 2, 5]).astype(dtype)
+      self._compareAll(x, 0)
+      self._compareAll(x, 1)
+      self._compareAll(x, 2)
+
+  def testInvalidAxis(self):
+    x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
+    input_tensor = tf.convert_to_tensor(x)
+    with self.test_session():
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range" in str(e)):
+        tf.cumsum(input_tensor, -1).eval()
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range" in str(e)):
+        tf.cumsum(input_tensor, 2).eval()
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "axis must be a scalar" in str(e)):
+        tf.cumsum(input_tensor, [0]).eval()
+
+  def _compareGradient(self, shape, axis, exclusive, reverse):
+    x = np.arange(0, 50).reshape(shape).astype(np.float64)
+    with self.test_session():
+      t = tf.convert_to_tensor(x)
+      result = tf.cumsum(t, axis, exclusive, reverse)
+      jacob_t, jacob_n = tf.test.compute_gradient(t,
+                                                  shape,
+                                                  result,
+                                                  shape,
+                                                  x_init_value=x,
+                                                  delta=1)
+    self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
+
+  def testGradient(self):
+    self._compareGradient([50], 0, False, False)
+
+  def testGradientReverse(self):
+    self._compareGradient([50], 0, False, True)
+
+  def testGradientExclusive(self):
+    self._compareGradient([50], 0, True, False)
+
+  def testGradientExclusiveReverse(self):
+    self._compareGradient([50], 0, True, True)
+
+  def testGradient2D(self):
+    for axis in [0, 1]:
+      for exclusive in [True, False]:
+        for reverse in [True, False]:
+          self._compareGradient([5, 10], axis, exclusive, reverse)
+
+
+class CumprodTest(tf.test.TestCase):
+
+  valid_dtypes = [np.int32, np.int64, np.float16, np.float32,
+                  np.float64, np.complex64, np.complex128]
+
+  def _compare(self, x, axis, exclusive, reverse, use_gpu=False):
+    np_out = handle_options(np.cumprod, x, axis, exclusive, reverse)
+    with self.test_session(use_gpu=use_gpu):
+      tf_out = tf.cumprod(x, axis, exclusive, reverse).eval()
+
+    self.assertAllClose(np_out, tf_out)
+
+  def _compareAll(self, x, axis):
+    for exclusive in [True, False]:
+      for reverse in [True, False]:
+        for use_gpu in [True, False]:
+          self._compare(x, axis, exclusive, reverse, use_gpu)
+
+
+  def test1D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      self._compareAll(x, 0)
+
+  def test2D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 11).reshape([2, 5]).astype(dtype)
+      self._compareAll(x, 0)
+      self._compareAll(x, 1)
+
+  def test3D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 21).reshape([2, 2, 5]).astype(dtype)
+      self._compareAll(x, 0)
+      self._compareAll(x, 1)
+      self._compareAll(x, 2)
+
+  def testInvalidAxis(self):
+    x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
+    input_tensor = tf.convert_to_tensor(x)
+    with self.test_session():
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range" in str(e)):
+        tf.cumprod(input_tensor, -1).eval()
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range" in str(e)):
+        tf.cumprod(input_tensor, 2).eval()
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "axis must be a scalar" in str(e)):
+        tf.cumprod(input_tensor, [0]).eval()
+
+  def _compareGradient(self, shape, axis, exclusive, reverse):
+    x = np.arange(1, 9).reshape(shape).astype(np.float64)
+    with self.test_session():
+      t = tf.convert_to_tensor(x)
+      result = tf.cumprod(t, axis, exclusive, reverse)
+      jacob_t, jacob_n = tf.test.compute_gradient(t,
+                                                  shape,
+                                                  result,
+                                                  shape,
+                                                  x_init_value=x,
+                                                  delta=1)
+    self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
+
+  def testGradient(self):
+    self._compareGradient([8], 0, False, False)
+
+  def testGradientReverse(self):
+    self._compareGradient([8], 0, False, True)
+
+  def testGradientExclusive(self):
+    self._compareGradient([8], 0, True, False)
+
+  def testGradientExclusiveReverse(self):
+    self._compareGradient([8], 0, True, True)
+
+  def testGradient2D(self):
+    for axis in [0, 1]:
+      for exclusive in [True, False]:
+        for reverse in [True, False]:
+          self._compareGradient([2, 4], axis, exclusive, reverse)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index c2a15bba69..f63cf81247 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -1021,6 +1021,12 @@ def _ResizeShape(op):
   return [tensor_shape.TensorShape(
       [input_shape[0], height, width, input_shape[3]])]
 
+@ops.RegisterShape('DecodeGif')
+def _ImageDecodeShape(op):
+  """Shape fn for DecodeGif: scalar in, [num_frames, height, width, 3] out."""
+  unused_input_shape = op.inputs[0].get_shape().merge_with(
+      tensor_shape.scalar())
+  return [tensor_shape.TensorShape([None, None, None, 3])]
 
 @ops.RegisterShape('DecodeJpeg')
 @ops.RegisterShape('DecodePng')
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 6f30a851c9..0c8824d10a 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -27,6 +27,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -42,34 +43,37 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
     np.random.seed(7)
     batch_size = 5
     shape = (batch_size, 2, 7, 3)
-    inp = np.random.rand(*shape).astype(np.float32)
 
-    # Convert to HSV and back, as a batch and individually
-    with self.test_session() as sess:
-      batch0 = constant_op.constant(inp)
-      batch1 = image_ops.rgb_to_hsv(batch0)
-      batch2 = image_ops.hsv_to_rgb(batch1)
-      split0 = array_ops.unpack(batch0)
-      split1 = list(map(image_ops.rgb_to_hsv, split0))
-      split2 = list(map(image_ops.hsv_to_rgb, split1))
-      join1 = array_ops.pack(split1)
-      join2 = array_ops.pack(split2)
-      batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
-
-    # Verify that processing batch elements together is the same as separate
-    self.assertAllClose(batch1, join1)
-    self.assertAllClose(batch2, join2)
-    self.assertAllClose(batch2, inp)
+    for nptype in [np.float32, np.float64]:
+      inp = np.random.rand(*shape).astype(nptype)
+
+      # Convert to HSV and back, as a batch and individually
+      with self.test_session() as sess:
+        batch0 = constant_op.constant(inp)
+        batch1 = image_ops.rgb_to_hsv(batch0)
+        batch2 = image_ops.hsv_to_rgb(batch1)
+        split0 = array_ops.unpack(batch0)
+        split1 = list(map(image_ops.rgb_to_hsv, split0))
+        split2 = list(map(image_ops.hsv_to_rgb, split1))
+        join1 = array_ops.pack(split1)
+        join2 = array_ops.pack(split2)
+        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+
+      # Verify that processing batch elements together is the same as separate
+      self.assertAllClose(batch1, join1)
+      self.assertAllClose(batch2, join2)
+      self.assertAllClose(batch2, inp)
 
   def testRGBToHSVRoundTrip(self):
     data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
-    rgb_np = np.array(data, dtype=np.float32).reshape([2, 2, 3]) / 255.
-    for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
-        hsv = image_ops.rgb_to_hsv(rgb_np)
-        rgb = image_ops.hsv_to_rgb(hsv)
-        rgb_tf = rgb.eval()
-    self.assertAllClose(rgb_tf, rgb_np)
+    for nptype in [np.float32, np.float64]:
+      rgb_np = np.array(data, dtype=nptype).reshape([2, 2, 3]) / 255.
+      for use_gpu in [True, False]:
+        with self.test_session(use_gpu=use_gpu):
+          hsv = image_ops.rgb_to_hsv(rgb_np)
+          rgb = image_ops.hsv_to_rgb(hsv)
+          rgb_tf = rgb.eval()
+      self.assertAllClose(rgb_tf, rgb_np)
 
 
 class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
@@ -1609,6 +1613,56 @@ class PngTest(test_util.TensorFlowTestCase):
                          [None, None, channels or None])
 
 
+class GifTest(test_util.TensorFlowTestCase):
+
+  def testValid(self):
+    # Read a real GIF and compare every decoded frame with its expected image.
+    prefix = 'tensorflow/core/lib/gif/testdata/'
+    filename = 'scan.gif'
+    WIDTH = 20
+    HEIGHT = 40
+    STRIDE = 5
+    shape = (12, HEIGHT, WIDTH, 3)
+
+    with self.test_session() as sess:
+      gif0 = io_ops.read_file(prefix + filename)
+      image0 = image_ops.decode_gif(gif0)
+      gif0, image0 = sess.run([gif0, image0])
+
+      self.assertEqual(image0.shape, shape)
+
+      for frame_idx, frame in enumerate(image0):
+        gt = np.zeros(shape[1:], dtype=np.uint8)
+        start = frame_idx * STRIDE
+        end = (frame_idx + 1) * STRIDE
+        # Early frames light a band of columns; later frames a band of rows.
+        if end <= WIDTH:
+          gt[:, start:end, :] = 255
+        else:
+          start -= WIDTH
+          end -= WIDTH
+          gt[start:end, :, :] = 255
+
+        self.assertAllClose(frame, gt)
+
+  def testInValid(self):
+    # Decoding this file is expected to fail with InvalidArgumentError.
+    prefix = 'tensorflow/core/lib/gif/testdata/'
+    filename = 'optimized.gif'
+
+    with self.test_session() as sess:
+      gif0 = io_ops.read_file(prefix + filename)
+      image0 = image_ops.decode_gif(gif0)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run([gif0, image0])
+
+  def testShape(self):
+    with self.test_session():
+      gif = constant_op.constant('nonsense')
+      image = image_ops.decode_gif(gif)
+      self.assertEqual(image.get_shape().as_list(),
+                       [None, None, None, 3])
+
 class ConvertImageTest(test_util.TensorFlowTestCase):
 
   def _convert(self, original, original_dtype, output_dtype, expected):
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 9e7a922b2a..0620a3da2c 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -109,13 +109,41 @@ def _MeanGrad(op, grad):
 @ops.RegisterGradient("Prod")
 def _ProdGrad(op, grad):
   """Gradient for Prod."""
-  # TODO(kearnes): this gives NaNs for 0s in the input tensor
+  # The gradient can be expressed by dividing the product by each entry of the
+  # input tensor, but this approach can't deal with zeros in the input.
+  # Here, we avoid this problem by composing the output as a product of two
+  # cumprod operations.
+
   input_shape = array_ops.shape(op.inputs[0])
+
+  # Expand grad to full input shape
   output_shape_kept_dims = math_ops.reduced_shape(input_shape, op.inputs[1])
   tile_scaling = _safe_shape_div(input_shape, output_shape_kept_dims)
-  grad = array_ops.reshape(grad * op.outputs[0], output_shape_kept_dims)
-  grad = math_ops.div(array_ops.tile(grad, tile_scaling), op.inputs[0])
-  return grad, None
+  grad = array_ops.reshape(grad, output_shape_kept_dims)
+  grad = array_ops.tile(grad, tile_scaling)
+
+  # Pack all reduced dimensions into a single one, so we can perform the
+  # cumprod ops. If the reduction dims list is empty, it defaults to float32,
+  # so we need to cast here.
+  reduced = math_ops.cast(op.inputs[1], dtypes.int32)
+  idx = math_ops.range(0, array_ops.rank(op.inputs[0]))
+  other, _ = array_ops.listdiff(idx, reduced)
+  perm = array_ops.concat(0, [reduced, other])
+  reduced_num = math_ops.reduce_prod(array_ops.gather(input_shape, reduced))
+  other_num = math_ops.reduce_prod(array_ops.gather(input_shape, other))
+  permuted = array_ops.transpose(op.inputs[0], perm)
+  permuted_shape = array_ops.shape(permuted)
+  reshaped = array_ops.reshape(permuted, (reduced_num, other_num))
+
+  # Calculate product, leaving out the current entry
+  left = math_ops.cumprod(reshaped, axis=0, exclusive=True)
+  right = math_ops.cumprod(reshaped, axis=0, exclusive=True, reverse=True)
+  y = array_ops.reshape(left * right, permuted_shape)
+
+  # Invert the transpose and reshape operations.
+  # Make sure to set the statically known shape information through a reshape.
+  out = grad * array_ops.transpose(y, array_ops.invert_permutation(perm))
+  return array_ops.reshape(out, input_shape), None
 
 
 @ops.RegisterGradient("SegmentSum")
@@ -839,3 +867,26 @@ def _CrossGrad(op, grad):
   u = op.inputs[0]
   v = op.inputs[1]
   return (math_ops.cross(v, grad), math_ops.cross(grad, u))
+
+
+@ops.RegisterGradient("Cumsum")
+def _CumsumGrad(op, grad):
+  """Gradient for Cumsum: a cumsum of `grad` run in the opposite direction."""
+  axis = op.inputs[1]
+  return [math_ops.cumsum(grad, axis,
+                          exclusive=op.get_attr("exclusive"),
+                          reverse=not op.get_attr("reverse")), None]
+
+
+@ops.RegisterGradient("Cumprod")
+def _CumprodGrad(op, grad):
+  """Gradient for Cumprod w.r.t. its input `x` (no gradient for `axis`)."""
+  x = op.inputs[0]
+  axis = op.inputs[1]
+  exclusive = op.get_attr("exclusive")
+  reverse = op.get_attr("reverse")
+  # TODO: the division by x below fails when x contains 0; this should be fixed.
+  prod = math_ops.cumprod(x, axis, exclusive=exclusive, reverse=reverse)
+  out = math_ops.cumsum(prod * grad, axis, exclusive=exclusive,
+                        reverse=not reverse)
+  return [out / x, None]
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index cac14f286e..fc7b299978 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -13,7 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 
-"""## Arithmetic Operators
+"""Note: Elementwise binary operations in TensorFlow follow [numpy-style
+broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
+
+## Arithmetic Operators
 
 TensorFlow provides several operations that you can use to add basic arithmetic
 operators to your graph.
@@ -145,6 +148,14 @@ common math computations that reduce various dimensions of a tensor.
 
 @@accumulate_n
 
+## Scan
+
+TensorFlow provides several operations that you can use to perform scans
+(running totals) across one axis of a tensor.
+
+@@cumsum
+@@cumprod
+
 ## Segmentation
 
 TensorFlow provides several operations that you can use to perform common
@@ -1585,6 +1596,94 @@ def tanh(x, name=None):
       return gen_math_ops._tanh(x, name=name)
 
 
+def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
+  """Compute the cumulative sum of the tensor `x` along `axis`.
+
+  By default, this op performs an inclusive cumsum, which means that the first
+  element of the input is identical to the first element of the output:
+  ```prettyprint
+  tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c]
+  ```
+
+  By setting `exclusive=True`, an exclusive cumsum is performed instead:
+  ```prettyprint
+  tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b]
+  ```
+
+  By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+  opposite direction:
+  ```prettyprint
+  tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c]
+  ```
+  This is more efficient than using separate `tf.reverse` ops.
+
+  The `reverse` and `exclusive` kwargs can also be combined:
+  ```prettyprint
+  tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
+  ```
+
+  Args:
+    x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+      `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+      `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+    axis: A `Tensor` of type `int32` (default: 0).
+    exclusive: A `bool` (default: False).
+    reverse: A `bool` (default: False).
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `x`.
+  """
+  with ops.op_scope([x], name, "Cumsum") as name:
+    x = ops.convert_to_tensor(x, name="x")
+    return gen_math_ops.cumsum(x, axis, exclusive=exclusive,
+                               reverse=reverse, name=name)
+
+
+def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
+  """Compute the cumulative product of the tensor `x` along `axis`.
+
+  By default, this op performs an inclusive cumprod, which means that the first
+  element of the input is identical to the first element of the output:
+  ```prettyprint
+  tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
+  ```
+
+  By setting `exclusive=True`, an exclusive cumprod is performed instead:
+  ```prettyprint
+  tf.cumprod([a, b, c], exclusive=True) ==> [1, a, a * b]
+  ```
+
+  By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+  opposite direction:
+  ```prettyprint
+  tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c]
+  ```
+  This is more efficient than using separate `tf.reverse` ops.
+
+  The `reverse` and `exclusive` kwargs can also be combined:
+  ```prettyprint
+  tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 1]
+  ```
+
+  Args:
+    x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+      `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+      `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+    axis: A `Tensor` of type `int32` (default: 0).
+    exclusive: A `bool` (default: False).
+    reverse: A `bool` (default: False).
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `x`.
+  """
+  with ops.op_scope([x], name, "Cumprod") as name:
+    x = ops.convert_to_tensor(x, name="x")
+    return gen_math_ops.cumprod(x, axis, exclusive=exclusive,
+                                reverse=reverse, name=name)
+
+
 ops.RegisterShape("Abs")(common_shapes.unchanged_shape)
 ops.RegisterShape("Acos")(common_shapes.unchanged_shape)
 ops.RegisterShape("Asin")(common_shapes.unchanged_shape)
@@ -1632,6 +1731,8 @@ ops.RegisterShape("BatchFFT3D")(common_shapes.unchanged_shape)
 ops.RegisterShape("BatchIFFT3D")(common_shapes.unchanged_shape)
 ops.RegisterShape("TanhGrad")(common_shapes.unchanged_shape)
 ops.RegisterShape("SigmoidGrad")(common_shapes.unchanged_shape)
+ops.RegisterShape("Cumsum")(common_shapes.unchanged_shape)
+ops.RegisterShape("Cumprod")(common_shapes.unchanged_shape)
 
 
 @ops.RegisterShape("Add")
diff --git a/tensorflow/python/ops/rnn_cell.py b/tensorflow/python/ops/rnn_cell.py
index 51ef932ab9..0bb34754e3 100644
--- a/tensorflow/python/ops/rnn_cell.py
+++ b/tensorflow/python/ops/rnn_cell.py
@@ -648,7 +648,7 @@ class DropoutWrapper(RNNCell):
                        % input_keep_prob)
     if (isinstance(output_keep_prob, float) and
         not (output_keep_prob >= 0.0 and output_keep_prob <= 1.0)):
-      raise ValueError("Parameter input_keep_prob must be between 0 and 1: %d"
+      raise ValueError("Parameter output_keep_prob must be between 0 and 1: %d"
                        % output_keep_prob)
     self._cell = cell
     self._input_keep_prob = input_keep_prob
diff --git a/tensorflow/python/platform/default/__init__.py b/tensorflow/python/platform/default/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
--- a/tensorflow/python/platform/default/__init__.py
+++ /dev/null
diff --git a/tensorflow/python/platform/gfile.py b/tensorflow/python/platform/gfile.py
index c3bcb7c2f5..b2657e83a2 100644
--- a/tensorflow/python/platform/gfile.py
+++ b/tensorflow/python/platform/gfile.py
@@ -395,13 +395,14 @@ def Walk(top, topdown=1, onerror=None):
   optional argument "onerror" is specified, it should be a function.  It
   will be called with one argument, an os.error instance.  It can return
   to continue with the walk, or reraise the exception to abort the walk.
+  By default, the walk follows symlinks that resolve into directories.
 
   Yields:
     # Each yield is a 3-tuple:  the pathname of a directory, followed
     # by lists of all its subdirectories and leaf files.
     (dirname, [subdirname, subdirname, ...], [filename, filename, ...])
   """
-  return os.walk(top, topdown=topdown, onerror=onerror)
+  return os.walk(top, topdown=topdown, onerror=onerror, followlinks=True)
 
 
 def Stat(path):   # pylint: disable=invalid-name
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index cf1ae55f95..aafe776c78 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -92,7 +92,7 @@ def input_producer(input_tensor, element_shape=None, num_epochs=None,
   """Output the rows of `input_tensor` to a queue for an input pipeline.
 
   Args:
-    input_tensor: A tensor with the rows to produce. Must be at
+    input_tensor: A tensor with the rows to produce. Must be at least
       one-dimensional. Must either have a fully-defined shape, or
       `element_shape` must be defined.
     element_shape: (Optional.) A `TensorShape` representing the shape of a
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 983ec88701..f24f1f4a08 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import control_flow_ops
@@ -40,7 +41,7 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
                           decay_rate ^ (global_step / decay_steps)
   ```
 
-  If the argument `staircase` is `True`, then `global_step /decay_steps` is an
+  If the argument `staircase` is `True`, then `global_step / decay_steps` is an
   integer division and the decayed learning rate follows a staircase function.
 
   Example: decay every 100000 steps with a base of 0.96:
@@ -67,15 +68,16 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
       Must be positive.  See the decay computation above.
     decay_rate: A scalar `float32` or `float64` `Tensor` or a
       Python number.  The decay rate.
-    staircase: Boolean.  It `True` decay the learning rate at discrete intervals.
-    name: String.  Optional name of the operation.  Defaults to 'ExponentialDecay'
+    staircase: Boolean.  If `True` decay the learning rate at discrete intervals
+    name: String.  Optional name of the operation.  Defaults to
+      'ExponentialDecay'
 
   Returns:
     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
     learning rate.
   """
   with ops.op_scope([learning_rate, global_step, decay_steps, decay_rate],
-                   name, "ExponentialDecay") as name:
+                    name, "ExponentialDecay") as name:
     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
     dtype = learning_rate.dtype
     global_step = math_ops.cast(global_step, dtype)
@@ -89,19 +91,19 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
 
 def piecewise_constant(x, boundaries, values, name=None):
   """ Piecewise constant from boundaries and interval values.
-  
+
   Example: use a learning rate that's 1.0 for the first 100000 steps, 0.5
     for steps 100001 to 110000, and 0.1 for any additional steps.
-  
+
   ```python
   global_step = tf.Variable(0, trainable=False)
   boundaries = [100000, 110000]
   values = [1.0, 0.5, 0.1]
   learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
-  
+
   # Later, whenever we perform an optimization step, we increment global_step.
   ```
-  
+
   Args:
     x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`,
       `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
@@ -112,13 +114,13 @@ def piecewise_constant(x, boundaries, values, name=None):
       than `boundaries`, and all elements should have the same type.
     name: A string. Optional name of the operation. Defaults to
       'PiecewiseConstant'.
-  
+
   Returns:
     A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`,
     `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ...,
     and values[-1] when `x > boundaries[-1]`.
   """
-  
+
   with ops.op_scope([x, boundaries, values, name],
                     name, 'PiecewiseConstant') as name:
     x = ops.convert_to_tensor(x)
@@ -131,7 +133,7 @@ def piecewise_constant(x, boundaries, values, name=None):
     values = ops.convert_n_to_tensor(values)
     if not all(v.dtype == values[0].dtype for v in values):
       raise ValueError('values must have elements all with the same dtype.')
-    
+
     pred_fn_pairs = {}
     pred_fn_pairs[x <= boundaries[0]] = lambda: values[0]
     pred_fn_pairs[x > boundaries[-1]] = lambda: values[-1]
@@ -139,7 +141,7 @@ def piecewise_constant(x, boundaries, values, name=None):
       # Need to bind v here; can do this with lambda v=v: ...
       pred = (x > low) & (x <= high)
       pred_fn_pairs[pred] = lambda v=v: v
-      
+
     # The default isn't needed here because our conditions are mutually
     # exclusive and exhaustive, but tf.case requires it.
     default = lambda: values[0]
@@ -237,3 +239,125 @@ def polynomial_decay(learning_rate, global_step, decay_steps,
     return math_ops.add(math_ops.mul(learning_rate - end_learning_rate,
                                      math_ops.pow(1 - p, power)),
                         end_learning_rate, name=name)
+
+
+def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate,
+                      staircase=False, name=None):
+  """Applies natural exponential decay to the initial learning rate.
+
+  When training a model, it is often recommended to lower the learning rate as
+  the training progresses.  This function applies an exponential decay function
+  to a provided initial learning rate.  It requires a `global_step` value to
+  compute the decayed learning rate.  You can just pass a TensorFlow variable
+  that you increment at each training step.
+
+  The function returns the decayed learning rate.  It is computed as:
+
+  ```python
+  decayed_learning_rate = learning_rate * exp(-decay_rate * global_step /
+                                              decay_steps)
+  ```
+
+  Example: decay exponentially with a rate of 0.5 per 5 steps:
+
+  ```python
+  global_step = tf.Variable(0, trainable=False)
+  learning_rate = 0.1
+  k = 0.5
+  learning_rate = tf.train.natural_exp_decay(learning_rate, global_step, 5, k)
+
+  # Passing global_step to minimize() will increment it at each step.
+  learning_step = (
+      tf.train.GradientDescentOptimizer(learning_rate)
+      .minimize(...my loss..., global_step=global_step))
+  ```
+
+  Args:
+    learning_rate: A scalar `float32` or `float64` `Tensor` or a
+      Python number.  The initial learning rate.
+    global_step: A scalar `Tensor` or a Python number.
+      Global step to use for the decay computation.  Must not be negative.
+    decay_steps: A Python number.  The divisor applied to `global_step`.
+    decay_rate: A Python number.  The decay rate.
+    staircase: Boolean.  If `True`, `global_step / decay_steps` is floored.
+    name: String. Optional name of the operation. Defaults to 'NaturalExpDecay'
+
+  Returns:
+    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
+    learning rate.
+  """
+  with ops.op_scope([learning_rate, global_step, decay_steps, decay_rate],
+                    name, "NaturalExpDecay") as name:
+    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
+    dtype = learning_rate.dtype
+    global_step = math_ops.cast(global_step, dtype)
+    decay_steps = math_ops.cast(decay_steps, dtype)
+    decay_rate = math_ops.cast(decay_rate, dtype)
+    p = global_step / decay_steps
+    if staircase:
+      p = math_ops.floor(p)
+    exponent = math_ops.exp(math_ops.mul(math_ops.neg(decay_rate), p))
+    return math_ops.mul(learning_rate, exponent, name=name)
+
+
+def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
+                       staircase=False, name=None):
+  """Applies inverse time decay to the initial learning rate.
+
+  When training a model, it is often recommended to lower the learning rate as
+  the training progresses.  This function applies an inverse decay function
+  to a provided initial learning rate.  It requires a `global_step` value to
+  compute the decayed learning rate.  You can just pass a TensorFlow variable
+  that you increment at each training step.
+
+  The function returns the decayed learning rate.  It is computed as:
+
+  ```python
+  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /
+                                           decay_steps)
+  ```
+
+  Example: decay 1/t with a rate of 0.5:
+
+  ```python
+  global_step = tf.Variable(0, trainable=False)
+  learning_rate = 0.1
+  k = 0.5
+  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, 1, k)
+
+  # Passing global_step to minimize() will increment it at each step.
+  learning_step = (
+      tf.train.GradientDescentOptimizer(learning_rate)
+      .minimize(...my loss..., global_step=global_step))
+  ```
+
+  Args:
+    learning_rate: A scalar `float32` or `float64` `Tensor` or a
+      Python number.  The initial learning rate.
+    global_step: A scalar `Tensor` or a Python number.
+      Global step to use for the decay computation.  Must not be negative.
+    decay_steps: A Python number.  The divisor applied to `global_step`.
+    decay_rate: A Python number.  The decay rate.
+    staircase: Boolean.  If `True`, `global_step / decay_steps` is floored, so
+      the learning rate decays at discrete intervals.
+    name: String.  Optional name of the operation.  Defaults to
+      'InverseTimeDecay'
+
+  Returns:
+    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
+    learning rate.
+  """
+
+  with ops.op_scope([learning_rate, global_step, decay_steps, decay_rate],
+                    name, "InverseTimeDecay") as name:
+    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
+    dtype = learning_rate.dtype
+    global_step = math_ops.cast(global_step, dtype)
+    decay_steps = math_ops.cast(decay_steps, dtype)
+    decay_rate = math_ops.cast(decay_rate, dtype)
+    p = global_step / decay_steps
+    if staircase:
+      p = math_ops.floor(p)
+    const = math_ops.cast(constant_op.constant(1), learning_rate.dtype)
+    denom = math_ops.add(const, math_ops.mul(decay_rate, p))
+    return math_ops.div(learning_rate, denom, name=name)
diff --git a/tensorflow/python/training/learning_rate_decay_test.py b/tensorflow/python/training/learning_rate_decay_test.py
index 867301d4a1..083e3c21fa 100644
--- a/tensorflow/python/training/learning_rate_decay_test.py
+++ b/tensorflow/python/training/learning_rate_decay_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import math
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import state_ops
@@ -50,7 +52,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
       self.assertAllClose(decayed_lr.eval(), .1, 1e-6)
       # Decayed learning rate
       assign_100.op.run()
-      expected = .1 * 0.96**(100 // 3)
+      expected = .1 * 0.96 ** (100 // 3)
       self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
 
   def testVariables(self):
@@ -69,7 +71,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
       self.assertAllClose(decayed_lr.eval(), .1, 1e-6)
       # Decayed learning rate
       assign_100.op.run()
-      expected = .1 * 0.96**(100 // 3)
+      expected = .1 * 0.96 ** (100 // 3)
       self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
 
   def testPiecewiseConstant(self):
@@ -215,5 +217,83 @@ class SqrtDecayTest(test_util.TensorFlowTestCase):
       self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
 
 
+class ExponentialDecayTest(test_util.TensorFlowTestCase):
+
+  def testDecay(self):
+    base_lr = 0.1
+    num_steps = 10
+    rate = 0.96
+    counter = state_ops.variable_op([], dtypes.int32)
+    reset_counter = state_ops.assign(counter, 0)
+    bump_counter = state_ops.assign_add(counter, 1)
+    lr = learning_rate_decay.natural_exp_decay(
+        base_lr, counter, num_steps, rate)
+    with self.test_session():
+      reset_counter.op.run()
+      for i in range(num_steps + 1):
+        want = base_lr * math.exp(-i / num_steps * rate)
+        self.assertAllClose(lr.eval(), want, 1e-6)
+        bump_counter.op.run()
+
+  def testStaircase(self):
+    base_lr = 0.1
+    num_steps = 10
+    rate = 0.96
+    counter = state_ops.variable_op([], dtypes.int32)
+    reset_counter = state_ops.assign(counter, 0)
+    bump_counter = state_ops.assign_add(counter, 1)
+    lr = learning_rate_decay.natural_exp_decay(
+        learning_rate=base_lr,
+        global_step=counter,
+        decay_steps=num_steps, decay_rate=rate,
+        staircase=True)
+    with self.test_session():
+      reset_counter.op.run()
+      for i in range(num_steps + 1):
+        want = base_lr * math.exp(-rate * (i // num_steps))
+        self.assertAllClose(lr.eval(), want, 1e-6)
+        bump_counter.op.run()
+
+
+class InverseDecayTest(test_util.TensorFlowTestCase):
+
+  def testDecay(self):
+    base_lr = 0.1
+    num_steps = 10
+    rate = 0.96
+    counter = state_ops.variable_op([], dtypes.int32)
+    reset_counter = state_ops.assign(counter, 0)
+    bump_counter = state_ops.assign_add(counter, 1)
+    lr = learning_rate_decay.inverse_time_decay(
+        learning_rate=base_lr,
+        global_step=counter,
+        decay_steps=num_steps, decay_rate=rate)
+    with self.test_session():
+      reset_counter.op.run()
+      for i in range(num_steps + 1):
+        want = base_lr / (1 + i / num_steps * rate)
+        self.assertAllClose(lr.eval(), want, 1e-6)
+        bump_counter.op.run()
+
+  def testStaircase(self):
+    base_lr = 0.1
+    num_steps = 10
+    rate = 0.96
+    counter = state_ops.variable_op([], dtypes.int32)
+    reset_counter = state_ops.assign(counter, 0)
+    bump_counter = state_ops.assign_add(counter, 1)
+    lr = learning_rate_decay.inverse_time_decay(
+        learning_rate=base_lr,
+        global_step=counter,
+        decay_steps=num_steps, decay_rate=rate,
+        staircase=True)
+    with self.test_session():
+      reset_counter.op.run()
+      for i in range(num_steps + 1):
+        want = base_lr / (1 + rate * (i // num_steps))
+        self.assertAllClose(lr.eval(), want, 1e-6)
+        bump_counter.op.run()
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index 1586ddfdec..62f8028ce6 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -31,7 +31,7 @@ class MomentumOptimizer(optimizer.Optimizer):
   """
 
   def __init__(self, learning_rate, momentum,
-               use_locking=False, name="Momentum"):
+               use_locking=False, name="Momentum", use_nesterov=False):
     """Construct a new Momentum optimizer.
 
     Args:
@@ -44,6 +44,7 @@ class MomentumOptimizer(optimizer.Optimizer):
     super(MomentumOptimizer, self).__init__(use_locking, name)
     self._learning_rate = learning_rate
     self._momentum = momentum
+    self._use_nesterov = use_nesterov
 
   def _create_slots(self, var_list):
     for v in var_list:
@@ -62,7 +63,8 @@ class MomentumOptimizer(optimizer.Optimizer):
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         grad,
         math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
-        use_locking=self._use_locking).op
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op
 
   def _apply_sparse(self, grad, var):
     mom = self.get_slot(var, "momentum")
@@ -71,4 +73,5 @@ class MomentumOptimizer(optimizer.Optimizer):
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         grad.values, grad.indices,
         math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
-        use_locking=self._use_locking).op
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index 3807f9e8d3..a1cbf9bfb5 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -25,6 +25,13 @@ import tensorflow as tf
 
 class MomentumOptimizerTest(tf.test.TestCase):
 
+  def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
+    var = var + accum * lr * momentum
+    accum = accum * momentum + g
+    var = var - lr * accum
+    var = var - accum * lr * momentum
+    return var, accum
+
   def testBasic(self):
     for dtype in [tf.half, tf.float32, tf.float64]:
       with self.test_session():
@@ -80,6 +87,68 @@ class MomentumOptimizerTest(tf.test.TestCase):
                       3.98 - ((0.9 * 0.01 + 0.01) * 2.0)]),
             var1.eval())
 
+  def testNesterovMomentum(self):
+    for dtype in [tf.float32, tf.float64]:
+      with self.test_session():
+        var0 = tf.Variable([1.0, 2.0], dtype=dtype)
+        var1 = tf.Variable([3.0, 4.0], dtype=dtype)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        cost = 5 * var0 * var0 + 3 * var1
+        global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step')
+        mom_op = tf.train.MomentumOptimizer(learning_rate=2.0, momentum=0.9,
+            use_nesterov=True)
+        opt_op = mom_op.minimize(cost, global_step, [var0, var1])
+        tf.initialize_all_variables().run()
+        for t in range(1, 5):
+          opt_op.run()
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(var0_np,
+              accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+              accum1_np, 3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
+  def testSparseNesterovMomentum(self):
+    for dtype in [tf.float32, tf.float64]:
+      with self.test_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        grads = []
+        for t in range(1, 5):
+          grads.append(var0_np * 10)
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(var0_np,
+              accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+              accum1_np, 3, 2.0, 0.9)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        var0 = tf.Variable(var0_np)
+        var1 = tf.Variable(var1_np)
+        loss = 5 * var0 * var0 + 3 * var1
+        mom_op = tf.train.MomentumOptimizer(learning_rate=2.0, momentum=0.9,
+            use_nesterov=True)
+        x_feed = tf.placeholder(dtype)
+        y_feed = tf.IndexedSlices(x_feed,tf.constant([0, 1]),tf.constant([2]))
+        grads_and_vars = [(y_feed, var0),
+            (tf.constant([3.0,3.0],dtype=dtype), var1)]
+        opt_update = mom_op.apply_gradients(grads_and_vars)
+        tf.initialize_all_variables().run()
+        for t in range(1, 5):
+          opt_update.run(feed_dict = {x_feed:grads[t - 1]})
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(var0_np,
+              accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+              accum1_np, 3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
   def testTensorLearningRateAndMomentum(self):
     for dtype in [tf.half, tf.float32, tf.float64]:
       with self.test_session():
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index a2fd85e7bc..54e00d5a74 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -314,8 +314,20 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
   if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier, (const void**)&cuda_driver_info)) {
     // NOTE: OSX CUDA driver does not currently store the same driver version
     // in kCFBundleVersionKey as is returned by cuDriverGetVersion
-    const char * version = CFStringGetCStringPtr((CFStringRef)CFDictionaryGetValue(cuda_driver_info, kCFBundleVersionKey), kCFStringEncodingUTF8);
-    CFRelease(kext_infos);
-    return StringToDriverVersion(version);
+    const CFStringRef str = (CFStringRef)CFDictionaryGetValue(
+        cuda_driver_info, kCFBundleVersionKey);
+    const char *version = CFStringGetCStringPtr(str, kCFStringEncodingUTF8);
+
+    // version can be NULL in which case treat it as empty string
+    // see
+    // https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
+    //
+    // Parse the version before releasing kext_infos: cuda_driver_info and the
+    // C string obtained from it are only kept alive by that dictionary, so
+    // they must not be touched once it has been released.
+    auto driver_version =
+        StringToDriverVersion(version == NULL ? "" : version);
+    CFRelease(kext_infos);
+    return driver_version;
   }
   CFRelease(kext_infos);
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index cfc35f0672..b042dda29f 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -54,6 +54,15 @@ NarrowT CheckedNarrowing(const WideT& wide) {
   return narrow;
 }
 
+// Returns the "Compatibility" version number from the CuDNN version number.
+// This is the number that tries to indicate ABI compatibility.
+//
+// For example, if cudnn_version is 5107, the compatibility version
+// number will be 5100.
+size_t cudnnCompatibilityVersion(size_t cudnn_version) {
+  return (cudnn_version / 100) * 100;
+}
+
 }  // namespace
 
 namespace perftools {
@@ -139,13 +148,6 @@ size_t cudnnGetVersion() {
   return callable();
 }
 
-// Returns whether the currently loaded cuDNN version is R2.
-bool IsCudnnR2() {
-  static auto version = cudnnGetVersion();
-  DCHECK_GE(version, 2000);
-  return version < 3000;
-}
-
 #define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name)                        \
   struct DynLoadShim__##__name {                                     \
     static const char* kName;                                        \
@@ -197,26 +199,13 @@ bool IsCudnnR2() {
   __macro(cudnnPoolingForward)                            \
   __macro(cudnnPoolingBackward)                           \
   __macro(cudnnLRNCrossChannelForward)                    \
-  __macro(cudnnLRNCrossChannelBackward)
-// clang-format on
-
-CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
-
-// clang-format off
-#if CUDNN_VERSION >= 4000 && CUDNN_VERSION < 5000
-#define CUDNN_DNN_ROUTINE_EACH_R2(__macro)                \
-  __macro(cudnnAddTensor_v2)                              \
-  __macro(cudnnConvolutionBackwardData_v2)                \
-  __macro(cudnnConvolutionBackwardFilter_v2)
-#else
-#define CUDNN_DNN_ROUTINE_EACH_R2(__macro)                \
+  __macro(cudnnLRNCrossChannelBackward)                   \
   __macro(cudnnAddTensor)                                 \
   __macro(cudnnConvolutionBackwardData)                   \
   __macro(cudnnConvolutionBackwardFilter)
-#endif
 // clang-format on
 
-CUDNN_DNN_ROUTINE_EACH_R2(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 
 // APIs available after R3:
 #if CUDNN_VERSION >= 3000
@@ -340,15 +329,21 @@ port::Status CudnnSupport::Init() {
     // Check whether loaded version of CuDNN matches what the source
     // was built with.
     size_t loaded_version = dynload::cudnnGetVersion();
-    bool library_loaded_matches_source = (loaded_version == CUDNN_VERSION);
+    size_t loaded_compat_version = cudnnCompatibilityVersion(loaded_version);
+    size_t compiled_compat_version = cudnnCompatibilityVersion(CUDNN_VERSION);
+    bool library_loaded_matches_source =
+        (loaded_compat_version == compiled_compat_version);
     if (!library_loaded_matches_source) {
       const string error =
-          port::StrCat("Loaded cudnn library: ", loaded_version,
-                       " but source was compiled against ", CUDNN_VERSION,
-                       ".  If using a binary install, upgrade your cudnn "
+          port::StrCat("Loaded runtime CuDNN library: ", loaded_version,
+                       " (compatibility version ", loaded_compat_version,
+                       ") but source was compiled with ", CUDNN_VERSION,
+                       " (compatibility version ", compiled_compat_version,
+                       ").  If using a binary install, upgrade your CuDNN "
                        "library to match.  If building from sources, "
-                       "make sure the library loaded matches the "
-                       "version you specified during compile configuration.");
+                       "make sure the library loaded at runtime matches a "
+                       "compatible version specified during compile "
+                       "configuration.");
       LOG(ERROR) << error;
       return port::Status{port::error::INTERNAL, error};
     }
@@ -1109,31 +1104,6 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
   ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
                                    CUDNN_DATA_FLOAT};
 
-#if CUDNN_VERSION < 5000
-#if CUDNN_VERSION >= 3000
-  if (dynload::IsCudnnR2()) {
-#endif
-#if CUDNN_VERSION >= 4000
-    status = dynload::cudnnConvolutionBackwardData_v2(
-#else
-  status = dynload::cudnnConvolutionBackwardData(
-#endif
-        parent_, ToHandle(dnn_handle_), &alpha, filter.handle(),
-        filter_data.opaque(), out_back_nd.handle(),
-        backward_output_data.opaque(), conv.handle(), &beta,
-        in_back_nd.handle(), backward_input_data->opaque());
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "failed to enqueue convolution on stream: "
-                 << ToString(status);
-      return false;
-    }
-    return true;
-#if CUDNN_VERSION >= 3000
-  }
-#endif
-#endif
-
-#if CUDNN_VERSION >= 3000
   const bool is_profiling = output_profile_result != nullptr;
   cudnnConvolutionBwdDataAlgo_t algo;
   DeviceMemory<uint8> scratch;
@@ -1284,7 +1254,6 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
     return false;
   }
   return true;
-#endif
 }
 
 bool CudnnSupport::DoConvolveBackwardData(
@@ -1369,31 +1338,6 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
   ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
       CUDNN_DATA_FLOAT};
 
-#if CUDNN_VERSION < 5000
-#if CUDNN_VERSION >= 3000
-  if (dynload::IsCudnnR2()) {
-#endif
-#if CUDNN_VERSION >= 4000
-    status = dynload::cudnnConvolutionBackwardFilter_v2(
-#else
-  status = dynload::cudnnConvolutionBackwardFilter(
-#endif
-        parent_, ToHandle(dnn_handle_), &alpha, input_nd.handle(),
-        input_data.opaque(), out_back_nd.handle(),
-        backward_output_data.opaque(), conv.handle(), &beta, filter.handle(),
-        backward_filter_data->opaque());
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "failed to enqueue convolution on stream: "
-                 << ToString(status);
-      return false;
-    }
-    return true;
-#if CUDNN_VERSION >= 3000
-  }
-#endif
-#endif
-
-#if CUDNN_VERSION >= 3000
   const bool is_profiling = output_profile_result != nullptr;
   cudnnConvolutionBwdFilterAlgo_t algo;
   DeviceMemory<uint8> scratch;
@@ -1544,7 +1488,6 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
     return false;
   }
   return true;
-#endif
 }
 
 bool CudnnSupport::DoConvolveBackwardFilter(
@@ -1824,33 +1767,15 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
 
   const float alpha = 1.0f;
   const float beta = 1.0f;
-#if CUDNN_VERSION >= 3000
-  if (dynload::IsCudnnR2()) {
-#endif
-
-#if CUDNN_VERSION < 5000
-#if CUDNN_VERSION >= 4000
-    status = dynload::cudnnAddTensor_v2(
-#else
-    status = dynload::cudnnAddTensor(
-#endif
-        parent_, ToHandle(dnn_handle_), CUDNN_ADD_SAME_C, &alpha,
-        bias_descriptor.handle(), biases.opaque(), &beta,
-        input_descriptor.handle(), output_data->opaque());
-#endif  // CUDNN_VERSION < 5000
 
-#if CUDNN_VERSION >= 3000
-  } else {
 #if CUDNN_VERSION >= 5000
-    status = dynload::cudnnAddTensor(
+  status = dynload::cudnnAddTensor(
 #else
-    status = dynload::cudnnAddTensor_v3(
-#endif
-        parent_, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
-        biases.opaque(), &beta, input_descriptor.handle(),
-        output_data->opaque());
-  }
+  status = dynload::cudnnAddTensor_v3(
 #endif
+      parent_, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
+      biases.opaque(), &beta, input_descriptor.handle(),
+      output_data->opaque());
 
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "stream " << stream << " could not enqueue bias addition.";
diff --git a/tensorflow/tensorboard/BUILD b/tensorflow/tensorboard/BUILD
index f9f728bd67..c957f3720f 100644
--- a/tensorflow/tensorboard/BUILD
+++ b/tensorflow/tensorboard/BUILD
@@ -10,10 +10,10 @@ exports_files(["LICENSE"])
 filegroup(
     name = "frontend",
     srcs = [
+        "TAG",
         "dist/index.html",
         "dist/tf-tensorboard.html",
-        "TAG",
-        "//tensorflow/tensorboard/bower:bower",
+        "//tensorflow/tensorboard/bower",
         "//tensorflow/tensorboard/lib:all_files",
     ],
 )
diff --git a/tensorflow/tensorboard/README.md b/tensorflow/tensorboard/README.md
index c3c8ade6c6..a53a80eb47 100644
--- a/tensorflow/tensorboard/README.md
+++ b/tensorflow/tensorboard/README.md
@@ -21,7 +21,7 @@ directory by creating a `SummaryWriter`:
 ``` python
 # sess.graph_def is the graph definition; that enables the Graph Visualizer.
 
-summary_writer = tf.train.SummaryWriter('/path/to/logs', sess.graph_def)
+summary_writer = tf.train.SummaryWriter('/path/to/logs', sess.graph)
 ```
 
 For more details, see [this
@@ -115,9 +115,9 @@ For example, here is a well-organized TensorBoard log directory, with two runs,
 
 # The Visualizations
 
-### Scalar Dashboard
+### Events Dashboard
 
-TensorBoard's Scalar Dashboard visualizes scalar statistics that vary over time;
+TensorBoard's Events Dashboard visualizes scalar statistics that vary over time;
 for example, you might want to track the model's loss or learning rate. As
 described in *Key Concepts*, you can compare multiple runs, and the data is
 organized by tag. The line charts have the following interactions:
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 56b75353b7..5ee57da4b3 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -49,10 +49,11 @@
 # to run.
 #
 
-# Constants:
 # Fixed naming patterns for wheel (.whl) files given different python versions
-declare -A WHL_TAGS
-WHL_TAGS=(["2.7"]="cp27-none" ["3.4"]="cp34-cp34m" ["3.5"]="cp35-cp35m")
+if [[ $(uname) == "Linux" ]]; then
+  declare -A WHL_TAGS
+  WHL_TAGS=(["2.7"]="cp27-none" ["3.4"]="cp34-cp34m" ["3.5"]="cp35-cp35m")
+fi
 
 
 INSTALL_EXTRA_PIP_PACKAGES=${TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES}
diff --git a/tensorflow/tools/ci_build/builds/test_installation.sh b/tensorflow/tools/ci_build/builds/test_installation.sh
index 5845bb51c9..5821e82f14 100755
--- a/tensorflow/tools/ci_build/builds/test_installation.sh
+++ b/tensorflow/tools/ci_build/builds/test_installation.sh
@@ -243,6 +243,8 @@ rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/jpeg
 cp -r tensorflow/core/lib/jpeg ${PY_TEST_DIR}/tensorflow/core/lib
 rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/png
 cp -r tensorflow/core/lib/png ${PY_TEST_DIR}/tensorflow/core/lib
+rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/gif
+cp -r tensorflow/core/lib/gif ${PY_TEST_DIR}/tensorflow/core/lib
 
 # Copy test data from tensorflow/contrib/ffmpeg
 
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index e834ad9520..b231a9c202 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -174,24 +174,57 @@ function get_cuda_capability_version() {
   fi
 }
 
-# Process container type
+# Container type, e.g., CPU, GPU
 CTYPE=${TF_BUILD_CONTAINER_TYPE}
+
+# Determine if Docker is available
 OPT_FLAG=""
+if [[ -z "$(which docker)" ]]; then
+  DO_DOCKER=0
+
+  echo "It appears that Docker is not available on this system. "\
+"Will perform build without Docker."
+  echo "Also, the additional option flags will be applied to the build:"
+  echo "  ${NO_DOCKER_OPT_FLAG}"
+  MAIN_CMD="${NO_DOCKER_MAIN_CMD} ${CTYPE}"
+  OPT_FLAG="${OPT_FLAG} ${NO_DOCKER_OPT_FLAG}"
+fi
+
+# Process container type
 if [[ ${CTYPE} == "cpu" ]]; then
   :
 elif [[ ${CTYPE} == "gpu" ]]; then
-  OPT_FLAG="--config=cuda"
-
-  # Attempt to determine CUDA capability version and use it
-  if [[ "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS}" != \
-        *"TF_CUDA_COMPUTE_CAPABILITIES="* ]]; then
-    CUDA_CAPA_VER=$(get_cuda_capability_version)
-    if [[ ! -z ${CUDA_CAPA_VER} ]]; then
-      echo "TF_CUDA_COMPUTE_CAPABILITIES is not set."
-      echo "Using CUDA capability version from deviceQuery: ${CUDA_CAPA_VER}"
+  OPT_FLAG="${OPT_FLAG} --config=cuda"
+
+  # Attempt to determine CUDA capability version automatically and use it if
+  # CUDA capability version is not specified by the environment variables.
+  CUDA_CAPA_VER=$(get_cuda_capability_version)
+
+  if [[ ! -z ${CUDA_CAPA_VER} ]]; then
+    AUTO_CUDA_CAPA_VER=0
+    if [[ ${DO_DOCKER} == "1" ]] && \
+       [[ "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS}" != \
+           *"TF_CUDA_COMPUTE_CAPABILITIES="* ]]; then
+      AUTO_CUDA_CAPA_VER=1
       TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS=\
 "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS} -e "\
 "TF_CUDA_COMPUTE_CAPABILITIES=${CUDA_CAPA_VER}"
+
+      echo "Docker GPU build: TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS="\
+"\"${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS}\""
+    elif [[ ${DO_DOCKER} == "0" ]] && \
+         [[ -z "${TF_CUDA_COMPUTE_CAPABILITIES}" ]]; then
+      AUTO_CUDA_CAPA_VER=1
+      TF_CUDA_COMPUTE_CAPABILITIES="${CUDA_CAPA_VER}"
+
+      echo "Non-Docker GPU build: TF_CUDA_COMPUTE_CAPABILITIES="\
+"\"${TF_CUDA_COMPUTE_CAPABILITIES}\""
+    fi
+
+    if [[ ${AUTO_CUDA_CAPA_VER} == "1" ]]; then
+      echo "TF_CUDA_COMPUTE_CAPABILITIES is not set:"
+      echo "Using CUDA capability version from deviceQuery: ${CUDA_CAPA_VER}"
+      echo ""
     fi
   fi
 elif [[ ${CTYPE} == "android" ]]; then
@@ -203,19 +236,6 @@ fi
 
 EXTRA_PARAMS=""
 
-# Determine if Docker is available
-if [[ -z "$(which docker)" ]]; then
-  DO_DOCKER=0
-
-  echo "It appears that Docker is not available on this system. "\
-"Will perform build without Docker."
-  echo "Also, the additional option flags will be applied to the build:"
-  echo "  ${NO_DOCKER_OPT_FLAG}"
-  MAIN_CMD="${NO_DOCKER_MAIN_CMD} ${CTYPE}"
-  OPT_FLAG="${OPT_FLAG} ${NO_DOCKER_OPT_FLAG}"
-
-fi
-
 # Determine if this is a benchmarks job
 RUN_BENCHMARKS=0
 if [[ ! -z "${TF_BUILD_RUN_BENCHMARKS}" ]] &&
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 74f41ca746..2be630b48c 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -80,7 +80,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 
-RUN git clone --recursive https://github.com/tensorflow/tensorflow.git && \
+RUN git clone -b r0.9 --recursive --recurse-submodules https://github.com/tensorflow/tensorflow.git && \
     cd tensorflow && \
     git checkout r0.9
 WORKDIR /tensorflow
diff --git a/tensorflow/tools/gcs_test/Dockerfile b/tensorflow/tools/gcs_test/Dockerfile
index 43e7da7743..be3ad40b15 100644
--- a/tensorflow/tools/gcs_test/Dockerfile
+++ b/tensorflow/tools/gcs_test/Dockerfile
@@ -16,7 +16,9 @@ RUN ./install_google_cloud_sdk.bash --disable-prompts --install-dir=/var/gcloud
 
 # Install nightly TensorFlow pip
 RUN pip install \
-   http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-py2-none-any.whl
+   http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-cp27-none-linux_x86_64.whl
 
 # Copy test files
-COPY python/gcs_smoke.py /
+RUN mkdir -p /gcs-smoke/python
+COPY gcs_smoke_wrapper.sh /gcs-smoke/
+COPY python/gcs_smoke.py /gcs-smoke/python/
diff --git a/tensorflow/tools/gcs_test/gcs_smoke.sh b/tensorflow/tools/gcs_test/gcs_smoke.sh
index df59781ea8..6deff2e919 100755
--- a/tensorflow/tools/gcs_test/gcs_smoke.sh
+++ b/tensorflow/tools/gcs_test/gcs_smoke.sh
@@ -67,30 +67,8 @@ docker build --no-cache \
 
 # Run the docker image with the GCS key file mapped and the gcloud-required
 # environment variables set.
-LOG_FILE="/tmp/tf-gcs-test.log"
-rm -rf ${LOG_FILE}
-
 docker run --rm \
     -v ${GCLOUD_JSON_KEY_PATH}:/gcloud-key.json \
     -e "GOOGLE_APPLICATION_CREDENTIALS=/gcloud-key.json" \
     "${DOCKER_IMG}" \
-    python /gcs_smoke.py --gcs_bucket_url="${GCS_BUCKET_URL}" \
-    2>&1 > "${LOG_FILE}"
-
-if [[ $? != "0" ]]; then
-  cat ${LOG_FILE}
-  die "FAIL: End-to-end test of GCS access from TensorFlow failed."
-fi
-
-cat ${LOG_FILE}
-echo ""
-
-# Clean up the newly created tfrecord file in GCS bucket
-NEW_TFREC_URL=$(grep "Using input path" "${LOG_FILE}" | \
-                awk '{print $NF}')
-if [[ -z ${NEW_TFREC_URL} ]]; then
-  die "FAIL: Unable to determine the URL to the new tfrecord file in GCS"
-fi
-gsutil rm "${NEW_TFREC_URL}" && \
-    echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
-    die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
-\ No newline at end of file
+    /gcs-smoke/gcs_smoke_wrapper.sh "${GCS_BUCKET_URL}"
diff --git a/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
new file mode 100755
index 0000000000..89a0dd9169
--- /dev/null
+++ b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# In-container wrapper for GCS smoke test.
+#
+# This script invokes gcs_smoke.py and performs tear down afterwards.
+#
+# Usage:
+#   gcs_smoke_wrapper.sh <GCS_BUCKET_URL>
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Helper function: Print the given message verbatim and exit with status 1.
+die () {
+  echo "$@"
+  exit 1
+}
+
+print_usage() {
+  echo "Usage: gcs_smoke_wrapper.sh <GCS_BUCKET_URL>"
+  echo ""
+}
+
+# Sanity check on command-line arguments.
+GCS_BUCKET_URL=$1
+if [[ -z "${GCS_BUCKET_URL}" ]]; then
+  print_usage
+  die "ERROR: Command-line argument GCS_BUCKET_URL is not supplied"
+fi
+
+# Check that gcloud and gsutil binaries are available.
+GCLOUD_BIN="/var/gcloud/google-cloud-sdk/bin/gcloud"
+if [[ ! -f "${GCLOUD_BIN}" ]]; then
+  die "ERROR: Unable to find gcloud at path ${GCLOUD_BIN}"
+fi
+
+GSUTIL_BIN="/var/gcloud/google-cloud-sdk/bin/gsutil"
+if [[ ! -f "${GSUTIL_BIN}" ]]; then
+  die "ERROR: Unable to find gsutil at path ${GSUTIL_BIN}"
+fi
+
+# Check environment variable for gcloud credentials (report it by name)
+if [[ -z "${GOOGLE_APPLICATION_CREDENTIALS}" ]]; then
+  die "ERROR: Required gcloud environment variable "\
+"GOOGLE_APPLICATION_CREDENTIALS is not set."
+fi
+
+# Locate main Python file
+GCS_SMOKE_PY="${SCRIPT_DIR}/python/gcs_smoke.py"
+if [[ ! -f "${GCS_SMOKE_PY}" ]]; then
+  die "ERROR: Unable to find Python file at ${GCS_SMOKE_PY}"
+fi
+
+
+LOG_FILE="/tmp/tf-gcs-test.log"
+rm -rf "${LOG_FILE}" || \
+    die "ERROR: Failed to remove existing log file ${LOG_FILE}"
+
+# Invoke main Python file; capture both stdout and stderr in the log file
+python "${GCS_SMOKE_PY}" --gcs_bucket_url="${GCS_BUCKET_URL}" \
+    > "${LOG_FILE}" 2>&1
+
+if [[ $? != "0" ]]; then
+  cat "${LOG_FILE}"
+  die "FAIL: End-to-end test of GCS access from TensorFlow failed."
+fi
+
+cat "${LOG_FILE}"
+echo ""
+
+
+# Clean up the newly created tfrecord file in GCS bucket.
+# First, activate gcloud service account
+"${GCLOUD_BIN}" auth activate-service-account \
+    --key-file "${GOOGLE_APPLICATION_CREDENTIALS}" || \
+    die "ERROR: Failed to activate gcloud service account with JSON key file"
+
+NEW_TFREC_URL=$(grep "Using input path" "${LOG_FILE}" | \
+                awk '{print $NF}')
+if [[ -z ${NEW_TFREC_URL} ]]; then
+  die "FAIL: Unable to determine the URL to the new tfrecord file in GCS"
+fi
+"${GSUTIL_BIN}" rm "${NEW_TFREC_URL}" && \
+    echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
+    die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
+\ No newline at end of file
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 690e5cca8d..d1828018e6 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -8,8 +8,8 @@ load("//tensorflow:tensorflow.bzl", "transitive_hdrs")
 transitive_hdrs(
     name = "other_headers",
     deps = [
-        "//third_party/eigen3",
         "//tensorflow/core:protos_all_cc",
+        "//third_party/eigen3",
     ],
 )
 
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 07e876df16..b3787c0edc 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -108,21 +108,16 @@ class InstallHeaders(Command):
     # directories for -I
     install_dir = re.sub('/google/protobuf/src', '', install_dir)
 
-    # Copy eigen code into tensorflow/include,
-    # tensorflow/include/external/eigen_archive/eigen-eigen-<revision>,
-    # and tensorflow/include/eigen-eigen-<revision>.
+    # Copy eigen code into tensorflow/include.
     # A symlink would do, but the wheel file that gets created ignores
     # symlink within the directory hierarchy.
     # NOTE(keveman): Figure out how to customize bdist_wheel package so
     # we can do the symlink.
-    if re.search(r'(external/eigen_archive/eigen-eigen-\w+)', install_dir):
-      extra_dirs = [re.sub('/external/eigen_archive', '', install_dir),
-                    re.sub(r'external/eigen_archive/eigen-eigen-\w+', '',
-                           install_dir)]
-      for extra_dir in extra_dirs:
-        if not os.path.exists(extra_dir):
-          self.mkpath(extra_dir)
-        self.copy_file(header, extra_dir)
+    if 'external/eigen_archive/' in install_dir:
+      extra_dir = install_dir.replace('external/eigen_archive', '')
+      if not os.path.exists(extra_dir):
+        self.mkpath(extra_dir)
+      self.copy_file(header, extra_dir)
 
     if not os.path.exists(install_dir):
       self.mkpath(install_dir)
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index adb4fe6569..b9bab8a79d 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -4,10 +4,17 @@
 # within the workspace (e.g. "tensorflow/"), and tf_repo_name is the name of the
 # local_repository rule (e.g. "@tf").
 def tf_workspace(path_prefix = "", tf_repo_name = ""):
+
+  # These lines need to be changed when updating Eigen. They are parsed from
+  # this file by the cmake and make builds to determine the eigen version and hash.
+  eigen_version = "b4fa9622b809"
+  eigen_sha256 = "2862840c2de9c0473a4ef20f8678949ae89ab25965352ee53329e63ba46cec62"
+
   native.new_http_archive(
     name = "eigen_archive",
-    url = "https://bitbucket.org/eigen/eigen/get/b4fa9622b809.tar.gz",
-    sha256 = "2862840c2de9c0473a4ef20f8678949ae89ab25965352ee53329e63ba46cec62",
+    url = "https://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz",
+    sha256 = eigen_sha256,
+    strip_prefix = "eigen-eigen-" + eigen_version,
     build_file = path_prefix + "eigen.BUILD",
   )
 
@@ -57,6 +64,13 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   )
 
   native.new_http_archive(
+    name = "gif_archive",
+    url = "http://ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
+    sha256 = "34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1",
+    build_file = path_prefix + "gif.BUILD",
+  )
+
+  native.new_http_archive(
     name = "six_archive",
     url = "https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz#md5=34eed507548117b2ab523ab14b2f8b55",
     sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
@@ -92,8 +106,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   )
 
   native.bind(
-      name = "python_headers",
-      actual = tf_repo_name + "//util/python:python_headers",
+    name = "python_headers",
+    actual = tf_repo_name + "//util/python:python_headers",
   )
 
   # grpc expects //external:protobuf_clib and //external:protobuf_compiler
@@ -141,9 +155,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   )
 
   native.git_repository(
-      name = "boringssl_git",
-      remote = "https://github.com/google/boringssl.git",
-      commit = "bbcaa15b0647816b9a1a9b9e0d209cd6712f0105",  # 2016-07-11
+    name = "boringssl_git",
+    remote = "https://github.com/google/boringssl.git",
+    commit = "bbcaa15b0647816b9a1a9b9e0d209cd6712f0105",  # 2016-07-11
   )
 
   native.new_git_repository(
diff --git a/third_party/avro/BUILD b/third_party/avro/BUILD
index 5d154c195c..f631b6df06 100644
--- a/third_party/avro/BUILD
+++ b/third_party/avro/BUILD
@@ -1,4 +1,3 @@
 package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
-
diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD
index 74d4212d1e..9062ed2ec0 100644
--- a/third_party/eigen3/BUILD
+++ b/third_party/eigen3/BUILD
@@ -13,7 +13,6 @@ cc_library(
         "unsupported/Eigen/CXX11/FixedPoint",
         "unsupported/Eigen/CXX11/src/FixedPoint/*.h",
     ]),
-    includes = ["."],
     visibility = ["//visibility:public"],
     deps = [
         "@eigen_archive//:eigen",
diff --git a/third_party/eigen3/Eigen/Cholesky b/third_party/eigen3/Eigen/Cholesky
index eaa82ee9a8..c199a0255a 100644
--- a/third_party/eigen3/Eigen/Cholesky
+++ b/third_party/eigen3/Eigen/Cholesky
@@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/Eigen/Cholesky"
+#include "Eigen/Cholesky"
diff --git a/third_party/eigen3/Eigen/Core b/third_party/eigen3/Eigen/Core
index 90e8342131..d4b036772e 100644
--- a/third_party/eigen3/Eigen/Core
+++ b/third_party/eigen3/Eigen/Core
@@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/Eigen/Core"
+#include "Eigen/Core"
diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues
index be5e8bacec..bf739b9b85 100644
--- a/third_party/eigen3/Eigen/Eigenvalues
+++ b/third_party/eigen3/Eigen/Eigenvalues
@@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/Eigen/Eigenvalues"
+#include "Eigen/Eigenvalues"
diff --git a/third_party/eigen3/Eigen/LU b/third_party/eigen3/Eigen/LU
index 60d382d438..536149cea6 100644
--- a/third_party/eigen3/Eigen/LU
+++ b/third_party/eigen3/Eigen/LU
@@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/Eigen/LU"
+#include "Eigen/LU"
diff --git a/third_party/eigen3/Eigen/QR b/third_party/eigen3/Eigen/QR
index 1dcc8d74f3..be067d3ed2 100644
--- a/third_party/eigen3/Eigen/QR
+++ b/third_party/eigen3/Eigen/QR
@@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/Eigen/QR"
+#include "Eigen/QR"
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
index f1fb5ed77b..41db119921 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/unsupported/Eigen/CXX11/Tensor"
+#include "unsupported/Eigen/CXX11/Tensor"
diff --git a/third_party/eigen3/unsupported/Eigen/SpecialFunctions b/third_party/eigen3/unsupported/Eigen/SpecialFunctions
index 47036387df..ad13359ab3 100644
--- a/third_party/eigen3/unsupported/Eigen/SpecialFunctions
+++ b/third_party/eigen3/unsupported/Eigen/SpecialFunctions
@@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/unsupported/Eigen/SpecialFunctions"
\ No newline at end of file
+#include "unsupported/Eigen/SpecialFunctions"
diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc
index 071997ca44..389444e731 100755
--- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc
+++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -100,36 +100,15 @@ def GetHostCompilerOptions(argv):
   args, _ = parser.parse_known_args(argv)
 
   opts = ''
-  # This is a temporary workaround for b/12960069.
-  # NVIDIA is going to fix this in CUDA 6.5, but until then this workaround
-  # will let us compile Thrust with the cuda crosstool.
-  # bazel passes all include directories as '-isystem dir' to the crosstool.
-  # This causes nvcc to think that there are kernel launches from system
-  # directories (which apparently is not supported by the compiler). This
-  # workaround changes '-isystem third_party/gpus/cuda/include' to
-  # '-iquote third_party/gpus/cuda/include'.
-  isystem_args = [x for x in args.isystem
-                  if 'third_party/gpus/cuda/include' not in x]
-  iquote_args = (args.iquote +
-                 [x for x in args.isystem
-                  if 'third_party/gpus/cuda/include' in x])
-  # This hack is needed so that we can compile eigen3. We need to include
-  # third_party/eigen3 with -I. Some eigen file include using the
-  # include <Eigen/Core> syntax, and -iquote doesn't work for that.
-  has_eigen = ['third_party/eigen3'] in isystem_args
-  if has_eigen:
-    isystem_args.remove(['third_party/eigen3'])
-
-  if isystem_args:
-    opts += '-isystem ' + ' -isystem '.join(sum(isystem_args, []))
-  if iquote_args:
-    opts += ' -iquote ' + ' -iquote '.join(sum(iquote_args, []))
+
+  if args.isystem:
+    opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, []))
+  if args.iquote:
+    opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
   if args.g:
     opts += ' -g' + ' -g'.join(sum(args.g, []))
   if args.sysroot:
     opts += ' --sysroot ' + args.sysroot[0]
-  if has_eigen:
-    opts += ' -I third_party/eigen3'
 
   return opts
 
diff --git a/third_party/gpus/cuda/BUILD b/third_party/gpus/cuda/BUILD
index 354377555b..79c6227687 100644
--- a/third_party/gpus/cuda/BUILD
+++ b/third_party/gpus/cuda/BUILD
@@ -51,7 +51,10 @@ cc_library(
     hdrs = glob([
         "**/*.h",
     ]),
-    includes = [".", "include"],
+    includes = [
+        ".",
+        "include",
+    ],
     visibility = ["//visibility:public"],
 )
 
@@ -66,7 +69,7 @@ cc_library(
         "-lpthread",
     ] + select({
         "//tensorflow:darwin": [],
-        "//conditions:default": ["-lrt"]
+        "//conditions:default": ["-lrt"],
     }),
     visibility = ["//visibility:public"],
 )
@@ -74,65 +77,65 @@ cc_library(
 cc_library(
     name = "cudart",
     srcs = [
-        cuda_library_path("cudart")
+        cuda_library_path("cudart"),
     ],
     data = [
-        cuda_library_path("cudart")
+        cuda_library_path("cudart"),
     ],
     includes = ["include/"],
-    visibility = ["//visibility:public"],
     linkstatic = 1,
+    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cublas",
     srcs = [
-        cuda_library_path("cublas")
+        cuda_library_path("cublas"),
     ],
     data = [
-        cuda_library_path("cublas")
+        cuda_library_path("cublas"),
     ],
     includes = ["include/"],
-    visibility = ["//visibility:public"],
     linkstatic = 1,
+    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudnn",
     srcs = [
-        cudnn_library_path()
+        cudnn_library_path(),
     ],
     data = [
-        cudnn_library_path()
+        cudnn_library_path(),
     ],
     includes = ["include/"],
-    visibility = ["//visibility:public"],
     linkstatic = 1,
+    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cufft",
     srcs = [
-        cuda_library_path("cufft")
+        cuda_library_path("cufft"),
     ],
     data = [
-        cuda_library_path("cufft")
+        cuda_library_path("cufft"),
     ],
     includes = ["include/"],
-    visibility = ["//visibility:public"],
     linkstatic = 1,
+    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cuda",
+    visibility = ["//visibility:public"],
     deps = [
+        ":cublas",
         ":cuda_headers",
         ":cudart",
-        ":cublas",
         ":cudnn",
         ":cufft",
     ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
@@ -140,7 +143,10 @@ cc_library(
     hdrs = glob([
         "**/*.h",
     ]),
-    includes = [".", "extras/CUPTI/include/"],
+    includes = [
+        ".",
+        "extras/CUPTI/include/",
+    ],
     visibility = ["//visibility:public"],
 )
 
@@ -152,7 +158,6 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-
 # TODO(opensource): for now, we have to invoke the cuda_config.sh manually in the source tree.
 # This rule checks if Cuda libraries in the source tree has been properly configured.
 # The output list makes bazel runs this rule first if the Cuda files are missing.
@@ -182,21 +187,21 @@ genrule(
 
         # Under non-cuda config, create all dummy files to make the build go through
         ";".join([
-           "mkdir -p $(@D)/include",
-           "mkdir -p $(@D)/lib64",
-           "mkdir -p $(@D)/extras/CUPTI/include",
-           "mkdir -p $(@D)/extras/CUPTI/lib64",
-           "touch $(@D)/include/cuda.h",
-           "touch $(@D)/include/cublas.h",
-           "touch $(@D)/include/cudnn.h",
-           "touch $(@D)/extras/CUPTI/include/cupti.h",
-           "touch $(@D)/{}".format(cuda_static_library_path("cudart")),
-           "touch $(@D)/{}".format(cuda_library_path("cublas")),
-           "touch $(@D)/{}".format(cudnn_library_path()),
-           "touch $(@D)/{}".format(cuda_library_path("cudart")),
-           "touch $(@D)/{}".format(cuda_library_path("cufft")),
-           "touch $(@D)/{}".format(cupti_library_path()),
-         ]),
+            "mkdir -p $(@D)/include",
+            "mkdir -p $(@D)/lib64",
+            "mkdir -p $(@D)/extras/CUPTI/include",
+            "mkdir -p $(@D)/extras/CUPTI/lib64",
+            "touch $(@D)/include/cuda.h",
+            "touch $(@D)/include/cublas.h",
+            "touch $(@D)/include/cudnn.h",
+            "touch $(@D)/extras/CUPTI/include/cupti.h",
+            "touch $(@D)/{}".format(cuda_static_library_path("cudart")),
+            "touch $(@D)/{}".format(cuda_library_path("cublas")),
+            "touch $(@D)/{}".format(cudnn_library_path()),
+            "touch $(@D)/{}".format(cuda_library_path("cudart")),
+            "touch $(@D)/{}".format(cuda_library_path("cufft")),
+            "touch $(@D)/{}".format(cupti_library_path()),
+        ]),
     ),
     local = 1,
 )
@@ -212,7 +217,7 @@ genrule(
 
         # Under non-cuda config, create the dummy file
         ";".join([
-         "touch $(@D)/cuda.config",
+            "touch $(@D)/cuda.config",
         ]),
     ),
     local = 1,
diff --git a/util/python/BUILD b/util/python/BUILD
index af05de2004..29688b875d 100644
--- a/util/python/BUILD
+++ b/util/python/BUILD
@@ -15,7 +15,7 @@ genrule(
     name = "python_check",
     srcs = [
         "python_config.sh",
-        "configure_files"
+        "configure_files",
     ],
     outs = [
         "python_checked",
@@ -27,6 +27,6 @@ genrule(
 filegroup(
     name = "configure_files",
     data = glob([
-      "*",
-    ])
+        "*",
+    ]),
 )