-rw-r--r--  tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py  2
-rw-r--r--  tensorflow/contrib/boosted_trees/python/utils/losses_test.py  2
-rw-r--r--  tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py  2
-rw-r--r--  tensorflow/contrib/data/python/util/nest.py  2
-rw-r--r--  tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py  2
-rw-r--r--  tensorflow/contrib/distributions/python/ops/shape.py  4
-rw-r--r--  tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py  2
-rw-r--r--  tensorflow/contrib/graph_editor/select.py  6
-rw-r--r--  tensorflow/contrib/graph_editor/subgraph.py  2
-rw-r--r--  tensorflow/contrib/layers/python/layers/feature_column.py  14
-rw-r--r--  tensorflow/contrib/layers/python/layers/feature_column_ops_test.py  2
-rw-r--r--  tensorflow/contrib/layers/python/layers/regularizers.py  2
-rw-r--r--  tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py  12
-rw-r--r--  tensorflow/contrib/learn/python/learn/estimators/estimator.py  2
-rw-r--r--  tensorflow/contrib/learn/python/learn/estimators/kmeans.py  2
-rw-r--r--  tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py  2
-rw-r--r--  tensorflow/contrib/learn/python/learn/estimators/run_config.py  2
-rw-r--r--  tensorflow/contrib/learn/python/learn/estimators/svm.py  6
-rw-r--r--  tensorflow/contrib/learn/python/learn/learn_runner.py  2
-rw-r--r--  tensorflow/contrib/learn/python/learn/metric_spec.py  2
-rw-r--r--  tensorflow/contrib/linear_optimizer/python/sdca_estimator.py  8
-rw-r--r--  tensorflow/contrib/metrics/python/ops/metric_ops.py  14
-rw-r--r--  tensorflow/contrib/resampler/python/ops/resampler_ops.py  2
-rw-r--r--  tensorflow/contrib/resampler/python/ops/resampler_ops_test.py  2
-rw-r--r--  tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py  4
-rw-r--r--  tensorflow/contrib/rnn/python/ops/rnn_cell.py  2
-rw-r--r--  tensorflow/contrib/session_bundle/bundle_shim.py  4
-rw-r--r--  tensorflow/contrib/slim/python/slim/data/parallel_reader.py  2
-rw-r--r--  tensorflow/contrib/solvers/python/ops/lanczos.py  2
-rw-r--r--  tensorflow/contrib/tensor_forest/hybrid/python/models/stochastic_soft_decisions_to_data_then_nn.py  2
-rw-r--r--  tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py  2
-rw-r--r--  tensorflow/core/kernels/BUILD  2
-rw-r--r--  tensorflow/core/kernels/pooling_ops_3d.cc  28
-rw-r--r--  tensorflow/core/kernels/pooling_ops_3d.h  14
-rw-r--r--  tensorflow/core/kernels/pooling_ops_3d_sycl.h  759
-rw-r--r--  tensorflow/core/ops/image_ops.cc  6
-rw-r--r--  tensorflow/core/platform/default/gpu_tracer.cc  2
-rw-r--r--  tensorflow/core/profiler/internal/tfprof_node.cc  13
-rw-r--r--  tensorflow/docs_src/extend/adding_an_op.md  10
-rw-r--r--  tensorflow/examples/image_retraining/label_image.py  2
-rw-r--r--  tensorflow/examples/image_retraining/retrain.py  8
-rw-r--r--  tensorflow/examples/learn/text_classification.py  4
-rw-r--r--  tensorflow/python/client/session_test.py  4
-rw-r--r--  tensorflow/python/debug/lib/debug_data.py  2
-rw-r--r--  tensorflow/python/debug/lib/debug_gradients.py  2
-rw-r--r--  tensorflow/python/debug/lib/stepper_test.py  2
-rw-r--r--  tensorflow/python/estimator/canned/head.py  2
-rw-r--r--  tensorflow/python/estimator/inputs/queues/feeding_functions.py  2
-rw-r--r--  tensorflow/python/estimator/model_fn.py  8
-rw-r--r--  tensorflow/python/framework/function.py  2
-rw-r--r--  tensorflow/python/framework/ops.py  2
-rw-r--r--  tensorflow/python/ops/distributions/bernoulli.py  2
-rw-r--r--  tensorflow/python/ops/metrics_impl.py  16
-rw-r--r--  tensorflow/python/ops/nn_ops.py  8
-rw-r--r--  tensorflow/python/ops/rnn.py  2
-rw-r--r--  tensorflow/python/ops/sparse_ops.py  6
-rw-r--r--  tensorflow/python/ops/variables.py  2
-rw-r--r--  tensorflow/python/summary/text_summary.py  2
-rw-r--r--  tensorflow/python/training/monitored_session.py  2
-rw-r--r--  tensorflow/python/training/optimizer.py  8
-rw-r--r--  tensorflow/python/util/nest.py  4
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_pip_packages.sh  2
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh  2
-rwxr-xr-x  tensorflow/tools/docker/parameterized_docker_build.sh  2
-rw-r--r--  third_party/gpus/cuda_configure.bzl  2
65 files changed, 912 insertions, 138 deletions
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
index 2b64235bb2..51e084b79c 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
@@ -662,7 +662,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
apply_averaging=False,
center_bias=False,
reduce_dim=False))
- # The first example will get bias class 1 -0.2 and -2 for class 2 from
+ # The first example will get bias class 1 -0.2 and -2 for class 2 from
# first tree and leaf 2 payload (sparse feature missing) of 0.5 hence
# 0.5, -0.2], the second example will get the same bias and leaf 3 payload
# of class 1 1.2 and class 2-0.7 hence [0.0, 1.0, -2.7].
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses_test.py b/tensorflow/contrib/boosted_trees/python/utils/losses_test.py
index 1b8d8ac43e..886e5ce2c8 100644
--- a/tensorflow/contrib/boosted_trees/python/utils/losses_test.py
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses_test.py
@@ -59,7 +59,7 @@ class LossesTest(test_util.TensorFlowTestCase):
pos_loss = loss_for_positives.eval()
neg_loss = loss_for_negatives.eval()
# For positive labels, points <= 0.3 get max loss of e.
- # For negative labels, these points have minimum loss of 1/e.
+ # For negative labels, these points have minimum loss of 1/e.
for i in range(2):
self.assertEqual(math.exp(1), pos_loss[i])
self.assertEqual(math.exp(-1), neg_loss[i])
diff --git a/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py b/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py
index cc8644bfd5..76c6bc05ff 100644
--- a/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py
+++ b/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py
@@ -48,7 +48,7 @@ class BigQueryReader(io_ops.ReaderBase):
features=features)
# Populate a queue with the BigQuery Table partitions.
- queue = tf.training.string_input_producer(reader.partitions())
+ queue = tf.train.string_input_producer(reader.partitions())
# Read and parse examples.
row_id, examples_serialized = reader.read(queue)
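A quick check of the corrected symbol: in the TF 1.x API the queue-based
input producers live under `tf.train`, not `tf.training`. A minimal sketch
(with placeholder partition strings, since a real `BigQueryReader` needs GCP
credentials):

    import tensorflow as tf

    # Stand-in for reader.partitions(); real values come from BigQuery.
    partitions = tf.constant(["partition-0", "partition-1"])
    queue = tf.train.string_input_producer(partitions)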
diff --git a/tensorflow/contrib/data/python/util/nest.py b/tensorflow/contrib/data/python/util/nest.py
index a29c3c562b..6e9d588d79 100644
--- a/tensorflow/contrib/data/python/util/nest.py
+++ b/tensorflow/contrib/data/python/util/nest.py
@@ -242,7 +242,7 @@ def map_structure(func, *structure, **check_types_dict):
*structure: scalar, or tuple or list of constructed scalars and/or other
tuples/lists, or scalars. Note: numpy arrays are considered scalars.
**check_types_dict: only valid keyword argument is `check_types`. If set to
- `True` (default) the types of iterables within the structures have to be
+ `True` (default) the types of iterables within the structures have to be
same (e.g. `map_structure(func, [1], (1,))` raises a `TypeError`
exception). To allow this set this argument to `False`.
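A minimal sketch of the `check_types` behavior described above, using the
equivalent core `tensorflow.python.util.nest` module:

    from tensorflow.python.util import nest

    nest.map_structure(lambda x: x * 2, (1, [2, 3]))
    # -> (2, [4, 6])

    # Mixing list and tuple raises TypeError unless check_types=False.
    nest.map_structure(lambda x, y: x + y, [1], (1,), check_types=False)
    # -> [2]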
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
index 6a5c37da16..d8698788c1 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
@@ -349,7 +349,7 @@ class Affine(bijector.Bijector):
y *= self._scale
if self.shift is not None:
return y + self.shift
- return y
+ return y
y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
y, expand_batch_dim=False)
with ops.control_dependencies(self._maybe_check_scale() if
diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py
index 516d7b60fe..5fb6f0c7ea 100644
--- a/tensorflow/contrib/distributions/python/ops/shape.py
+++ b/tensorflow/contrib/distributions/python/ops/shape.py
@@ -364,7 +364,7 @@ class _DistributionShape(object):
"""Reshapes/transposes `Distribution` `Tensor` from S+B+E to B_+E_+S_.
Where:
- - `B_ = B if B or not expand_batch_dim else [1]`,
+ - `B_ = B if B or not expand_batch_dim else [1]`,
- `E_ = E if E else [1]`,
- `S_ = [tf.reduce_prod(S)]`.
@@ -402,7 +402,7 @@ class _DistributionShape(object):
"""Reshapes/transposes `Distribution` `Tensor` from B_+E_+S_ to S+B+E.
Where:
- - `B_ = B if B or not expand_batch_dim else [1]`,
+ - `B_ = B if B or not expand_batch_dim else [1]`,
- `E_ = E if E else [1]`,
- `S_ = [tf.reduce_prod(S)]`.
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
index 41fd114f0f..41f986dd07 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
@@ -8,7 +8,7 @@
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
diff --git a/tensorflow/contrib/graph_editor/select.py b/tensorflow/contrib/graph_editor/select.py
index 706c409118..3ea6ff4d61 100644
--- a/tensorflow/contrib/graph_editor/select.py
+++ b/tensorflow/contrib/graph_editor/select.py
@@ -620,7 +620,7 @@ def select_ops(*args, **kwargs):
"""Helper to select operations.
Args:
- *args: list of 1) regular expressions (compiled or not) or 2) (array of)
+ *args: list of 1) regular expressions (compiled or not) or 2) (array of)
`tf.Operation`. `tf.Tensor` instances are silently ignored.
**kwargs: 'graph': `tf.Graph` in which to perform the regex query.This is
required when using regex.
@@ -686,7 +686,7 @@ def select_ts(*args, **kwargs):
"""Helper to select tensors.
Args:
- *args: list of 1) regular expressions (compiled or not) or 2) (array of)
+ *args: list of 1) regular expressions (compiled or not) or 2) (array of)
`tf.Tensor`. `tf.Operation` instances are silently ignored.
**kwargs: 'graph': `tf.Graph` in which to perform the regex query.This is
required when using regex.
@@ -752,7 +752,7 @@ def select_ops_and_ts(*args, **kwargs):
"""Helper to select operations and tensors.
Args:
- *args: list of 1) regular expressions (compiled or not) or 2) (array of)
+ *args: list of 1) regular expressions (compiled or not) or 2) (array of)
`tf.Operation` 3) (array of) tf.Tensor. Regular expressions matching
tensors must start with the comment `"(?#ts)"`, for instance:
`"(?#ts)^foo/.*"`.
diff --git a/tensorflow/contrib/graph_editor/subgraph.py b/tensorflow/contrib/graph_editor/subgraph.py
index 18485f2039..6650e996d7 100644
--- a/tensorflow/contrib/graph_editor/subgraph.py
+++ b/tensorflow/contrib/graph_editor/subgraph.py
@@ -630,7 +630,7 @@ def make_view(*args, **kwargs):
"""Create a SubGraphView from selected operations and passthrough tensors.
Args:
- *args: list of 1) regular expressions (compiled or not) or 2) (array of)
+ *args: list of 1) regular expressions (compiled or not) or 2) (array of)
`tf.Operation` 3) (array of) `tf.Tensor`. Those objects will be converted
into a list of operations and a list of candidate for passthrough tensors.
**kwargs: keyword graph is used 1) to check that the ops and ts are from
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 0ba9873f3a..409f7d27bd 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -15,13 +15,13 @@
"""This API defines FeatureColumn abstraction.
FeatureColumns provide a high level abstraction for ingesting and representing
-features in tf.learn Estimator models.
+features in `Estimator` models.
FeatureColumns are the primary way of encoding features for pre-canned
-tf.learn Estimators.
+`Estimator` models.
-When using FeatureColumns with tf.learn models, the type of feature column you
-should choose depends on (1) the feature type and (2) the model type.
+When using FeatureColumns with `Estimator` models, the type of feature column
+you should choose depends on (1) the feature type and (2) the model type.
(1) Feature type:
@@ -74,7 +74,7 @@ should choose depends on (1) the feature type and (2) the model type.
columns=[department_column, bucketized_age_column],
hash_bucket_size=1000)
-Example of building tf.learn model using FeatureColumns:
+Example of building an `Estimator` model using FeatureColumns:
# Define features and transformations
deep_feature_columns = [age_column, embedded_dept_column]
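A minimal sketch (tf.contrib.layers API, hypothetical boundaries and
dimensions) of the column definitions the example above assumes:

    from tensorflow.contrib import layers

    age_column = layers.real_valued_column("age")
    bucketized_age_column = layers.bucketized_column(
        age_column, boundaries=[18, 25, 35, 50])
    department_column = layers.sparse_column_with_hash_bucket(
        "department", hash_bucket_size=1000)
    embedded_dept_column = layers.embedding_column(
        department_column, dimension=16)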
@@ -104,7 +104,7 @@ FeatureColumns can also be transformed into a generic input layer for
custom models using `input_from_feature_columns` within
`feature_column_ops.py`.
-Example of building non-tf.learn model using FeatureColumns:
+Example of building a non-`Estimator` model using FeatureColumns:
# Building model via layers
@@ -1184,7 +1184,7 @@ def _embeddings_from_arguments(column,
raise ValueError(
"The embedding variable with name {} already "
"exists, but its shape does not match required "
- "embedding shape here. Please make sure to use "
+ "embedding shape here. Please make sure to use "
"different shared_embedding_name for different "
"shared embeddings.".format(args.shared_embedding_name))
else:
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index 797a7c11db..fbfa0e32de 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -1173,7 +1173,7 @@ class CreateInputLayersForDNNsTest(test.TestCase):
features, [real_valued, bucket, embeded_sparse],
weight_collections=["my_collection"],
trainable=True)
- # There should one trainable variable for embeded sparse
+ # There should one trainable variable for embeded sparse
self.assertEqual(1, len(variables_lib.trainable_variables()))
def testInputLayerWithNonTrainableEmbeddingForDNN(self):
diff --git a/tensorflow/contrib/layers/python/layers/regularizers.py b/tensorflow/contrib/layers/python/layers/regularizers.py
index 2c3774c560..95d57e6186 100644
--- a/tensorflow/contrib/layers/python/layers/regularizers.py
+++ b/tensorflow/contrib/layers/python/layers/regularizers.py
@@ -119,7 +119,7 @@ def l1_l2_regularizer(scale_l1=1.0, scale_l2=1.0, scope=None):
Returns:
A function with signature `l1_l2(weights)` that applies a weighted sum of
- L1 L2 regularization.
+ L1 L2 regularization.
Raises:
ValueError: If scale is negative or if scale is not a float.
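A minimal usage sketch (the L2 term follows `tf.nn.l2_loss`, i.e.
`sum(w**2) / 2`):

    import tensorflow as tf
    from tensorflow.contrib import layers

    regularizer = layers.l1_l2_regularizer(scale_l1=0.5, scale_l2=1.0)
    weights = tf.get_variable("weights", shape=[3, 3])
    penalty = regularizer(weights)  # 0.5*sum(|w|) + 1.0*sum(w**2)/2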
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py b/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py
index f316c5c980..a19426a248 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py
@@ -600,7 +600,7 @@ class TensorFlowDataFrame(df.DataFrame):
shuffle=True,
seed=None,
data_name="pandas_data"):
- """Create a `tf.learn.DataFrame` from a `pandas.DataFrame`.
+ """Create a `DataFrame` from a `pandas.DataFrame`.
Args:
pandas_dataframe: `pandas.DataFrame` that serves as a data source.
@@ -615,7 +615,7 @@ class TensorFlowDataFrame(df.DataFrame):
data_name: a scope name identifying the data.
Returns:
- A `tf.learn.DataFrame` that contains batches drawn from the given
+ A `DataFrame` that contains batches drawn from the given
`pandas_dataframe`.
"""
pandas_source = in_memory_source.PandasSource(
@@ -643,7 +643,7 @@ class TensorFlowDataFrame(df.DataFrame):
shuffle=True,
seed=None,
data_name="numpy_data"):
- """Creates a `tf.learn.DataFrame` from a `numpy.ndarray`.
+ """Creates a `DataFrame` from a `numpy.ndarray`.
The returned `DataFrame` contains two columns: 'index' and 'value'. The
'value' column contains a row from the array. The 'index' column contains
@@ -662,7 +662,7 @@ class TensorFlowDataFrame(df.DataFrame):
data_name: a scope name identifying the data.
Returns:
- A `tf.learn.DataFrame` that contains batches drawn from the given
+ A `DataFrame` that contains batches drawn from the given
array.
"""
numpy_source = in_memory_source.NumpySource(
@@ -690,7 +690,7 @@ class TensorFlowDataFrame(df.DataFrame):
shuffle=True,
seed=None,
data_name="numpy_data"):
- """Creates a `tf.learn.DataFrame` from an `OrderedDict` of `numpy.ndarray`.
+ """Creates a `DataFrame` from an `OrderedDict` of `numpy.ndarray`.
The returned `DataFrame` contains a column for each key of the dict plus an
extra 'index' column. The 'index' column contains the row number. Each of
@@ -710,7 +710,7 @@ class TensorFlowDataFrame(df.DataFrame):
data_name: a scope name identifying the data.
Returns:
- A `tf.learn.DataFrame` that contains batches drawn from the given arrays.
+ A `DataFrame` that contains batches drawn from the given arrays.
Raises:
ValueError: `ordered_dict_of_arrays` contains the reserved name 'index'.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index c184b14654..7c72e516c9 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -363,7 +363,7 @@ class BaseEstimator(
sklearn.BaseEstimator, evaluable.Evaluable, trainable.Trainable):
"""Abstract BaseEstimator class to train and evaluate TensorFlow models.
- Users should not instantiate or subclass this class. Instead, use `Estimator`.
+ Users should not instantiate or subclass this class. Instead, use an `Estimator`.
"""
__metaclass__ = abc.ABCMeta
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
index a473cf46d5..a92302420f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
-"""Implementation of k-means clustering on top of tf.learn API."""
+"""Implementation of k-means clustering on top of `Estimator` API."""
from __future__ import absolute_import
from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
index d03057087c..fb339160d5 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
@@ -111,7 +111,7 @@ def LogisticRegressor( # pylint: disable=invalid-name
into the model.
Returns:
- A `tf.contrib.learn.Estimator` instance.
+ An `Estimator` instance.
"""
return estimator.Estimator(
model_fn=_get_model_fn_with_logistic_metrics(model_fn),
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 0642c5900a..060c7a37be 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -64,7 +64,7 @@ class TaskType(object):
class ClusterConfig(object):
"""This class specifies the configurations for a distributed run.
- If you're using `tf.learn` `Estimators`, you should probably use the subclass
+ If you're using an `Estimator`, you should probably use the subclass
RunConfig instead.
"""
diff --git a/tensorflow/contrib/learn/python/learn/estimators/svm.py b/tensorflow/contrib/learn/python/learn/estimators/svm.py
index 5a991da891..72920d73c0 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/svm.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/svm.py
@@ -43,9 +43,9 @@ class SVM(estimator.Estimator):
num_loss_partitions is larger or equal to this value, convergence is
guaranteed but becomes slower as num_loss_partitions increases. If it is set
to a smaller value, the optimizer is more aggressive in reducing the global
- loss but convergence is not guaranteed. The recommended value in tf.learn
- (where there is one process per worker) is the number of workers running the
- train steps. It defaults to 1 (single machine).
+ loss but convergence is not guaranteed. The recommended value in an
+ `Estimator` (where there is one process per worker) is the number of workers
+ running the train steps. It defaults to 1 (single machine).
Example:
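(The docstring's own example falls outside this hunk; a minimal hedged
sketch of the recommendation above, with hypothetical feature columns and
worker count, might look like:)

    estimator = svm.SVM(example_id_column="example_id",
                        feature_columns=[real_feature_column],
                        num_loss_partitions=num_workers)  # one per worker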
diff --git a/tensorflow/contrib/learn/python/learn/learn_runner.py b/tensorflow/contrib/learn/python/learn/learn_runner.py
index bd9fcde8ce..8d84d230da 100644
--- a/tensorflow/contrib/learn/python/learn/learn_runner.py
+++ b/tensorflow/contrib/learn/python/learn/learn_runner.py
@@ -159,7 +159,7 @@ def run(experiment_fn, output_dir=None, schedule=None, run_config=None,
constructor; `hparams` used as the hyper-paremeters of the model).
It must return an `Experiment`. For this case, `output_dir` must be None.
output_dir: Base output directory [Deprecated].
- schedule: The name of the method in the `Experiment` to run.
+ schedule: The name of the method in the `Experiment` to run.
run_config: `RunConfig` instance. The `run_config.model_dir` must be
non-empty. If `run_config` is set, `output_dir` must be None.
hparams: `HParams` instance. The default hyper-parameters, which will be
diff --git a/tensorflow/contrib/learn/python/learn/metric_spec.py b/tensorflow/contrib/learn/python/learn/metric_spec.py
index 2f2c931786..ed6683abed 100644
--- a/tensorflow/contrib/learn/python/learn/metric_spec.py
+++ b/tensorflow/contrib/learn/python/learn/metric_spec.py
@@ -233,7 +233,7 @@ class MetricSpec(object):
`Estimator` then knows which predictions, labels, and weight to use to call a
given metric function.
- When building the ops to run in evaluation, `Estimator` will call
+ When building the ops to run in evaluation, an `Estimator` will call
`create_metric_ops`, which will connect the given `metric_fn` to the model
as detailed in the docstring for `create_metric_ops`, and return the metric.
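A minimal sketch of wiring a metric into evaluation through `MetricSpec`
(assuming a model whose predictions dict has a "classes" key):

    from tensorflow.contrib import metrics as metrics_lib
    from tensorflow.contrib.learn import MetricSpec

    eval_metrics = {
        "accuracy": MetricSpec(metric_fn=metrics_lib.streaming_accuracy,
                               prediction_key="classes")
    }
    # estimator.evaluate(input_fn=eval_input_fn, metrics=eval_metrics)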
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
index f4961ab9db..701fc1c059 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
@@ -227,7 +227,7 @@ class _SDCAEstimator(estimator.Estimator):
will be multiplied by the loss of the example.
model_dir: Directory to save model parameters, graph etc. This can also be
used to load checkpoints from the directory into an estimator to
- continue training a previously saved model.
+ continue training a previously saved model.
head: type of head. Currently, _BinaryLogisticHead and _BinarySvmHead are
supported for classification and _RegressionHead for regression. It
should be a subclass of _SingleHead.
@@ -312,7 +312,7 @@ class SDCALogisticClassifier(_SDCAEstimator):
```
The input_fn provided to `fit`, `evaluate` and predict_* methods should return
- the following features, otherwise there will be a `KeyError`:
+ the following features, otherwise there will be a `KeyError`:
* A feature with `key=example_id_column` whose value is a `Tensor` of dtype
string.
* If `weight_column_name` is not `None`, a feature with
@@ -438,7 +438,7 @@ class SDCALinearRegressor(_SDCAEstimator):
```
The input_fn provided to `fit`, `evaluate` and predict_* methods should return
- the following features, otherwise there will be a `KeyError`:
+ the following features, otherwise there will be a `KeyError`:
* A feature with `key=example_id_column` whose value is a `Tensor` of dtype
string.
* If `weight_column_name` is not `None`, a feature with
@@ -478,7 +478,7 @@ class SDCALinearRegressor(_SDCAEstimator):
will be multiplied by the loss of the example.
model_dir: Directory to save model parameters, graph etc. This can also be
used to load checkpoints from the directory into an estimator to
- continue training a previously saved model.
+ continue training a previously saved model.
l1_regularization: L1-regularization parameter. Refers to global L1
regularization (across all examples).
l2_regularization: L2-regularization parameter. Refers to global L2
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 6d66a257bc..b5d8c95678 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -334,7 +334,7 @@ def streaming_mean(values, weights=None, metrics_collections=None,
returned as `mean` which is an idempotent operation that simply divides
`total` by `count`.
- For estimation of the metric over a stream of data, the function creates an
+ For estimation of the metric over a stream of data, the function creates an
`update_op` operation that updates these variables and returns the `mean`.
`update_op` increments `total` with the reduced sum of the product of `values`
and `weights`, and it increments `count` with the reduced sum of `weights`.
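A minimal sketch of the total/count bookkeeping described above:

    import tensorflow as tf

    values = tf.placeholder(tf.float32)
    mean, update_op = tf.contrib.metrics.streaming_mean(values)

    with tf.Session() as sess:
      sess.run(tf.local_variables_initializer())
      sess.run(update_op, feed_dict={values: [1.0, 2.0]})
      sess.run(update_op, feed_dict={values: [3.0, 4.0]})
      print(sess.run(mean))  # 2.5, since total=10.0 and count=4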
@@ -381,7 +381,7 @@ def streaming_mean_tensor(values, weights=None, metrics_collections=None,
`values`. This average is ultimately returned as `mean` which is an idempotent
operation that simply divides `total` by `count`.
- For estimation of the metric over a stream of data, the function creates an
+ For estimation of the metric over a stream of data, the function creates an
`update_op` operation that updates these variables and returns the `mean`.
`update_op` increments `total` with the reduced sum of the product of `values`
and `weights`, and it increments `count` with the reduced sum of `weights`.
@@ -425,7 +425,7 @@ def streaming_accuracy(predictions, labels, weights=None,
matches `labels`. This frequency is ultimately returned as `accuracy`: an
idempotent operation that simply divides `total` by `count`.
- For estimation of the metric over a stream of data, the function creates an
+ For estimation of the metric over a stream of data, the function creates an
`update_op` operation that updates these variables and returns the `accuracy`.
Internally, an `is_correct` operation computes a `Tensor` with elements 1.0
where the corresponding elements of `predictions` and `labels` match and 0.0
@@ -477,7 +477,7 @@ def streaming_precision(predictions, labels, weights=None,
operation that simply divides `true_positives` by the sum of `true_positives`
and `false_positives`.
- For estimation of the metric over a stream of data, the function creates an
+ For estimation of the metric over a stream of data, the function creates an
`update_op` operation that updates these variables and returns the
`precision`. `update_op` weights each prediction by the corresponding value in
`weights`.
@@ -526,7 +526,7 @@ def streaming_recall(predictions, labels, weights=None,
ultimately returned as `recall`, an idempotent operation that simply divides
`true_positives` by the sum of `true_positives` and `false_negatives`.
- For estimation of the metric over a stream of data, the function creates an
+ For estimation of the metric over a stream of data, the function creates an
`update_op` that updates these variables and returns the `recall`. `update_op`
weights each prediction by the corresponding value in `weights`.
@@ -1571,7 +1571,7 @@ def streaming_sparse_average_precision_at_k(predictions,
Returns:
mean_average_precision: Scalar `float64` `Tensor` with the mean average
precision values.
- update: `Operation` that increments variables appropriately, and whose
+ update: `Operation` that increments variables appropriately, and whose
value matches `metric`.
"""
return metrics.sparse_average_precision_at_k(
@@ -1628,7 +1628,7 @@ def streaming_sparse_average_precision_at_top_k(top_k_predictions,
Returns:
mean_average_precision: Scalar `float64` `Tensor` with the mean average
precision values.
- update: `Operation` that increments variables appropriately, and whose
+ update: `Operation` that increments variables appropriately, and whose
value matches `metric`.
Raises:
diff --git a/tensorflow/contrib/resampler/python/ops/resampler_ops.py b/tensorflow/contrib/resampler/python/ops/resampler_ops.py
index 355d15f0c7..8b632527f6 100644
--- a/tensorflow/contrib/resampler/python/ops/resampler_ops.py
+++ b/tensorflow/contrib/resampler/python/ops/resampler_ops.py
@@ -9,7 +9,7 @@
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
diff --git a/tensorflow/contrib/resampler/python/ops/resampler_ops_test.py b/tensorflow/contrib/resampler/python/ops/resampler_ops_test.py
index 6a4360150c..9aa1e05628 100644
--- a/tensorflow/contrib/resampler/python/ops/resampler_ops_test.py
+++ b/tensorflow/contrib/resampler/python/ops/resampler_ops_test.py
@@ -9,7 +9,7 @@
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py
index e0d063a1b6..eda8cb3c12 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py
@@ -156,7 +156,7 @@ class StackBidirectionalRNNTest(test.TestCase):
# - Reset states, and iterate for 5 steps. Last state is state_5.
# - Reset the sets to state_3 and iterate for 2 more steps,
# last state will be state_5'.
- # - Check that the state_5 and state_5' (forward and backward) are the
+ # - Check that the state_5 and state_5' (forward and backward) are the
# same for the first layer (it does not apply for the second layer since
# it has forward-backward dependencies).
with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
@@ -340,7 +340,7 @@ class StackBidirectionalRNNTest(test.TestCase):
# - Reset states, and iterate for 5 steps. Last state is state_5.
# - Reset the sets to state_3 and iterate for 2 more steps,
# last state will be state_5'.
- # - Check that the state_5 and state_5' (forward and backward) are the
+ # - Check that the state_5 and state_5' (forward and backward) are the
# same for the first layer (it does not apply for the second layer since
# it has forward-backward dependencies).
with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index ecce1d22f0..090d28a078 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -2026,7 +2026,7 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
Here output_dim is:
num_proj if num_proj was set,
num_units otherwise.
- - LSTMStateTuple representing the new state of G-LSTM cell
+ - LSTMStateTuple representing the new state of G-LSTM cell
after reading `inputs` when the previous state was `state`.
Raises:
diff --git a/tensorflow/contrib/session_bundle/bundle_shim.py b/tensorflow/contrib/session_bundle/bundle_shim.py
index 0aeda0f0de..062c9cc680 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim.py
+++ b/tensorflow/contrib/session_bundle/bundle_shim.py
@@ -39,7 +39,7 @@ def _add_input_to_signature_def(tensor_name, map_key, signature_def):
Args:
tensor_name: string name of tensor to add to signature_def inputs
map_key: string key to key into signature_def inputs map
- signature_def: object of type meta_graph_pb2.SignatureDef()
+ signature_def: object of type meta_graph_pb2.SignatureDef()
Sideffect:
adds a TensorInfo with tensor_name to signature_def inputs map keyed with
@@ -55,7 +55,7 @@ def _add_output_to_signature_def(tensor_name, map_key, signature_def):
Args:
tensor_name: string name of tensor to add to signature_def outputs
map_key: string key to key into signature_def outputs map
- signature_def: object of type meta_graph_pb2.SignatureDef()
+ signature_def: object of type meta_graph_pb2.SignatureDef()
Sideffect:
adds a TensorInfo with tensor_name to signature_def outputs map keyed with
diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
index 6082af008a..e97f500572 100644
--- a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
+++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
@@ -175,7 +175,7 @@ def parallel_read(data_sources,
scope=None):
"""Reads multiple records in parallel from data_sources using n readers.
- It uses a ParallelReader to read from multiple files in parallel using
+ It uses a ParallelReader to read from multiple files in parallel using
multiple readers created using `reader_class` with `reader_kwargs'.
If shuffle is True the common_queue would be a RandomShuffleQueue otherwise
diff --git a/tensorflow/contrib/solvers/python/ops/lanczos.py b/tensorflow/contrib/solvers/python/ops/lanczos.py
index 8631002a53..af1b293743 100644
--- a/tensorflow/contrib/solvers/python/ops/lanczos.py
+++ b/tensorflow/contrib/solvers/python/ops/lanczos.py
@@ -206,7 +206,7 @@ def bidiag_matmul(matrix, alpha, beta, adjoint_b=False, name="bidiag_matmul"):
A * B = A[:, :-1] * diag(alpha) + A[:, 1:] * diag(beta)
- If adjoint_b is True, computes A * B[:-1, :]' as follows
+ If adjoint_b is True, computes A * B[:-1, :]' as follows
A * B[:-1, :]' =
A * diag(alpha) + [zeros(m,1), A[:, :-1] * diag(beta[:-1])]
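A quick NumPy check of the adjoint_b=False identity above, assuming B is the
(k+1) x k matrix with alpha on the main diagonal and beta on the first
subdiagonal (which is what the identity implies):

    import numpy as np

    m, k = 5, 4
    A = np.random.randn(m, k + 1)
    alpha, beta = np.random.randn(k), np.random.randn(k)

    B = np.zeros((k + 1, k))
    B[:k, :] += np.diag(alpha)   # main diagonal
    B[1:, :] += np.diag(beta)    # first subdiagonal

    lhs = A.dot(B)
    rhs = A[:, :-1].dot(np.diag(alpha)) + A[:, 1:].dot(np.diag(beta))
    assert np.allclose(lhs, rhs)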
diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/models/stochastic_soft_decisions_to_data_then_nn.py b/tensorflow/contrib/tensor_forest/hybrid/python/models/stochastic_soft_decisions_to_data_then_nn.py
index 2db4249f5d..c2f3f60341 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/python/models/stochastic_soft_decisions_to_data_then_nn.py
+++ b/tensorflow/contrib/tensor_forest/hybrid/python/models/stochastic_soft_decisions_to_data_then_nn.py
@@ -20,7 +20,7 @@ from __future__ import print_function
from tensorflow.contrib.tensor_forest.hybrid.python.layers import decisions_to_data
from tensorflow.contrib.tensor_forest.hybrid.python.layers import fully_connected
from tensorflow.contrib.tensor_forest.hybrid.python.models import hard_decisions_to_data_then_nn
-from tensorflow.python.training import adagrad
+from tensorflow.python.training import adagrad
class StochasticSoftDecisionsToDataThenNN(
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 9312070e52..c21900a0c8 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -1814,7 +1814,7 @@ def _reconstruct_sparse_tensor_seq(sequence,
Counter-part of `_flatten_tensor` which is called on the input of
`_restore_sparse` while this method is called on the output of it.
- Together they work around the limitation of `_restore_sparse` to only
+ Together they work around the limitation of `_restore_sparse` to only
accept 1D handles.
The `indices` in `sp_tensor` is a 2D `Tensor` of `shape [N, ndims]`, where
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 6887d8cfb6..fdf45beed3 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -3035,7 +3035,7 @@ tf_kernel_library(
"maxpooling_op.h",
"pooling_ops_3d.h",
"pooling_ops_common.h",
- ],
+ ] + if_sycl(["pooling_ops_3d_sycl.h"]),
gpu_srcs = [
"avgpooling_op.h",
"avgpooling_op_gpu.cu.cc",
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index 538dca24ae..a406317213 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -37,10 +37,18 @@ limitations under the License.
#include "tensorflow/core/kernels/cudnn_pooling_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#endif
+
+#ifdef TENSORFLOW_USE_SYCL
+#include "tensorflow/core/kernels/pooling_ops_3d_sycl.h"
+#endif // TENSORFLOW_USE_SYCL
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
Pool3dParameters::Pool3dParameters(OpKernelContext* context,
const std::vector<int32>& ksize,
@@ -89,11 +97,6 @@ TensorShape Pool3dParameters::forward_output_shape() {
{{out_plane, out_height, out_width}}, depth);
}
-enum PoolingType { MAX, AVG };
-
-template <typename Device, typename T, PoolingType Type>
-struct LaunchPoolingOp;
-
template <typename T>
struct LaunchPoolingOp<CPUDevice, T, AVG> {
static void launch(OpKernelContext* context, const Tensor& tensor_in,
@@ -200,9 +203,6 @@ class Pooling3DOp : public UnaryOp<T> {
TensorFormat data_format_;
};
-template <typename Device, typename T>
-struct LaunchMaxPooling3dGradOp;
-
template <typename T>
struct LaunchMaxPooling3dGradOp<CPUDevice, T> {
static void launch(OpKernelContext* context, const Tensor& tensor_in,
@@ -377,9 +377,6 @@ class MaxPooling3dGradOp : public OpKernel {
TensorFormat data_format_;
};
-template <typename Device, typename T>
-struct LaunchAvgPooling3dGradOp;
-
template <typename T>
struct LaunchAvgPooling3dGradOp<CPUDevice, T> {
static void launch(OpKernelContext* context,
@@ -541,9 +538,6 @@ class AvgPooling3dGradOp : public OpKernel {
TensorFormat data_format_;
};
-template <typename Device, typename T>
-struct LaunchMaxPooling3dGradGradOp;
-
template <typename T>
struct LaunchMaxPooling3dGradGradOp<CPUDevice, T> {
static void launch(OpKernelContext* context, const Pool3dParameters& params,
@@ -837,6 +831,12 @@ TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T)
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS)
+#undef REGISTER_SYCL_KERNELS
+#endif // TENSORFLOW_USE_SYCL
+
#undef REGISTER_KERNELS
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/pooling_ops_3d.h b/tensorflow/core/kernels/pooling_ops_3d.h
index 7954e2cf83..d1be3ba407 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.h
+++ b/tensorflow/core/kernels/pooling_ops_3d.h
@@ -22,6 +22,20 @@ limitations under the License.
namespace tensorflow {
+enum PoolingType { MAX, AVG };
+
+template <typename Device, typename T, PoolingType Type>
+struct LaunchPoolingOp;
+
+template <typename Device, typename T>
+struct LaunchAvgPooling3dGradOp;
+
+template <typename Device, typename T>
+struct LaunchMaxPooling3dGradOp;
+
+template <typename Device, typename T>
+struct LaunchMaxPooling3dGradGradOp;
+
// A helper class to manage sizes and shapes for 3d pooling operations.
struct Pool3dParameters {
// Updates context->status if there is an invalid input.
diff --git a/tensorflow/core/kernels/pooling_ops_3d_sycl.h b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
new file mode 100644
index 0000000000..d8cbc589a1
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
@@ -0,0 +1,759 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if !TENSORFLOW_USE_SYCL
+#error This file must only be included when building with SYCL support
+#endif
+
+#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_SYCL_H_
+#define TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_SYCL_H_
+
+#include "tensorflow/core/kernels/pooling_ops_3d.h"
+
+namespace tensorflow {
+
+typedef Eigen::SyclDevice SYCLDevice;
+
+// Helper struct to contain the various pool parameters used in the SYCL
+// pooling kernels. Similar to the Pool3dParameters, but with a number of
+// convenient constructors.
+struct SYCL3DPoolParams {
+ SYCL3DPoolParams(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols, const int out_planes,
+ const int out_rows, const int out_cols,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding)
+ : depth_(depth),
+ batch_(batch),
+ in_planes_(in_planes),
+ in_rows_(in_rows),
+ in_cols_(in_cols),
+ window_planes_(window[2]),
+ window_rows_(window[1]),
+ window_cols_(window[0]),
+ stride_planes_(stride[2]),
+ stride_rows_(stride[1]),
+ stride_cols_(stride[0]),
+ out_planes_(out_planes),
+ out_rows_(out_rows),
+ out_cols_(out_cols),
+ pad_planes_(padding[2]),
+ pad_rows_(padding[1]),
+ pad_cols_(padding[0]) {}
+
+ SYCL3DPoolParams(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols,
+ const std::array<int64, 3>& out_shape,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding)
+ : SYCL3DPoolParams(depth, batch, in_planes, in_rows, in_cols,
+ out_shape[2], out_shape[1], out_shape[0], window,
+ stride, padding) {}
+
+ SYCL3DPoolParams(const Pool3dParameters& params)
+ : depth_(params.depth),
+ batch_(params.tensor_in_batch),
+ in_planes_(params.tensor_in_planes),
+ in_rows_(params.tensor_in_rows),
+ in_cols_(params.tensor_in_cols),
+ window_planes_(params.window_planes),
+ window_rows_(params.window_rows),
+ window_cols_(params.window_cols),
+ stride_planes_(params.plane_stride),
+ stride_rows_(params.row_stride),
+ stride_cols_(params.col_stride),
+ out_planes_(params.out_plane),
+ out_rows_(params.out_height),
+ out_cols_(params.out_width),
+ pad_planes_(params.pad_planes),
+ pad_rows_(params.pad_rows),
+ pad_cols_(params.pad_cols) {}
+
+ const int depth_;
+ const int batch_;
+ const int in_planes_;
+ const int in_rows_;
+ const int in_cols_;
+
+ const int window_planes_;
+ const int window_rows_;
+ const int window_cols_;
+
+ const int stride_planes_;
+ const int stride_rows_;
+ const int stride_cols_;
+
+ const int out_planes_;
+ const int out_rows_;
+ const int out_cols_;
+
+ const int pad_planes_;
+ const int pad_rows_;
+ const int pad_cols_;
+};
+// MaxPool3d SYCL kernel. Expects the number of threads to be equal to the
+// number of elements in the output tensor.
+//
+// For each output element, find the corresponding input window and run over
+// all values in the window to find the maximum value. This value is then
+// copied into that output element.
+template <typename T>
+class MaxPool3DSYCL {
+ using write_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
+ cl::sycl::access::target::global_buffer>;
+ using read_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
+ cl::sycl::access::target::global_buffer>;
+
+ public:
+ MaxPool3DSYCL(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols, const int out_planes,
+ const int out_rows, const int out_cols,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ const read_accessor input_accessor,
+ write_accessor output_accessor)
+ : p_(depth, batch, in_planes, in_rows, in_cols, out_planes, out_rows,
+ out_cols, window, stride, padding),
+ input_accessor_(input_accessor),
+ output_accessor_(output_accessor) {}
+ void operator()(cl::sycl::item<1> item) {
+ T* input_data = ConvertToActualTypeSycl(T, input_accessor_);
+ T* output_data = ConvertToActualTypeSycl(T, output_accessor_);
+
+ int index = item.get_linear_id();
+ int n = index;
+ int d = n % p_.depth_;
+ n /= p_.depth_;
+ int cstart = (n % p_.out_cols_) * p_.stride_cols_ - p_.pad_cols_;
+ int cend = std::min(cstart + p_.window_cols_, p_.in_cols_);
+ cstart = std::max(cstart, 0);
+ n /= p_.out_cols_;
+ int rstart = (n % p_.out_rows_) * p_.stride_rows_ - p_.pad_rows_;
+ int rend = std::min(rstart + p_.window_rows_, p_.in_rows_);
+ rstart = std::max(rstart, 0);
+ n /= p_.out_rows_;
+ int pstart = (n % p_.out_planes_) * p_.stride_planes_ - p_.pad_planes_;
+ int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
+ pstart = std::max(pstart, 0);
+ n /= p_.out_planes_;
+ T maxval = Eigen::NumTraits<T>::lowest();
+ const T* input_data_n =
+ input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
+ for (int p = pstart; p < pend; ++p) {
+ for (int r = rstart; r < rend; ++r) {
+ for (int c = cstart; c < cend; ++c) {
+ int idx = ((p * p_.in_rows_ + r) * p_.in_cols_ + c) * p_.depth_ + d;
+ if (input_data_n[idx] > maxval) {
+ maxval = input_data_n[idx];
+ }
+ }
+ }
+ }
+ output_data[index] = maxval;
+ }
+
+ private:
+ const SYCL3DPoolParams p_;
+ const read_accessor input_accessor_;
+ write_accessor output_accessor_;
+};
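(A Python reference sketch, with NumPy and toy NDHWC shapes, of the index
arithmetic this kernel performs: the linear output index is peeled into
(d, c, r, p, n) by repeated modulo/divide, and the maximum is taken over the
clipped input window.)

    import numpy as np

    def max_pool_3d_ref(x, window, stride, pad):
      # x: [batch, planes, rows, cols, depth]; window/stride/pad: (p, r, c).
      n_b, p_i, r_i, c_i, d_i = x.shape
      p_o = (p_i + 2 * pad[0] - window[0]) // stride[0] + 1
      r_o = (r_i + 2 * pad[1] - window[1]) // stride[1] + 1
      c_o = (c_i + 2 * pad[2] - window[2]) // stride[2] + 1
      out = np.empty((n_b, p_o, r_o, c_o, d_i), x.dtype)
      for index in range(out.size):
        n = index
        d = n % d_i; n //= d_i
        c = n % c_o; n //= c_o
        r = n % r_o; n //= r_o
        p = n % p_o; n //= p_o
        # Window end is clipped to the input extent, start is clamped to 0.
        ps = p * stride[0] - pad[0]; pe = min(ps + window[0], p_i); ps = max(ps, 0)
        rs = r * stride[1] - pad[1]; re = min(rs + window[1], r_i); rs = max(rs, 0)
        cs = c * stride[2] - pad[2]; ce = min(cs + window[2], c_i); cs = max(cs, 0)
        out[n, p, r, c, d] = x[n, ps:pe, rs:re, cs:ce, d].max()
      return out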
+template <typename T>
+struct LaunchPoolingOp<SYCLDevice, T, MAX> {
+ static void launch(OpKernelContext* context, const Tensor& tensor_in,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ TensorFormat data_format, Padding padding_type,
+ Tensor* output) {
+ const SYCLDevice& device = context->eigen_device<SYCLDevice>();
+ const int out_planes = GetTensorDim(*output, data_format, '0');
+ const int out_rows = GetTensorDim(*output, data_format, '1');
+ const int out_cols = GetTensorDim(*output, data_format, '2');
+ const int batch = GetTensorDim(tensor_in, data_format, 'N');
+ const int in_planes = GetTensorDim(tensor_in, data_format, '0');
+ const int in_rows = GetTensorDim(tensor_in, data_format, '1');
+ const int in_cols = GetTensorDim(tensor_in, data_format, '2');
+ const int depth = GetTensorDim(tensor_in, data_format, 'C');
+
+ const int num_threads = output->NumElements();
+
+ auto input_buffer =
+ device.get_sycl_buffer(tensor_in.template flat<T>().data());
+ auto output_buffer =
+ device.get_sycl_buffer(output->template flat<T>().data());
+
+ device.sycl_queue().submit([&](cl::sycl::handler& cgh) {
+ auto input_access =
+ input_buffer.template get_access<cl::sycl::access::mode::read>(cgh);
+ auto output_access =
+ output_buffer.template get_access<cl::sycl::access::mode::write>(cgh);
+ MaxPool3DSYCL<T> max_pool(depth, batch, in_planes, in_rows, in_cols,
+ out_planes, out_rows, out_cols, window, stride,
+ padding, input_access, output_access);
+
+ cgh.parallel_for(cl::sycl::range<1>(num_threads), max_pool);
+ });
+ }
+};
+// MaxPool3DGrad SYCL kernel. Expects the number of threads to be equal to the
+// number of elements in the output backprop tenor (i.e. the number of elements
+// in the input data tensor).
+//
+// For each output backprop element we compute the possible window of values in
+// the input backprop tensor which might contribute to this element. Then for
+// each error in this window, compute the corresponding input window which was
+// pooled into that element in the output. Walk through this input window to
+// determine whether the input value is the first maximum value, and so the
+// error should be propagated back to the corresponding backprop element.
+template <typename T>
+class MaxPool3DGradSYCL {
+ using write_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
+ cl::sycl::access::target::global_buffer>;
+ using read_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
+ cl::sycl::access::target::global_buffer>;
+
+ public:
+ MaxPool3DGradSYCL(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols,
+ const std::array<int64, 3>& output_shape,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ const read_accessor input_data_accessor,
+ const read_accessor output_data_accessor,
+ const read_accessor input_backprop_accessor,
+ write_accessor output_backprop_accessor)
+ : p_(depth, batch, in_planes, in_rows, in_cols, output_shape, window,
+ stride, padding),
+ input_data_accessor_(input_data_accessor),
+ output_data_accessor_(output_data_accessor),
+ input_backprop_accessor_(input_backprop_accessor),
+ output_backprop_accessor_(output_backprop_accessor) {}
+ void operator()(cl::sycl::item<1> item) {
+ T* input_data = ConvertToActualTypeSycl(T, input_data_accessor_);
+ T* output_data = ConvertToActualTypeSycl(T, output_data_accessor_);
+ T* input_backprop = ConvertToActualTypeSycl(T, input_backprop_accessor_);
+ T* output_backprop = ConvertToActualTypeSycl(T, output_backprop_accessor_);
+
+ const int index = item.get_linear_id();
+ T output_value = 0;
+ int n = index;
+ const int d = n % p_.depth_;
+ n /= p_.depth_;
+ const int c = (n % p_.in_cols_) + p_.pad_cols_;
+ const int poolcstart =
+ (c < p_.window_cols_) ? 0 : (c - p_.window_cols_) / p_.stride_cols_ + 1;
+ const int poolcend = std::min(c / p_.stride_cols_ + 1, p_.out_cols_);
+ n /= p_.in_cols_;
+ const int r = (n % p_.in_rows_) + p_.pad_rows_;
+ const int poolrstart =
+ (r < p_.window_rows_) ? 0 : (r - p_.window_rows_) / p_.stride_rows_ + 1;
+ const int poolrend = std::min(r / p_.stride_rows_ + 1, p_.out_rows_);
+ n /= p_.in_rows_;
+ const int p = (n % p_.in_planes_) + p_.pad_planes_;
+ const int poolpstart =
+ (p < p_.window_planes_)
+ ? 0
+ : (p - p_.window_planes_) / p_.stride_planes_ + 1;
+ const int poolpend = std::min(p / p_.stride_planes_ + 1, p_.out_planes_);
+ n /= p_.in_planes_;
+ const int index_no_n =
+ index - n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
+
+ const T* input_data_n =
+ input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
+ const T* output_data_n =
+ output_data +
+ n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+ const T* input_backprop_n =
+ input_backprop +
+ n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+ for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
+ int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
+ const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
+ pstart = std::max(pstart, 0);
+
+ for (int poolr = poolrstart; poolr < poolrend; ++poolr) {
+ int rstart = poolr * p_.stride_rows_ - p_.pad_rows_;
+ const int rend = std::min(rstart + p_.window_rows_, p_.in_rows_);
+ rstart = std::max(rstart, 0);
+
+ for (int poolc = poolcstart; poolc < poolcend; ++poolc) {
+ int cstart = poolc * p_.stride_cols_ - p_.pad_cols_;
+ const int cend = std::min(cstart + p_.window_cols_, p_.in_cols_);
+ cstart = std::max(cstart, 0);
+
+ const int output_data_idx =
+ ((poolp * p_.out_rows_ + poolr) * p_.out_cols_ + poolc) *
+ p_.depth_ +
+ d;
+ bool should_continue = true;
+ bool is_max = (input_data[index] == output_data_n[output_data_idx]);
+ for (int win_p = pstart; win_p < pend && should_continue; ++win_p) {
+ for (int win_r = rstart; win_r < rend && should_continue; ++win_r) {
+ for (int win_c = cstart; win_c < cend && should_continue;
+ ++win_c) {
+ const int input_data_idx =
+ ((win_p * p_.in_rows_ + win_r) * p_.in_cols_ + win_c) *
+ p_.depth_ +
+ d;
+ if (input_data_idx == index_no_n) {
+ should_continue = false;
+ } else if (input_data_n[input_data_idx] ==
+ output_data_n[output_data_idx]) {
+ should_continue = false;
+ is_max = false;
+ }
+ }
+ }
+ }
+ if (is_max) {
+ output_value += input_backprop_n[output_data_idx];
+ }
+ }
+ }
+ }
+ output_backprop[index] = output_value;
+ }
+
+ private:
+ const SYCL3DPoolParams p_;
+
+ const read_accessor input_data_accessor_;
+ const read_accessor output_data_accessor_;
+ const read_accessor input_backprop_accessor_;
+ write_accessor output_backprop_accessor_;
+};
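(A 1-D Python sketch, with NumPy and toy sizes, of the tie-breaking rule this
kernel implements: the gradient for each pooled output flows only to the
first input element in the window that attains the maximum.)

    import numpy as np

    def max_pool_1d_grad(x, grad_out, window, stride):
      grad_in = np.zeros_like(x)
      n_out = (len(x) - window) // stride + 1
      for o in range(n_out):
        start = o * stride
        win = x[start:start + window]
        first_max = start + int(np.argmax(win))  # argmax picks the first max
        grad_in[first_max] += grad_out[o]
      return grad_in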
+template <typename T>
+struct LaunchMaxPooling3dGradOp<SYCLDevice, T> {
+ static void launch(OpKernelContext* context, const Tensor& tensor_in,
+ const Tensor& tensor_out, const Tensor& out_backprop,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& out,
+ const std::array<int64, 3>& padding,
+ TensorFormat data_format, Tensor* output) {
+ const SYCLDevice& device = context->eigen_device<SYCLDevice>();
+ const int batch = GetTensorDim(tensor_in, data_format, 'N');
+ const int in_planes = GetTensorDim(tensor_in, data_format, '0');
+ const int in_rows = GetTensorDim(tensor_in, data_format, '1');
+ const int in_cols = GetTensorDim(tensor_in, data_format, '2');
+ const int depth = GetTensorDim(tensor_in, data_format, 'C');
+
+ const int output_size = output->NumElements();
+
+ auto input_data_buffer =
+ device.get_sycl_buffer(tensor_in.template flat<T>().data());
+ auto output_data_buffer =
+ device.get_sycl_buffer(tensor_out.template flat<T>().data());
+ auto input_backprop_buffer =
+ device.get_sycl_buffer(out_backprop.template flat<T>().data());
+ auto output_backprop_buffer =
+ device.get_sycl_buffer(output->template flat<T>().data());
+
+ device.sycl_queue().submit([&](cl::sycl::handler& cgh) {
+ auto input_data_access =
+ input_data_buffer.template get_access<cl::sycl::access::mode::read>(
+ cgh);
+ auto output_data_access =
+ output_data_buffer.template get_access<cl::sycl::access::mode::read>(
+ cgh);
+ auto input_backprop_access =
+ input_backprop_buffer
+ .template get_access<cl::sycl::access::mode::read>(cgh);
+ auto output_backprop_access =
+ output_backprop_buffer
+ .template get_access<cl::sycl::access::mode::write>(cgh);
+ MaxPool3DGradSYCL<T> max_pool(
+ depth, batch, in_planes, in_rows, in_cols, out, window, stride,
+ padding, input_data_access, output_data_access, input_backprop_access,
+ output_backprop_access);
+
+ cgh.parallel_for(cl::sycl::range<1>(output_size), max_pool);
+ });
+ }
+};
+// MaxPool3DGradGrad SYCL kernel. Expects the number of threads to be equal to
+// the number of elements in the output backprop tensor, i.e. the number of
+// elements in the output tensor.
+//
+// For each element in the output backprop tensor, find the corresponding input
+// window, and compare the input and output data to find the index of the
+// maximum value in the input tensor. This is then the index of the gradient to
+// pass through to the output backprop tensor.
+template <typename T>
+class MaxPool3DGradGradSYCL {
+ using write_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
+ cl::sycl::access::target::global_buffer>;
+ using read_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
+ cl::sycl::access::target::global_buffer>;
+
+ public:
+ MaxPool3DGradGradSYCL(const Pool3dParameters& params,
+ const read_accessor input_data_accessor,
+ const read_accessor output_data_accessor,
+ const read_accessor input_backprop_accessor,
+ write_accessor output_backprop_accessor)
+ : p_(params),
+ input_data_accessor_(input_data_accessor),
+ output_data_accessor_(output_data_accessor),
+ input_backprop_accessor_(input_backprop_accessor),
+ output_backprop_accessor_(output_backprop_accessor) {}
+ void operator()(cl::sycl::item<1> item) {
+ T* input_data = ConvertToActualTypeSycl(T, input_data_accessor_);
+ T* output_data = ConvertToActualTypeSycl(T, output_data_accessor_);
+ T* input_backprop = ConvertToActualTypeSycl(T, input_backprop_accessor_);
+ T* output_backprop = ConvertToActualTypeSycl(T, output_backprop_accessor_);
+
+ int index = item.get_linear_id();
+ int n = index;
+ int d = n % p_.depth_;
+ n /= p_.depth_;
+ int cstart = (n % p_.out_cols_) * p_.stride_cols_ - p_.pad_cols_;
+ int cend = std::min(cstart + p_.window_cols_, p_.in_cols_);
+ cstart = std::max(cstart, 0);
+ n /= p_.out_cols_;
+ int rstart = (n % p_.out_rows_) * p_.stride_rows_ - p_.pad_rows_;
+ int rend = std::min(rstart + p_.window_rows_, p_.in_rows_);
+ rstart = std::max(rstart, 0);
+ n /= p_.out_rows_;
+ int pstart = (n % p_.out_planes_) * p_.stride_planes_ - p_.pad_planes_;
+ int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
+ pstart = std::max(pstart, 0);
+ n /= p_.out_planes_;
+ int maxidx = -1;
+ bool should_stop = false;
+ const T* input_data_n =
+ input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
+ for (int p = pstart; p < pend && !should_stop; ++p) {
+ for (int r = rstart; r < rend && !should_stop; ++r) {
+ for (int c = cstart; c < cend && !should_stop; ++c) {
+ int idx = ((p * p_.in_rows_ + r) * p_.in_cols_ + c) * p_.depth_ + d;
+ if (output_data[index] == input_data_n[idx]) {
+ maxidx = idx;
+ should_stop = true;
+ }
+ }
+ }
+ }
+ if (maxidx != -1) {
+ output_backprop[index] = input_backprop[n * p_.in_planes_ * p_.in_rows_ *
+ p_.in_cols_ * p_.depth_ +
+ maxidx];
+ }
+ }
+
+ private:
+ const SYCL3DPoolParams p_;
+
+ const read_accessor input_data_accessor_;
+ const read_accessor output_data_accessor_;
+ const read_accessor input_backprop_accessor_;
+ write_accessor output_backprop_accessor_;
+};
+template <typename T>
+struct LaunchMaxPooling3dGradGradOp<SYCLDevice, T> {
+ static void launch(OpKernelContext* context, const Pool3dParameters& params,
+ const Tensor& tensor_in, const Tensor& tensor_out,
+ const Tensor& out_backprop, Tensor* output) {
+ const SYCLDevice& device = context->eigen_device<SYCLDevice>();
+
+ const int num_threads = output->NumElements();
+
+ auto input_data_buffer =
+ device.get_sycl_buffer(tensor_in.template flat<T>().data());
+ auto output_data_buffer =
+ device.get_sycl_buffer(tensor_out.template flat<T>().data());
+ auto input_backprop_buffer =
+ device.get_sycl_buffer(out_backprop.template flat<T>().data());
+ auto output_backprop_buffer =
+ device.get_sycl_buffer(output->template flat<T>().data());
+
+ device.sycl_queue().submit([&](cl::sycl::handler& cgh) {
+ auto input_data_access =
+ input_data_buffer.template get_access<cl::sycl::access::mode::read>(
+ cgh);
+ auto output_data_access =
+ output_data_buffer.template get_access<cl::sycl::access::mode::read>(
+ cgh);
+ auto input_backprop_access =
+ input_backprop_buffer
+ .template get_access<cl::sycl::access::mode::read>(cgh);
+ auto output_backprop_access =
+ output_backprop_buffer
+ .template get_access<cl::sycl::access::mode::write>(cgh);
+ MaxPool3DGradGradSYCL<T> functor(
+ params, input_data_access, output_data_access, input_backprop_access,
+ output_backprop_access);
+
+ cgh.parallel_for(cl::sycl::range<1>(num_threads), functor);
+ });
+ }
+};
+// AvgPool3D SYCL kernel. Expects the number of threads to be equal to the
+// number of elements in the output tensor.
+//
+// For each output value find the corresponding input window, and run through
+// the window accumulating the values to form an average. We divide each value
+// before accumulating to prevent the accumulator from becoming significantly
+// bigger than the values we are adding, which reduces rounding error.
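+//
+// For example, with a half-precision accumulator and an 8-element window of
+// values near 1.0, summing first would grow the accumulator to roughly 8.0,
+// where the spacing between representable half values is 8x coarser than it
+// is near 1.0; dividing each term by the count first keeps the accumulator
+// near 1.0 throughout.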
+template <typename T>
+class AvgPool3DSYCL {
+ using write_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
+ cl::sycl::access::target::global_buffer>;
+ using read_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
+ cl::sycl::access::target::global_buffer>;
+
+ public:
+ AvgPool3DSYCL(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols, const int out_planes,
+ const int out_rows, const int out_cols,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ const read_accessor input_accessor,
+ write_accessor output_accessor)
+ : p_(depth, batch, in_planes, in_rows, in_cols, out_planes, out_rows,
+ out_cols, window, stride, padding),
+ input_accessor_(input_accessor),
+ output_accessor_(output_accessor) {}
+ void operator()(cl::sycl::item<1> item) {
+ T* input_data = ConvertToActualTypeSycl(T, input_accessor_);
+ T* output_data = ConvertToActualTypeSycl(T, output_accessor_);
+
+ int index = item.get_linear_id();
+ int n = index;
+ int d = n % p_.depth_;
+ n /= p_.depth_;
+ int cstart = (n % p_.out_cols_) * p_.stride_cols_ - p_.pad_cols_;
+ int cend = std::min(cstart + p_.window_cols_, p_.in_cols_);
+ cstart = std::max(cstart, 0);
+ n /= p_.out_cols_;
+ int rstart = (n % p_.out_rows_) * p_.stride_rows_ - p_.pad_rows_;
+ int rend = std::min(rstart + p_.window_rows_, p_.in_rows_);
+ rstart = std::max(rstart, 0);
+ n /= p_.out_rows_;
+ int pstart = (n % p_.out_planes_) * p_.stride_planes_ - p_.pad_planes_;
+ int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
+ pstart = std::max(pstart, 0);
+ n /= p_.out_planes_;
+ T accum = T(0);
+ T count =
+ static_cast<T>((pend - pstart) * (rend - rstart) * (cend - cstart));
+ const T* input_data_n =
+ input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
+ for (int p = pstart; p < pend; ++p) {
+ for (int r = rstart; r < rend; ++r) {
+ for (int c = cstart; c < cend; ++c) {
+ int idx = ((p * p_.in_rows_ + r) * p_.in_cols_ + c) * p_.depth_ + d;
+ accum += input_data_n[idx] / count;
+ }
+ }
+ }
+ output_data[index] = accum;
+ }
+
+ private:
+ const SYCL3DPoolParams p_;
+ const read_accessor input_accessor_;
+ write_accessor output_accessor_;
+};
+template <typename T>
+struct LaunchPoolingOp<SYCLDevice, T, AVG> {
+ static void launch(OpKernelContext* context, const Tensor& tensor_in,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ TensorFormat data_format, Padding padding_type,
+ Tensor* output) {
+ const SYCLDevice& device = context->eigen_device<SYCLDevice>();
+ const int out_planes = GetTensorDim(*output, data_format, '0');
+ const int out_rows = GetTensorDim(*output, data_format, '1');
+ const int out_cols = GetTensorDim(*output, data_format, '2');
+ const int batch = GetTensorDim(tensor_in, data_format, 'N');
+ const int in_planes = GetTensorDim(tensor_in, data_format, '0');
+ const int in_rows = GetTensorDim(tensor_in, data_format, '1');
+ const int in_cols = GetTensorDim(tensor_in, data_format, '2');
+ const int depth = GetTensorDim(tensor_in, data_format, 'C');
+
+ const int num_threads = output->NumElements();
+
+ auto input_buffer =
+ device.get_sycl_buffer(tensor_in.template flat<T>().data());
+ auto output_buffer =
+ device.get_sycl_buffer(output->template flat<T>().data());
+
+ device.sycl_queue().submit([&](cl::sycl::handler& cgh) {
+ auto input_access =
+ input_buffer.template get_access<cl::sycl::access::mode::read>(cgh);
+ auto output_access =
+ output_buffer.template get_access<cl::sycl::access::mode::write>(cgh);
+ AvgPool3DSYCL<T> avg_pool(depth, batch, in_planes, in_rows, in_cols,
+ out_planes, out_rows, out_cols, window, stride,
+ padding, input_access, output_access);
+
+ cgh.parallel_for(cl::sycl::range<1>(num_threads), avg_pool);
+ });
+ }
+};
+// AvgPool3DGrad SYCL kernel. Expects the number of threads to be equal to the
+// number of elements in the output backprop tensor, i.e. the number of
+// elements in the input tensor.
+//
+// For each output backprop index, find the window in the input backprop
+// tensor corresponding to all the output values that were affected by the
+// input value at this index. Then, for each gradient in this window, compute
+// the size of the input window which was averaged to give this output, and use
+// this size to scale the gradient accordingly. Add this scaled gradient to the
+// output backprop value.
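+//
+// For illustration (hypothetical sizes): with window_cols_ = 3,
+// stride_cols_ = 2, no column padding and out_cols_ = 3, input column c = 4
+// is covered by output columns [poolcstart, poolcend) =
+// [(4 - 3) / 2 + 1, min(4 / 2 + 1, 3)) = [1, 3): output column 1 pools input
+// columns 2..4 and output column 2 pools input columns 4..6.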
+template <typename T>
+class AvgPool3DGradSYCL {
+ using write_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
+ cl::sycl::access::target::global_buffer>;
+ using read_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
+ cl::sycl::access::target::global_buffer>;
+
+ public:
+ AvgPool3DGradSYCL(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols,
+ const std::array<int64, 3>& out_shape,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ const read_accessor input_backprop_accessor,
+ write_accessor output_backprop_accessor)
+ : p_(depth, batch, in_planes, in_rows, in_cols, out_shape, window, stride,
+ padding),
+ input_backprop_accessor_(input_backprop_accessor),
+ output_backprop_accessor_(output_backprop_accessor) {}
+ void operator()(cl::sycl::item<1> item) {
+ T* input_backprop = ConvertToActualTypeSycl(T, input_backprop_accessor_);
+ T* output_backprop = ConvertToActualTypeSycl(T, output_backprop_accessor_);
+
+ const int index = item.get_linear_id();
+ int n = index;
+ const int d = n % p_.depth_;
+ n /= p_.depth_;
+ const int c = (n % p_.in_cols_) + p_.pad_cols_;
+ const int poolcstart =
+ (c < p_.window_cols_) ? 0 : (c - p_.window_cols_) / p_.stride_cols_ + 1;
+ const int poolcend = std::min(c / p_.stride_cols_ + 1, p_.out_cols_);
+ n /= p_.in_cols_;
+ const int r = (n % p_.in_rows_) + p_.pad_rows_;
+ const int poolrstart =
+ (r < p_.window_rows_) ? 0 : (r - p_.window_rows_) / p_.stride_rows_ + 1;
+ const int poolrend = std::min(r / p_.stride_rows_ + 1, p_.out_rows_);
+ n /= p_.in_rows_;
+ const int p = (n % p_.in_planes_) + p_.pad_planes_;
+ const int poolpstart =
+ (p < p_.window_planes_)
+ ? 0
+ : (p - p_.window_planes_) / p_.stride_planes_ + 1;
+ const int poolpend = std::min(p / p_.stride_planes_ + 1, p_.out_planes_);
+ n /= p_.in_planes_;
+
+ T gradient = T(0);
+ const T* input_backprop_n =
+ input_backprop +
+ n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+ for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
+ int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
+ const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
+ pstart = std::max(pstart, 0);
+ const int plane_window_size = pend - pstart;
+ for (int poolr = poolrstart; poolr < poolrend; ++poolr) {
+ int rstart = poolr * p_.stride_rows_ - p_.pad_rows_;
+ const int rend = std::min(rstart + p_.window_rows_, p_.in_rows_);
+ rstart = std::max(rstart, 0);
+ const int row_window_size = rend - rstart;
+ for (int poolc = poolcstart; poolc < poolcend; ++poolc) {
+ const int idx =
+ ((poolp * p_.out_rows_ + poolr) * p_.out_cols_ + poolc) *
+ p_.depth_ +
+ d;
+ int cstart = poolc * p_.stride_cols_ - p_.pad_cols_;
+ const int cend = std::min(cstart + p_.window_cols_, p_.in_cols_);
+ cstart = std::max(cstart, 0);
+ const int col_window_size = cend - cstart;
+ const int window_size =
+ plane_window_size * row_window_size * col_window_size;
+ gradient += input_backprop_n[idx] / static_cast<T>(window_size);
+ }
+ }
+ }
+ output_backprop[index] = gradient;
+ }
+
+ private:
+ const SYCL3DPoolParams p_;
+ const read_accessor input_backprop_accessor_;
+ write_accessor output_backprop_accessor_;
+};
+template <typename T>
+struct LaunchAvgPooling3dGradOp<SYCLDevice, T> {
+ static void launch(OpKernelContext* context,
+ const TensorShape& tensor_in_shape,
+ const Tensor& out_backprop,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& output_shape,
+ const std::array<int64, 3>& padding,
+ TensorFormat data_format, Tensor* output) {
+ const SYCLDevice& device = context->eigen_device<SYCLDevice>();
+ const int batch = GetTensorDim(tensor_in_shape, data_format, 'N');
+ const int in_planes = GetTensorDim(tensor_in_shape, data_format, '0');
+ const int in_rows = GetTensorDim(tensor_in_shape, data_format, '1');
+ const int in_cols = GetTensorDim(tensor_in_shape, data_format, '2');
+ const int depth = GetTensorDim(tensor_in_shape, data_format, 'C');
+
+ const int num_threads = output->NumElements();
+
+ auto input_backprop_buffer =
+ device.get_sycl_buffer(out_backprop.template flat<T>().data());
+ auto output_backprop_buffer =
+ device.get_sycl_buffer(output->template flat<T>().data());
+
+ device.sycl_queue().submit([&](cl::sycl::handler& cgh) {
+ auto input_backprop_access =
+ input_backprop_buffer
+ .template get_access<cl::sycl::access::mode::read>(cgh);
+ auto output_backprop_access =
+ output_backprop_buffer
+ .template get_access<cl::sycl::access::mode::write>(cgh);
+ AvgPool3DGradSYCL<T> functor(
+ depth, batch, in_planes, in_rows, in_cols, output_shape, window,
+ stride, padding, input_backprop_access, output_backprop_access);
+
+ cgh.parallel_for(cl::sycl::range<1>(num_threads), functor);
+ });
+ }
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_SYCL_H_
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 5b71f046c3..1bfa37f5a7 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -710,9 +710,9 @@ bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
height of the underlying image.
-For example, if an image is 100 x 200 pixels and the bounding box is
-`[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the
-bounding box will be `(10, 40)` to `(50, 180)`.
+For example, if an image is 100 x 200 pixels (height x width) and the bounding
+box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
Parts of the bounding box may fall outside the image.
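As an editorial cross-check of the corrected example above (this sketch is not part of the patch), converting the normalized `[y_min, x_min, y_max, x_max]` box to pixel corners:

```c++
#include <cstdio>

int main() {
  // Normalized box from the docstring: [y_min, x_min, y_max, x_max].
  const float box[4] = {0.1f, 0.2f, 0.5f, 0.9f};
  const int height = 100, width = 200;  // 100 x 200 pixels (height x width)
  const int x_min = static_cast<int>(box[1] * width);   // 0.2 * 200 = 40
  const int y_min = static_cast<int>(box[0] * height);  // 0.1 * 100 = 10
  const int x_max = static_cast<int>(box[3] * width);   // 0.9 * 200 = 180
  const int y_max = static_cast<int>(box[2] * height);  // 0.5 * 100 = 50
  // Prints: upper-left (40, 10), bottom-right (180, 50)
  std::printf("upper-left (%d, %d), bottom-right (%d, %d)\n", x_min, y_min,
              x_max, y_max);
  return 0;
}
```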
diff --git a/tensorflow/core/platform/default/gpu_tracer.cc b/tensorflow/core/platform/default/gpu_tracer.cc
index 86ab70afdd..50c27b3cf6 100644
--- a/tensorflow/core/platform/default/gpu_tracer.cc
+++ b/tensorflow/core/platform/default/gpu_tracer.cc
@@ -205,7 +205,7 @@ Status CUPTIManager::DisableTrace() {
CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY2));
CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
- CUPTI_CALL(ActivityFlushAll(0));
+ CUPTI_CALL(ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
{
// Don't acquire this lock until Flush returns, since Flush
// will potentially cause callbacks into BufferCompleted.
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 69198019cd..70b91c37e4 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -25,7 +25,7 @@ bool CountAsAcceleratorTime(const string& device) {
}
bool CountAsCPUTime(const string& device) {
- return RE2::FullMatch(device, ".*/(gpu|cpu):\\d+");
+ return RE2::FullMatch(device, ".*/(gpu|cpu|device:sycl):\\d+");
}
bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
@@ -145,7 +145,7 @@ void TFGraphNode::AddStepStat(int64 step, const string& device,
// See run_metadata_test.py
// It can be /job:0/replica:0/xxxx/gpu:0, or simply /gpu:0.
// It can have some ad-hoc suffix, such as /stream:xx or /memcpy:xx.
- if (IsCanonicalDevice(device)) {
+ if (IsCanonicalDevice(dev)) {
if (!canonical_device_.empty()) {
if (canonical_device_ != dev) {
fprintf(stderr, "Unexpected: graph node changed device: %s->%s.\n",
@@ -155,7 +155,11 @@ void TFGraphNode::AddStepStat(int64 step, const string& device,
} else {
canonical_device_ = dev;
// TODO(xpan): Support things other than gpu?
- host_device_ = StringReplace(dev, "gpu:\\d+", "cpu:0");
+ if (dev.find("sycl") != dev.npos) {
+ host_device_ = StringReplace(dev, "device:sycl:\\d+", "cpu:0");
+ } else {
+ host_device_ = StringReplace(dev, "gpu:\\d+", "cpu:0");
+ }
AddOpType(canonical_device_);
}
}
@@ -229,7 +233,8 @@ TensorShapeProto VecToShapeProto(const std::vector<int64> shape_vec) {
}
bool IsPlacedOnAccelerator(const string& device) {
- return device.find("gpu") != device.npos;
+ return device.find("gpu") != device.npos ||
+ device.find("sycl") != device.npos;
}
} // namespace tfprof
} // namespace tensorflow
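For reference, a minimal standalone sketch of the host-device mapping the hunks above implement (editorial; it uses std::regex in place of the patch's StringReplace helper):

```c++
#include <iostream>
#include <regex>
#include <string>

// Map an accelerator device string onto its host CPU device, mirroring the
// sycl/gpu branches added above.
std::string HostDevice(const std::string& dev) {
  if (dev.find("sycl") != std::string::npos) {
    return std::regex_replace(dev, std::regex("device:sycl:\\d+"), "cpu:0");
  }
  return std::regex_replace(dev, std::regex("gpu:\\d+"), "cpu:0");
}

int main() {
  std::cout << HostDevice("/job:0/replica:0/device:sycl:0") << "\n";  // /job:0/replica:0/cpu:0
  std::cout << HostDevice("/job:0/replica:0/gpu:1") << "\n";          // /job:0/replica:0/cpu:0
  return 0;
}
```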
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 4b1d1b6e34..a51a607031 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -178,9 +178,7 @@ suggested implementation is to:
file, but the specialization for the GPUDevice is defined in a .cu.cc file,
since it will be compiled with the CUDA compiler.
-<!--zippy-->
-
-Expand this to see the example implementation.
+Here is an example implementation.
```c++
// example.h
@@ -307,8 +305,6 @@ template struct ExampleFunctor<GPUDevice, int32>;
#endif // GOOGLE_CUDA
```
-<!--endzippy-->
-
## Build the op library
### Compile the op using your system compiler (TensorFlow binary installation)
@@ -763,7 +759,7 @@ Your op registration now specifies that the input's type must be `float`, or
> """
> ```
-<pre><pre class="prettyprint"><code class="lang-cpp">
+<pre class="prettyprint"><code class="lang-cpp">
\#include "tensorflow/core/framework/op_kernel.h"<br/>
class ZeroOut<b>Int32</b>Op : public OpKernel {
// as before
@@ -803,7 +799,7 @@ REGISTER\_KERNEL\_BUILDER(
.Device(DEVICE\_CPU)
.TypeConstraint&lt;float&gt;("T"),
ZeroOutFloatOp);
-</b></code></pre></pre>
+</b></code></pre>
> To preserve [backwards compatibility](#backwards-compatibility), you should
> specify a [default value](#default-values-constraints) when adding an attr to
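As an editorial aside on the registration pattern shown above (assuming a templated `ZeroOutOp<T>` as in the doc's polymorphism discussion; this sketch is not part of the patch), the per-type registrations can be stamped out with a macro instead of one hand-written class per type:

```c++
// Register the same templated kernel for several types; each expansion is
// equivalent to one hand-written REGISTER_KERNEL_BUILDER call.
#define REGISTER_ZERO_OUT(T)                                      \
  REGISTER_KERNEL_BUILDER(                                        \
      Name("ZeroOut").Device(DEVICE_CPU).TypeConstraint<T>("T"),  \
      ZeroOutOp<T>)

REGISTER_ZERO_OUT(float);
REGISTER_ZERO_OUT(int32);
#undef REGISTER_ZERO_OUT
```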
diff --git a/tensorflow/examples/image_retraining/label_image.py b/tensorflow/examples/image_retraining/label_image.py
index ecfa672462..de2713fc10 100644
--- a/tensorflow/examples/image_retraining/label_image.py
+++ b/tensorflow/examples/image_retraining/label_image.py
@@ -99,7 +99,7 @@ def run_graph(image_data, labels, input_layer_name, output_layer_name,
num_top_predictions):
with tf.Session() as sess:
# Feed the image_data as input to the graph.
- # predictions will contain a two-dimensional array, where one
+ # predictions will contain a two-dimensional array, where one
# dimension represents the input image count, and the other has
# predictions per class
softmax_tensor = sess.graph.get_tensor_by_name(output_layer_name)
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index 2e2e578050..3549891461 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -293,7 +293,7 @@ def run_bottleneck_on_image(sess, image_data, image_data_tensor,
sess: Current active TensorFlow Session.
image_data: String of raw JPEG data.
image_data_tensor: Input data layer in the graph.
- decoded_image_tensor: Output of initial image resizing and preprocessing.
+ decoded_image_tensor: Output of initial image resizing and preprocessing.
resized_input_tensor: The input node of the recognition graph.
bottleneck_tensor: Layer before the final softmax.
@@ -391,9 +391,9 @@ def get_or_create_bottleneck(sess, image_lists, label_name, index, image_dir,
label_name: Label string we want to get an image for.
index: Integer offset of the image we want. This will be modulo-ed by the
available number of images for the label, so it can be arbitrarily large.
- image_dir: Root folder string of the subfolders containing the training
+ image_dir: Root folder string of the subfolders containing the training
images.
- category: Name string of which set to pull images from - training, testing,
+ category: Name string of which set to pull images from - training, testing,
or validation.
bottleneck_dir: Folder string holding cached files of bottleneck values.
jpeg_data_tensor: The tensor to feed loaded jpeg data into.
@@ -969,7 +969,7 @@ def main(_):
# See https://github.com/tensorflow/tensorflow/issues/3047
tf.logging.set_verbosity(tf.logging.INFO)
- # Prepare necessary directories that can be used during training
+ # Prepare necessary directories that can be used during training
prepare_file_system()
# Gather information about the model architecture we'll be using.
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
index 21d98e9ea2..26e6e086b3 100644
--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@@ -110,9 +110,9 @@ def main(unused_argv):
# Prepare training and testing data
dbpedia = tf.contrib.learn.datasets.load_dataset(
'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
- x_train = pandas.DataFrame(dbpedia.train.data)[1]
+ x_train = pandas.Series(dbpedia.train.data[:,1])
y_train = pandas.Series(dbpedia.train.target)
- x_test = pandas.DataFrame(dbpedia.test.data)[1]
+ x_test = pandas.Series(dbpedia.test.data[:,1])
y_test = pandas.Series(dbpedia.test.target)
# Process vocabulary
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 0cec75cf99..15e7ae18bb 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -341,7 +341,7 @@ class SessionTest(test_util.TensorFlowTestCase):
a = constant_op.constant(a_val)
b = control_flow_ops.no_op() # An op, not a tensor.
c = constant_op.constant(c_val)
- # List of lists, tuples, namedtuple, and dict
+ # List of lists, tuples, namedtuple, and dict
res = sess.run([[a, b, c], (a, b, c), ABC(a=a, b=b, c=c),
{'a': a.name, 'c': c, 'b': b}])
self.assertTrue(isinstance(res, list))
@@ -365,7 +365,7 @@ class SessionTest(test_util.TensorFlowTestCase):
self.assertEqual(a_val, res[3]['a'])
self.assertEqual(b_val, res[3]['b'])
self.assertEqual(c_val, res[3]['c'])
- # Tuple of lists, tuples, namedtuple, and dict
+ # Tuple of lists, tuples, namedtuple, and dict
res = sess.run(([a, b, c], (a.name, b, c), ABC(a=a, b=b, c=c),
{'a': a, 'c': c, 'b': b}))
self.assertTrue(isinstance(res, tuple))
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index a51d8a7774..044a91a7ce 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -720,7 +720,7 @@ class DebugDumpDir(object):
"""Load `DebugTensorDatum` instances from the dump root of a given device.
Populates a map {device_name: a list of `DebugTensorDatum`}, where the list
- is sorted by ascending timestamp.
+ is sorted by ascending timestamp.
This sorting order reflects the order in which the TensorFlow executor
processed the nodes of the graph. It is (one of many possible) topological
diff --git a/tensorflow/python/debug/lib/debug_gradients.py b/tensorflow/python/debug/lib/debug_gradients.py
index 8689a68875..5306391613 100644
--- a/tensorflow/python/debug/lib/debug_gradients.py
+++ b/tensorflow/python/debug/lib/debug_gradients.py
@@ -345,7 +345,7 @@ class GradientsDebugger(object):
def _get_tensor_name(self, tensor):
if isinstance(tensor, (ops.Tensor, variables.Variable)):
return tensor.name
- elif isinstance(tensor, six.string_types):
+ elif isinstance(tensor, six.string_types):
return tensor
else:
raise TypeError(
diff --git a/tensorflow/python/debug/lib/stepper_test.py b/tensorflow/python/debug/lib/stepper_test.py
index 686fb45238..4cf37797f9 100644
--- a/tensorflow/python/debug/lib/stepper_test.py
+++ b/tensorflow/python/debug/lib/stepper_test.py
@@ -401,7 +401,7 @@ class StepperTest(test_util.TensorFlowTestCase):
elif i == 5:
fetches = {"e": "e:0", "fz": {"f": "f:0", "z": "z:0"}}
- with NodeStepper(self.sess, fetches) as stepper:
+ with NodeStepper(self.sess, fetches) as stepper:
sorted_nodes = stepper.sorted_nodes()
self.assertEqual(13, len(sorted_nodes))
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index bc868a493f..a9681df56d 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -301,7 +301,7 @@ def _multi_class_head_with_softmax_cross_entropy_loss(n_classes,
provided and labels are string.
Returns:
- An instance of `_Head` for multi class classification.
+ An instance of `_Head` for multi class classification.
Raises:
ValueError: if `n_classes`, `metric_class_ids` or `label_keys` is invalid.
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index 0480325604..847b27b904 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -286,7 +286,7 @@ def _enqueue_data(data,
Args:
data: a numpy `ndarray`, `OrderedDict` of numpy arrays, or a generator
- yielding `dict`s of numpy arrays or pandas `DataFrame` that will be read
+ yielding `dict`s of numpy arrays or pandas `DataFrame` that will be read
into the queue.
capacity: the capacity of the queue.
shuffle: whether or not to shuffle the rows of the array.
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index 1a023c971f..1a4b0c5fc0 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -58,9 +58,9 @@ class EstimatorSpec(
'export_outputs', 'training_chief_hooks', 'training_hooks',
'scaffold', 'evaluation_hooks'
])):
- """Ops and objects returned from a `model_fn` and passed to `Estimator`.
+ """Ops and objects returned from a `model_fn` and passed to an `Estimator`.
- `EstimatorSpec` fully defines the model to be run by `Estimator`.
+ `EstimatorSpec` fully defines the model to be run by an `Estimator`.
"""
def __new__(cls,
@@ -82,8 +82,8 @@ class EstimatorSpec(
* For `mode == ModeKeys.PREDICT`: required fields are `predictions`.
model_fn can populate all arguments independent of mode. In this case, some
- arguments will be ignored by `Estimator`. E.g. `train_op` will be ignored
- in eval and infer modes. Example:
+ arguments will be ignored by an `Estimator`. E.g. `train_op` will be
+ ignored in eval and infer modes. Example:
```python
def my_model_fn(mode, features, labels):
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 1f2d376b50..34295d8c20 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -190,7 +190,7 @@ class Declare(object):
later during a graph construction.
For example,
- # Declares a function Foo, which takes a tf.int32 named "n" and a
+ # Declares a function Foo, which takes a tf.int32 named "n" and a
# tf.float32 named "x" as inputs and returns a tf.float32 named "z"
# as its output.
foo = Declare("Foo", [("n", tf.int32), ("x", tf.float32)],
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index f9e8158705..5d3ac45020 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -3474,7 +3474,7 @@ class Graph(object):
additional mechanism to add control dependencies.
Args:
- graph: The graph that this controller is managing.
+ graph: The graph that this controller is managing.
control_inputs: List of ops to use as control inputs in addition
to the current control dependencies. None to indicate that
the dependencies should be cleared.
diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
index 2b981e7b19..b6b20d1b4a 100644
--- a/tensorflow/python/ops/distributions/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -125,7 +125,7 @@ class Bernoulli(distribution.Distribution):
event, target_dtype=dtypes.bool)
# TODO(jaana): The current sigmoid_cross_entropy_with_logits has
- # inconsistent behavior for logits = inf/-inf.
+ # inconsistent behavior for logits = inf/-inf.
event = math_ops.cast(event, self.logits.dtype)
logits = self.logits
# sigmoid_cross_entropy_with_logits doesn't broadcast shape,
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index fab4c5cb0f..3b0a357b16 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -296,7 +296,7 @@ def mean(values, weights=None, metrics_collections=None,
returned as `mean` which is an idempotent operation that simply divides
`total` by `count`.
- For estimation of the metric over a stream of data, the function creates an
+ For estimation of the metric over a stream of data, the function creates an
`update_op` operation that updates these variables and returns the `mean`.
`update_op` increments `total` with the reduced sum of the product of `values`
and `weights`, and it increments `count` with the reduced sum of `weights`.
@@ -366,7 +366,7 @@ def accuracy(labels, predictions, weights=None, metrics_collections=None,
matches `labels`. This frequency is ultimately returned as `accuracy`: an
idempotent operation that simply divides `total` by `count`.
- For estimation of the metric over a stream of data, the function creates an
+ For estimation of the metric over a stream of data, the function creates an
`update_op` operation that updates these variables and returns the `accuracy`.
Internally, an `is_correct` operation computes a `Tensor` with elements 1.0
where the corresponding elements of `predictions` and `labels` match and 0.0
@@ -614,7 +614,7 @@ def auc(labels, predictions, weights=None, num_thresholds=200,
"""
with variable_scope.variable_scope(
name, 'auc', (labels, predictions, weights)):
- if curve != 'ROC' and curve != 'PR':
+ if curve != 'ROC' and curve != 'PR':
raise ValueError('curve must be either ROC or PR, %s unknown' %
(curve))
kepsilon = 1e-7 # to account for floating point imprecisions
@@ -1067,7 +1067,7 @@ def mean_tensor(values, weights=None, metrics_collections=None,
`values`. This average is ultimately returned as `mean` which is an idempotent
operation that simply divides `total` by `count`.
- For estimation of the metric over a stream of data, the function creates an
+ For estimation of the metric over a stream of data, the function creates an
`update_op` operation that updates these variables and returns the `mean`.
`update_op` increments `total` with the reduced sum of the product of `values`
and `weights`, and it increments `count` with the reduced sum of `weights`.
@@ -1329,7 +1329,7 @@ def precision(labels, predictions, weights=None,
operation that simply divides `true_positives` by the sum of `true_positives`
and `false_positives`.
- For estimation of the metric over a stream of data, the function creates an
+ For estimation of the metric over a stream of data, the function creates an
`update_op` operation that updates these variables and returns the
`precision`. `update_op` weights each prediction by the corresponding value in
`weights`.
@@ -1522,7 +1522,7 @@ def recall(labels, predictions, weights=None,
ultimately returned as `recall`, an idempotent operation that simply divides
`true_positives` by the sum of `true_positives` and `false_negatives`.
- For estimation of the metric over a stream of data, the function creates an
+ For estimation of the metric over a stream of data, the function creates an
`update_op` that updates these variables and returns the `recall`. `update_op`
weights each prediction by the corresponding value in `weights`.
@@ -2453,7 +2453,7 @@ def _streaming_sparse_average_precision_at_top_k(labels,
Returns:
mean_average_precision: Scalar `float64` `Tensor` with the mean average
precision values.
- update: `Operation` that increments variables appropriately, and whose
+ update: `Operation` that increments variables appropriately, and whose
value matches `metric`.
"""
with ops.name_scope(name, 'average_precision_at_top_k',
@@ -2551,7 +2551,7 @@ def sparse_average_precision_at_k(labels,
Returns:
mean_average_precision: Scalar `float64` `Tensor` with the mean average
precision values.
- update: `Operation` that increments variables appropriately, and whose
+ update: `Operation` that increments variables appropriately, and whose
value matches `metric`.
Raises:
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 0e29ee82cb..245495a620 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1721,9 +1721,9 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
Args:
value: A 4-D `Tensor` of shape `[batch, height, width, channels]` and type
`float32`, `float64`, `qint8`, `quint8`, or `qint32`.
- ksize: A list of ints that has length >= 4.
+ ksize: A 1-D int Tensor of 4 elements.
The size of the window for each dimension of the input tensor.
- strides: A list of ints that has length >= 4.
+ strides: A 1-D int Tensor of 4 elements.
The stride of the sliding window for each dimension of the
input tensor.
padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
@@ -1750,9 +1750,9 @@ def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
Args:
value: A 4-D `Tensor` with shape `[batch, height, width, channels]` and
type `tf.float32`.
- ksize: A list of ints that has length >= 4. The size of the window for
+ ksize: A 1-D int Tensor of 4 elements. The size of the window for
each dimension of the input tensor.
- strides: A list of ints that has length >= 4. The stride of the sliding
+ strides: A 1-D int Tensor of 4 elements. The stride of the sliding
window for each dimension of the input tensor.
padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
See the @{tf.nn.convolution$comment here}
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 3c3c18b1c9..2860618bb7 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -890,7 +890,7 @@ def raw_rnn(cell, loop_fn,
appropriate type and shape `[batch_size] + cell.state_size`.
If `cell.state_size` is a (possibly nested) tuple of ints or
`TensorShape`, this will be a tuple having the corresponding shapes.
- The `emit_output` value may be either `None` or a (possibly nested)
+ The `emit_output` value may be either `None` or a (possibly nested)
tuple structure of tensors, e.g.,
`(tf.zeros(shape_0, dtype=dtype_0), tf.zeros(shape_1, dtype=dtype_1))`.
If this first `emit_output` return value is `None`,
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index db33541218..5a179048b1 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -268,7 +268,7 @@ def sparse_add(a, b, thresh=0):
Then,
* `thresh == 0` (the default): all 5 index/value pairs will be returned.
- * `thresh == 0.11`: only .1 and 0 will vanish, and the remaining three
+ * `thresh == 0.11`: only .1 and 0 will vanish, and the remaining three
index/value pairs will be returned.
* `thresh == 0.21`: .1, 0, and -.2 will vanish.
@@ -1263,7 +1263,7 @@ def sparse_reset_shape(sp_input, new_shape=None):
Returns:
A `SparseTensor` indices and values unchanged from `input_sp`. Its shape is
- `new_shape` if that is set. Otherwise it is the tight bounding box of
+ `new_shape` if that is set. Otherwise it is the tight bounding box of
`input_sp`
Raises:
@@ -1720,7 +1720,7 @@ def sparse_tensor_dense_matmul(sp_a,
def sparse_softmax(sp_input, name=None):
"""Applies softmax to a batched N-D `SparseTensor`.
- The inputs represent an N-D SparseTensor with logical shape `[..., B, C]`
+ The inputs represent an N-D SparseTensor with logical shape `[..., B, C]`
(where `N >= 2`), and with indices sorted in the canonical lexicographic
order.
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index e6dd29077a..7c12020263 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -229,7 +229,7 @@ class Variable(object):
which is the initial value for the Variable. The initial value must have
a shape specified unless `validate_shape` is set to False. Can also be a
callable with no argument that returns the initial value when called.
- (Note that initializer functions from init_ops.py must first be bound
+ (Note that initializer functions from init_ops.py must first be bound
to a shape before being used here.)
trainable: If `True`, the default, also adds the variable to the graph
collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as
diff --git a/tensorflow/python/summary/text_summary.py b/tensorflow/python/summary/text_summary.py
index b97c02666c..0282554a6f 100644
--- a/tensorflow/python/summary/text_summary.py
+++ b/tensorflow/python/summary/text_summary.py
@@ -56,7 +56,7 @@ def text_summary(name, tensor, collections=None):
summary to. Defaults to [_ops.GraphKeys.SUMMARIES]
Returns:
- A TensorSummary op that is configured so that TensorBoard will recognize
+ A TensorSummary op that is configured so that TensorBoard will recognize
that it contains textual data. The TensorSummary is a scalar `Tensor` of
type `string` which contains `Summary` protobufs.
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 9f71395c96..6263351a53 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -375,7 +375,7 @@ class SessionCreator(object):
class ChiefSessionCreator(SessionCreator):
- """Creates a tf.Session for a chief."""
+ """Creates a tf.Session for a chief."""
def __init__(self,
scaffold=None,
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 03591480d4..4f1237f3a2 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -42,7 +42,7 @@ def _get_variable_for(v):
if (isinstance(var, resource_variable_ops.ResourceVariable)
and var.handle.op is v.op):
return var
- raise ValueError("Got %s but could not locate source variable." % (str(v)))
+ raise ValueError("Got %s but could not locate source variable." % (str(v)))
return v
@@ -738,7 +738,7 @@ class Optimizer(object):
val: A `Tensor`. The initial value of the slot.
slot_name: Name for the slot.
op_name: Name to use when scoping the Variable that
- needs to be created for the slot.
+ needs to be created for the slot.
Returns:
A `Variable` object.
@@ -759,7 +759,7 @@ class Optimizer(object):
dtype: Type of the value of the slot.
slot_name: Name for the slot.
op_name: Name to use when scoping the Variable that
- needs to be created for the slot.
+ needs to be created for the slot.
Returns:
A `Variable` object.
@@ -777,7 +777,7 @@ class Optimizer(object):
var: A `Variable` object.
slot_name: Name for the slot.
op_name: Name to use when scoping the Variable that
- needs to be created for the slot.
+ needs to be created for the slot.
Returns:
A `Variable` object.
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index c9612c857b..c8c0d2a047 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -370,9 +370,9 @@ def map_structure(func, *structure, **check_types_dict):
Args:
func: A callable that accepts as many arguments as there are structures.
*structure: scalar, or tuple or list of constructed scalars and/or other
- tuples/lists, or scalars. Note: numpy arrays are considered as scalars.
+ tuples/lists, or scalars. Note: numpy arrays are considered as scalars.
**check_types_dict: only valid keyword argument is `check_types`. If set to
- `True` (default) the types of iterables within the structures have to be
+ `True` (default) the types of iterables within the structures have to be
same (e.g. `map_structure(func, [1], (1,))` raises a `TypeError`
exception). To allow this set this argument to `False`.
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 44fc21df94..3b3a2da6cd 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -62,7 +62,7 @@ pip3 install scipy==0.18.1
pip2 install scikit-learn==0.18.1
pip3 install scikit-learn==0.18.1
-# pandas required by tf.learn/inflow
+# pandas required by `inflow`
pip2 install pandas==0.19.2
pip3 install pandas==0.19.2
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 706d414746..e452c50221 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -80,7 +80,7 @@ pip3.5 install scipy==0.18.1
pip3.5 install scikit-learn==0.18.1
-# pandas required by tf.learn/inflow
+# pandas required by `inflow`
pip3 install pandas==0.19.2
# Install recent-enough version of wheel for Python 3.5 wheel builds
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index ea88d8165f..b320a6222d 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -154,7 +154,7 @@ fi
# Verify that the original Dockerfile exists
ORIG_DOCKERFILE="${SCRIPT_DIR}/${ORIG_DOCKERFILE}"
if [[ ! -f "${ORIG_DOCKERFILE}" ]]; then
- die "ERROR: Cannot find Dockerilfe at: ${ORIG_DOCKERFILE}"
+ die "ERROR: Cannot find Dockerfile at: ${ORIG_DOCKERFILE}"
fi
echo ""
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 4dd3169d41..b85e565f36 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -106,7 +106,7 @@ def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp):
else:
inc_dirs = result.stderr[index1 + 1:index2].strip()
- return [repository_ctx.path(_cxx_inc_convert(p))
+ return [str(repository_ctx.path(_cxx_inc_convert(p)))
for p in inc_dirs.split("\n")]
def get_cxx_inc_directories(repository_ctx, cc):