41 files changed, 921 insertions, 370 deletions
diff --git a/README.md b/README.md
index a88e72880b..b8cb4ba8ba 100644
--- a/README.md
+++ b/README.md
@@ -38,8 +38,8 @@ People who are a little more adventurous can also try our nightly binaries:
 * Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
 * Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
 * Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc1-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc1-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
 * Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
 ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
 
diff --git a/RELEASE.md b/RELEASE.md
index 5b35f91477..64d9069e61 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,6 +1,7 @@
 # Release 1.2.0
 
 ## Major Features and Improvements
+* Python 3.6 support on Windows.
 * Added `tf.layers.conv3d_transpose` layer for spatio temporal deconvolution.
 * Added `tf.Session.make_callable()`, which provides a lower overhead means of running a similar step multiple times.
 * Added ibverbs-based RDMA support to contrib (courtesy @junshi15 from Yahoo).
@@ -48,6 +49,15 @@
   be replaced by calling `embedding_lookup` or `layers.dense` as pre- or post-
   processing of the rnn.  For RNN decoding, this functionality has been replaced
   with an alternative API in `tf.contrib.seq2seq`.
+* Intel MKL Integration (https://software.intel.com/en-us/articles/tensorflow-optimizations-on-modern-intel-architecture). Intel developed a number of
+  optimized deep learning primitives: In addition to matrix multiplication and
+  convolution, these building blocks include:
+  Direct batched convolution
+  Pooling: maximum, minimum, average
+  Normalization: LRN, batch normalization
+  Activation: rectified linear unit (ReLU)
+  Data manipulation: multi-dimensional transposition (conversion), split,
+  concat, sum and scale.
 
 ## Deprecations
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index 2f7f8ebbae..68cd3623c0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -150,7 +150,8 @@ class MapDatasetTest(test.TestCase):
               results.append(sess.run(get_next))
             except errors.OutOfRangeError:
               return
-        threads = [self.checkedThread(target=iterator_thread) for _ in range(8)]
+        threads = [self.checkedThread(target=iterator_thread)
+                   for _ in range(64)]
         for t in threads:
           t.start()
         for t in threads:
diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py
index 905ef13e14..ed2b251b31 100644
--- a/tensorflow/contrib/keras/python/keras/backend.py
+++ b/tensorflow/contrib/keras/python/keras/backend.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
@@ -261,16 +262,9 @@ def get_uid(prefix=''):
     2
   ```
   """
-  layer_name_uids_collection = ops.get_collection('LAYER_NAME_UIDS')
-  if not layer_name_uids_collection:
-    layer_name_uids = {}
-    ops.add_to_collection('LAYER_NAME_UIDS', layer_name_uids)
-  else:
-    layer_name_uids = layer_name_uids_collection[0]
-  if prefix not in layer_name_uids:
-    layer_name_uids[prefix] = 1
-  else:
-    layer_name_uids[prefix] += 1
+  graph = ops.get_default_graph()
+  layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph]
+  layer_name_uids[prefix] += 1
   return layer_name_uids[prefix]
 
 
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
index 3b7f31a3e9..131637f03f 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
@@ -113,17 +113,20 @@ class Conv2DTest(test.TestCase):
         if padding == 'same' and strides != (1, 1):
           continue
 
-        with self.test_session():
-          testing_utils.layer_test(
-              keras.layers.Conv2D,
-              kwargs={
-                  'filters': filters,
-                  'kernel_size': kernel_size,
-                  'padding': padding,
-                  'strides': strides,
-                  'data_format': 'channels_first'
-              },
-              input_shape=(num_samples, stack_size, num_row, num_col))
+        with self.test_session(use_gpu=True):
+          # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+          # TODO(b/62340061): Support channels_first on CPU.
+          if test.is_gpu_available(cuda_only=True):
+            testing_utils.layer_test(
+                keras.layers.Conv2D,
+                kwargs={
+                    'filters': filters,
+                    'kernel_size': kernel_size,
+                    'padding': padding,
+                    'strides': strides,
+                    'data_format': 'channels_first'
+                },
+                input_shape=(num_samples, stack_size, num_row, num_col))
 
   def test_convolution_2d_regularization(self):
     # regularizers
diff --git a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
index 76eaf50138..6808348414 100644
--- a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
@@ -100,15 +100,18 @@ class Pooling2DTest(test.TestCase):
                   'padding': 'valid',
                   'pool_size': (3, 3)},
           input_shape=(3, 5, 6, 4))
-      testing_utils.layer_test(
-          keras.layers.AveragePooling2D,
-          kwargs={
-              'strides': (1, 1),
-              'padding': 'valid',
-              'pool_size': (2, 2),
-              'data_format': 'channels_first'
-          },
-          input_shape=(3, 4, 5, 6))
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      if test.is_gpu_available(cuda_only=True):
+        testing_utils.layer_test(
+            keras.layers.AveragePooling2D,
+            kwargs={
+                'strides': (1, 1),
+                'padding': 'valid',
+                'pool_size': (2, 2),
+                'data_format': 'channels_first'
+            },
+            input_shape=(3, 4, 5, 6))
 
 
 class Pooling3DTest(test.TestCase):
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 03af377149..c7cae56049 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -237,6 +237,7 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/feature_column",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index e1a27335ab..68159fe9b9 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -476,7 +476,7 @@ class _SparseColumn(
     return self._do_transform(input_tensor)
 
   @property
-  def _parse_example_config(self):
+  def _parse_example_spec(self):
     return self.config
 
   @property
@@ -802,7 +802,7 @@ class _WeightedSparseColumn(
         inputs.get(self.sparse_id_column), inputs.get(self.weight_column_name))
 
   @property
-  def _parse_example_config(self):
+  def _parse_example_spec(self):
     return self.config
 
   @property
@@ -960,7 +960,7 @@ class _OneHotColumn(
     return self._to_dnn_input_layer(inputs.get(self.sparse_id_column))
 
   @property
-  def _parse_example_config(self):
+  def _parse_example_spec(self):
     return self.config
 
 
@@ -1110,7 +1110,7 @@ class _EmbeddingColumn(
     return inputs.get(self.sparse_id_column)
 
   @property
-  def _parse_example_config(self):
+  def _parse_example_spec(self):
     return self.config
 
 
@@ -1467,7 +1467,7 @@ class _ScatteredEmbeddingColumn(
     return inputs.get(self.column_name)
 
   @property
-  def _parse_example_config(self):
+  def _parse_example_spec(self):
     return self.config
 
 
@@ -1817,7 +1817,7 @@ class _RealValuedColumn(
         self._normalized_input_tensor(inputs.get(self.name)))
 
   @property
-  def _parse_example_config(self):
+  def _parse_example_spec(self):
     return self.config
 
 
@@ -2081,7 +2081,7 @@ class _BucketizedColumn(
         _LazyBuilderByColumnsToTensor(columns_to_tensors))
 
   @property
-  def _parse_example_config(self):
+  def _parse_example_spec(self):
     return self.config
 
   @property
@@ -2311,7 +2311,7 @@ class _CrossedColumn(
         _LazyBuilderByColumnsToTensor(columns_to_tensors))
 
   @property
-  def _parse_example_config(self):
+  def _parse_example_spec(self):
     return self.config
 
   @property
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index b6a8b6bdda..ce3bc23cf6 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -27,6 +27,7 @@ import numpy as np
 
 from tensorflow.contrib.layers.python.layers import feature_column as fc
 from tensorflow.contrib.layers.python.layers import feature_column_ops
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -697,11 +698,6 @@ class FeatureColumnTest(test.TestCase):
                                                 "str_id_weights_column")
     real_valued_col1 = fc.real_valued_column("real_valued_column1")
     real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
-    real_valued_col3 = fc._real_valued_var_len_column(
-        "real_valued_column3", is_sparse=True)
-    real_valued_col4 = fc._real_valued_var_len_column(
-        "real_valued_column4", dtype=dtypes.int64, default_value=0,
-        is_sparse=False)
     bucketized_col1 = fc.bucketized_column(
         fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
     bucketized_col2 = fc.bucketized_column(
@@ -717,8 +713,8 @@ class FeatureColumnTest(test.TestCase):
     feature_columns = set([
         sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
         int64_sparse_id_col, real_valued_col1, real_valued_col2,
-        real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
-        cross_col, one_hot_col, scattered_embedding_col
+        bucketized_col1, bucketized_col2, cross_col, one_hot_col,
+        scattered_embedding_col
     ])
     expected_config = {
         "sparse_column":
@@ -739,11 +735,6 @@ class FeatureColumnTest(test.TestCase):
         "real_valued_column2":
             parsing_ops.FixedLenFeature(
                 [5], dtype=dtypes.float32),
-        "real_valued_column3":
-            parsing_ops.VarLenFeature(dtype=dtypes.float32),
-        "real_valued_column4":
-            parsing_ops.FixedLenSequenceFeature(
-                [], dtype=dtypes.int64, allow_missing=True, default_value=0),
         "real_valued_column_for_bucketization1":
             parsing_ops.FixedLenFeature(
                 [1], dtype=dtypes.float32),
@@ -763,6 +754,10 @@ class FeatureColumnTest(test.TestCase):
     config = fc.create_feature_spec_for_parsing(feature_columns)
     self.assertDictEqual(expected_config, config)
 
+    # Tests that contrib feature columns work with core library:
+    config_core = fc_core.make_parse_example_spec(feature_columns)
+    self.assertDictEqual(expected_config, config_core)
+
     # Test that the same config is parsed out if we pass a dictionary.
     feature_columns_dict = {
         str(i): val
@@ -771,6 +766,23 @@ class FeatureColumnTest(test.TestCase):
     config = fc.create_feature_spec_for_parsing(feature_columns_dict)
     self.assertDictEqual(expected_config, config)
 
+  def testCreateFeatureSpec_ExperimentalColumns(self):
+    real_valued_col0 = fc._real_valued_var_len_column(
+        "real_valued_column0", is_sparse=True)
+    real_valued_col1 = fc._real_valued_var_len_column(
+        "real_valued_column1", dtype=dtypes.int64, default_value=0,
+        is_sparse=False)
+    feature_columns = set([real_valued_col0, real_valued_col1])
+    expected_config = {
+        "real_valued_column0": parsing_ops.VarLenFeature(dtype=dtypes.float32),
+        "real_valued_column1":
+            parsing_ops.FixedLenSequenceFeature(
+                [], dtype=dtypes.int64, allow_missing=True, default_value=0),
+    }
+
+    config = fc.create_feature_spec_for_parsing(feature_columns)
+    self.assertDictEqual(expected_config, config)
+
   def testCreateFeatureSpec_RealValuedColumnWithDefaultValue(self):
     real_valued_col1 = fc.real_valued_column(
         "real_valued_column1", default_value=2)
diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py
index dc159b93a3..d36d7e16de 100644
--- a/tensorflow/contrib/seq2seq/__init__.py
+++ b/tensorflow/contrib/seq2seq/__init__.py
@@ -47,6 +47,7 @@ _allowed_symbols = [
     "FinalBeamSearchDecoderOutput",
     "gather_tree",
     "GreedyEmbeddingHelper",
+    "SampleEmbeddingHelper",
     "ScheduledEmbeddingTrainingHelper",
     "ScheduledOutputTrainingHelper",
     "TrainingHelper",
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index ea34333360..99e51589c9 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.seq2seq.python.ops import attention_wrapper as wrapper
 from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.contrib.seq2seq.python.ops import basic_decoder
 from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import variables
@@ -64,7 +65,8 @@ class AttentionWrapperTest(test.TestCase):
 
   def assertAllCloseOrEqual(self, x, y, **kwargs):
     if isinstance(x, np.ndarray) or isinstance(x, float):
-      return super(AttentionWrapperTest, self).assertAllClose(x, y, **kwargs)
+      return super(AttentionWrapperTest, self).assertAllClose(
+          x, y, atol=1e-4, **kwargs)
     else:
       self.assertAllEqual(x, y, **kwargs)
 
@@ -84,7 +86,7 @@ class AttentionWrapperTest(test.TestCase):
                          expected_final_alignment_history=None,
                          attention_layer_size=6,
                          name=''):
-    encoder_sequence_length = [3, 2, 3, 1, 0]
+    encoder_sequence_length = [3, 2, 3, 1, 1]
     decoder_sequence_length = [2, 0, 1, 2, 3]
     batch_size = 5
     encoder_max_time = 8
@@ -98,10 +100,14 @@ class AttentionWrapperTest(test.TestCase):
     else:
       attention_depth = encoder_output_depth
 
-    decoder_inputs = np.random.randn(batch_size, decoder_max_time,
-                                     input_depth).astype(np.float32)
-    encoder_outputs = np.random.randn(batch_size, encoder_max_time,
-                                      encoder_output_depth).astype(np.float32)
+    decoder_inputs = array_ops.placeholder_with_default(
+        np.random.randn(batch_size, decoder_max_time,
+                        input_depth).astype(np.float32),
+        shape=(None, None, input_depth))
+    encoder_outputs = array_ops.placeholder_with_default(
+        np.random.randn(batch_size, encoder_max_time,
+                        encoder_output_depth).astype(np.float32),
+        shape=(None, None, encoder_output_depth))
 
     attention_mechanism = create_attention_mechanism(
         num_units=attention_mechanism_depth,
@@ -152,7 +158,7 @@ class AttentionWrapperTest(test.TestCase):
         # Remove the history from final_state for purposes of the
         # remainder of the tests.
         final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
-        self.assertEqual((None, batch_size, encoder_max_time),
+        self.assertEqual((None, batch_size, None),
                          tuple(state_alignment_history.get_shape().as_list()))
       else:
         state_alignment_history = ()
@@ -190,16 +196,17 @@ class AttentionWrapperTest(test.TestCase):
 
     expected_final_output = BasicDecoderOutput(
         rnn_output=ResultSummary(
-            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00083043973),
-        sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052250605),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039763632),
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0040092287),
             h=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019849765)),
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0020015112)),
         attention=ResultSummary(
-            shape=(5, 6), dtype=dtype('float32'), mean=-0.00081052497),
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.0052052638),
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
@@ -221,17 +228,17 @@ class AttentionWrapperTest(test.TestCase):
 
     expected_final_output = BasicDecoderOutput(
         rnn_output=ResultSummary(
-            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00040482997),
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00597103),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.8666666666666667))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039785588),
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0040052128),
             h=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019861322)),
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019996136)),
         attention=ResultSummary(
-            shape=(5, 6), dtype=dtype('float32'), mean=-0.00038488387),
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.00595117),
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
@@ -248,16 +255,17 @@ class AttentionWrapperTest(test.TestCase):
 
     expected_final_output = BasicDecoderOutput(
         rnn_output=ResultSummary(
-            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00084602338),
-        sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4666666666666666))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039764317),
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.004009536),
             h=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019850098)),
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0020016613)),
         attention=ResultSummary(
-            shape=(5, 6), dtype=dtype('float32'), mean=-0.00080144603),
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.0051812846),
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
@@ -276,16 +284,17 @@ class AttentionWrapperTest(test.TestCase):
 
     expected_final_output = BasicDecoderOutput(
         rnn_output=ResultSummary(
-            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00084602338),
-        sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4666666666666666))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039764317),
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.004009536),
             h=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019850098)),
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0020016613)),
         attention=ResultSummary(
-            shape=(5, 6), dtype=dtype('float32'), mean=-0.00080144603),
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.0051812846),
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
@@ -303,17 +312,17 @@ class AttentionWrapperTest(test.TestCase):
 
     expected_final_output = BasicDecoderOutput(
         rnn_output=ResultSummary(
-            shape=(5, 3, 10), dtype=dtype('float32'), mean=0.019546926),
+            shape=(5, 3, 10), dtype=dtype('float32'), mean=0.117389656),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=2.7999999999999998))
+            shape=(5, 3), dtype=dtype('int32'), mean=4.5999999999999996))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0041728448),
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0063607907),
             h=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.002085865)),
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.00323448)),
         attention=ResultSummary(
-            shape=(5, 10), dtype=dtype('float32'), mean=0.019546915),
+            shape=(5, 10), dtype=dtype('float32'), mean=0.117389656,),
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
index 600adea189..cb12bc9450 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
@@ -27,8 +27,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import core as layers_core
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 # pylint: enable=g-import-not-at-top
 
@@ -189,6 +191,76 @@ class BasicDecoderTest(test.TestCase):
       self.assertAllEqual(expected_step_next_inputs,
                           sess_results["step_next_inputs"])
 
+  def testStepWithSampleEmbeddingHelper(self):
+    batch_size = 5
+    vocabulary_size = 7
+    cell_depth = vocabulary_size  # cell's logits must match vocabulary size
+    input_depth = 10
+    np.random.seed(0)
+    start_tokens = np.random.randint(0, vocabulary_size, size=batch_size)
+    end_token = 1
+
+    with self.test_session(use_gpu=True) as sess:
+      with variable_scope.variable_scope(
+          "testStepWithSampleEmbeddingHelper",
+          initializer=init_ops.constant_initializer(0.01)):
+        embeddings = np.random.randn(vocabulary_size,
+                                     input_depth).astype(np.float32)
+        cell = rnn_cell.LSTMCell(vocabulary_size)
+        helper = helper_py.SampleEmbeddingHelper(embeddings, start_tokens,
+                                                 end_token, seed=0)
+        my_decoder = basic_decoder.BasicDecoder(
+            cell=cell,
+            helper=helper,
+            initial_state=cell.zero_state(
+                dtype=dtypes.float32, batch_size=batch_size))
+        output_size = my_decoder.output_size
+        output_dtype = my_decoder.output_dtype
+        self.assertEqual(
+            basic_decoder.BasicDecoderOutput(cell_depth,
+                                             tensor_shape.TensorShape([])),
+            output_size)
+        self.assertEqual(
+            basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+            output_dtype)
+
+        (first_finished, first_inputs, first_state) = my_decoder.initialize()
+        (step_outputs, step_state, step_next_inputs,
+         step_finished) = my_decoder.step(
+             constant_op.constant(0), first_inputs, first_state)
+        batch_size_t = my_decoder.batch_size
+
+        self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+        self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+        self.assertTrue(
+            isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+        self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+        self.assertEqual((batch_size,), step_outputs[1].get_shape())
+        self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+        self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+        self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+        self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+        sess.run(variables.global_variables_initializer())
+        sess_results = sess.run({
+            "batch_size": batch_size_t,
+            "first_finished": first_finished,
+            "first_inputs": first_inputs,
+            "first_state": first_state,
+            "step_outputs": step_outputs,
+            "step_state": step_state,
+            "step_next_inputs": step_next_inputs,
+            "step_finished": step_finished
+        })
+
+        sample_ids = sess_results["step_outputs"].sample_id
+        expected_step_finished = (sample_ids == end_token)
+        expected_step_next_inputs = embeddings[sample_ids]
+        self.assertAllEqual(expected_step_finished,
+                            sess_results["step_finished"])
+        self.assertAllEqual(expected_step_next_inputs,
+                            sess_results["step_next_inputs"])
+
   def testStepWithScheduledEmbeddingTrainingHelper(self):
     sequence_length = [3, 4, 3, 1, 0]
     batch_size = 5
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
index 873a39154f..3d0627467a 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -141,6 +141,7 @@ class TestBeamStep(test.TestCase):
     outputs, next_beam_state = beam_search_decoder._beam_search_step(
         time=2,
         logits=logits,
+        next_cell_state=dummy_cell_state,
         beam_state=beam_state,
         batch_size=ops.convert_to_tensor(self.batch_size),
         beam_width=self.beam_width,
@@ -195,6 +196,7 @@ class TestBeamStep(test.TestCase):
     outputs, next_beam_state = beam_search_decoder._beam_search_step(
         time=2,
         logits=logits,
+        next_cell_state=dummy_cell_state,
         beam_state=beam_state,
         batch_size=ops.convert_to_tensor(self.batch_size),
         beam_width=self.beam_width,
@@ -224,8 +226,8 @@ class TestBeamStep(test.TestCase):
 class BeamSearchDecoderTest(test.TestCase):
 
   def _testDynamicDecodeRNN(self, time_major, has_attention):
-    encoder_sequence_length = [3, 2, 3, 1, 0]
-    decoder_sequence_length = [2, 0, 1, 2, 3]
+    encoder_sequence_length = np.array([3, 2, 3, 1, 1])
+    decoder_sequence_length = np.array([2, 0, 1, 2, 3])
     batch_size = 5
     decoder_max_time = 4
     input_depth = 7
@@ -240,11 +242,15 @@ class BeamSearchDecoderTest(test.TestCase):
     beam_width = 3
 
     with self.test_session() as sess:
+      batch_size_tensor = constant_op.constant(batch_size)
       embedding = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
       cell = rnn_cell.LSTMCell(cell_depth)
+      initial_state = cell.zero_state(batch_size, dtypes.float32)
       if has_attention:
-        inputs = np.random.randn(batch_size, decoder_max_time,
-                                 input_depth).astype(np.float32)
+        inputs = array_ops.placeholder_with_default(
+            np.random.randn(batch_size, decoder_max_time,
+                            input_depth).astype(np.float32),
+            shape=(None, None, input_depth))
         tiled_inputs = beam_search_decoder.tile_batch(
             inputs, multiplier=beam_width)
         tiled_sequence_length = beam_search_decoder.tile_batch(
@@ -253,17 +259,22 @@ class BeamSearchDecoderTest(test.TestCase):
             num_units=attention_depth,
             memory=tiled_inputs,
             memory_sequence_length=tiled_sequence_length)
+        initial_state = beam_search_decoder.tile_batch(
+            initial_state, multiplier=beam_width)
         cell = attention_wrapper.AttentionWrapper(
             cell=cell,
             attention_mechanism=attention_mechanism,
             attention_layer_size=attention_depth,
             alignment_history=False)
       cell_state = cell.zero_state(
-          dtype=dtypes.float32, batch_size=batch_size * beam_width)
+          dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width)
+      if has_attention:
+        cell_state = cell_state.clone(
+            cell_state=initial_state)
       bsd = beam_search_decoder.BeamSearchDecoder(
           cell=cell,
           embedding=embedding,
-          start_tokens=batch_size * [start_token],
+          start_tokens=array_ops.fill([batch_size_tensor], start_token),
           end_token=end_token,
           initial_state=cell_state,
           beam_width=beam_width,
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 64ffb6ca40..637175674b 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -112,6 +112,18 @@ def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined):
   return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), memory)
 
 
+def _maybe_mask_score(score, memory_sequence_length, score_mask_value):
+  if memory_sequence_length is None:
+    return score
+  message = ("All values in memory_sequence_length must greater than zero.")
+  with ops.control_dependencies(
+      [check_ops.assert_positive(memory_sequence_length, message=message)]):
+    score_mask = array_ops.sequence_mask(
+        memory_sequence_length, maxlen=array_ops.shape(score)[1])
+    score_mask_values = score_mask_value * array_ops.ones_like(score)
+    return array_ops.where(score_mask, score, score_mask_values)
+
+
 class _BaseAttentionMechanism(AttentionMechanism):
   """A base AttentionMechanism class providing common functionality.
 
@@ -127,6 +139,7 @@ class _BaseAttentionMechanism(AttentionMechanism):
                memory_sequence_length=None,
                memory_layer=None,
                check_inner_dims_defined=True,
+               score_mask_value=float("-inf"),
                name=None):
     """Construct base AttentionMechanism class.
 
@@ -149,6 +162,9 @@ class _BaseAttentionMechanism(AttentionMechanism):
       check_inner_dims_defined: Python boolean.  If `True`, the `memory`
         argument's shape is checked to ensure all but the two outermost
         dimensions are fully defined.
+      score_mask_value: (optional): The mask value for score before passing into
+        `probability_fn`. The default is -inf. Only used if
+        `memory_sequence_length` is not None.
       name: Name to use when creating ops.
     """
     if (query_layer is not None
@@ -164,7 +180,10 @@ class _BaseAttentionMechanism(AttentionMechanism):
     if not callable(probability_fn):
       raise TypeError("probability_fn must be callable, saw type: %s" %
                       type(probability_fn).__name__)
-    self._probability_fn = probability_fn
+    self._probability_fn = lambda score, prev: (  # pylint:disable=g-long-lambda
+        probability_fn(
+            _maybe_mask_score(score, memory_sequence_length, score_mask_value),
+            prev))
     with ops.name_scope(
         name, "BaseAttentionMechanismInit", nest.flatten(memory)):
       self._values = _prepare_memory(
@@ -245,6 +264,7 @@ class LuongAttention(_BaseAttentionMechanism):
                memory_sequence_length=None,
                scale=False,
                probability_fn=None,
+               score_mask_value=float("-inf"),
                name="LuongAttention"):
     """Construct the AttentionMechanism mechanism.
 
@@ -260,6 +280,9 @@ class LuongAttention(_BaseAttentionMechanism):
         probabilities.  The default is @{tf.nn.softmax}. Other options include
         @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
         Its signature should be: `probabilities = probability_fn(score)`.
+      score_mask_value: (optional): The mask value for score before passing into
+        `probability_fn`. The default is -inf. Only used if
+        `memory_sequence_length` is not None.
       name: Name to use when creating ops.
     """
     # For LuongAttention, we only transform the memory layer; thus
@@ -274,6 +297,7 @@ class LuongAttention(_BaseAttentionMechanism):
         memory=memory,
         probability_fn=wrapped_probability_fn,
         memory_sequence_length=memory_sequence_length,
+        score_mask_value=score_mask_value,
         name=name)
     self._num_units = num_units
     self._scale = scale
@@ -362,6 +386,7 @@ class BahdanauAttention(_BaseAttentionMechanism):
                memory_sequence_length=None,
                normalize=False,
                probability_fn=None,
+               score_mask_value=float("-inf"),
                name="BahdanauAttention"):
     """Construct the Attention mechanism.
 
@@ -377,6 +402,9 @@ class BahdanauAttention(_BaseAttentionMechanism):
         probabilities.  The default is @{tf.nn.softmax}. Other options include
         @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
         Its signature should be: `probabilities = probability_fn(score)`.
+      score_mask_value: (optional): The mask value for score before passing into
+        `probability_fn`. The default is -inf. Only used if
+        `memory_sequence_length` is not None.
       name: Name to use when creating ops.
     """
     if probability_fn is None:
@@ -390,6 +418,7 @@ class BahdanauAttention(_BaseAttentionMechanism):
         memory=memory,
         probability_fn=wrapped_probability_fn,
         memory_sequence_length=memory_sequence_length,
+        score_mask_value=score_mask_value,
         name=name)
     self._num_units = num_units
     self._normalize = normalize
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index c9be517fad..d86275f864 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -72,10 +72,30 @@ class FinalBeamSearchDecoderOutput(
   pass
 
 
+def _tile_batch(t, multiplier):
+  """Core single-tensor implementation of tile_batch."""
+  t = ops.convert_to_tensor(t, name="t")
+  shape_t = array_ops.shape(t)
+  if t.shape.ndims is None or t.shape.ndims < 1:
+    raise ValueError("t must have statically known rank")
+  tiling = [1] * (t.shape.ndims + 1)
+  tiling[1] = multiplier
+  tiled_static_batch_size = (
+      t.shape[0].value * multiplier if t.shape[0].value is not None else None)
+  tiled = array_ops.tile(array_ops.expand_dims(t, 1), tiling)
+  tiled = array_ops.reshape(
+      tiled, array_ops.concat(([shape_t[0] * multiplier], shape_t[1:]), 0))
+  tiled.set_shape(
+      tensor_shape.TensorShape(
+          [tiled_static_batch_size]).concatenate(t.shape[1:]))
+  return tiled
+
+
 def tile_batch(t, multiplier, name=None):
-  """Tile the batch dimension of tensor t.
+  """Tile the batch dimension of a (possibly nested structure of) tensor(s) t.
 
-  This function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of
+  For each tensor t in a (possibly nested structure) of tensors,
+  this function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of
   minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape
   `[batch_size * multiplier, s0, s1, ...]` composed of minibatch entries
   `t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is repeated
@@ -87,27 +107,25 @@ def tile_batch(t, multiplier, name=None):
     name: Name scope for any created operations.
 
   Returns:
-    A `Tensor` shaped `[batch_size * multiplier, ...]`.
+    A (possibly nested structure of) `Tensor` shaped
+    `[batch_size * multiplier, ...]`.
 
   Raises:
-    ValueError: if `t` does not have a statically known rank or it's < 1.
+    ValueError: if tensor(s) `t` do not have a statically known rank or
+    the rank is < 1.
   """
-  with ops.name_scope(name, "tile_batch", [t, multiplier]):
-    t = ops.convert_to_tensor(t, name="t")
-    shape_t = array_ops.shape(t)
-    if t.shape.ndims is None or t.shape.ndims < 1:
-      raise ValueError("t must have statically known rank")
-    tiling = [1] * (t.shape.ndims + 1)
-    tiling[1] = multiplier
-    tiled_static_batch_size = (
-        t.shape[0].value * multiplier if t.shape[0].value is not None else None)
-    tiled = array_ops.tile(array_ops.expand_dims(t, 1), tiling)
-    tiled = array_ops.reshape(
-        tiled, array_ops.concat(([shape_t[0] * multiplier], shape_t[1:]), 0))
-    tiled.set_shape(
-        tensor_shape.TensorShape(
-            [tiled_static_batch_size]).concatenate(t.shape[1:]))
-    return tiled
+  flat_t = nest.flatten(t)
+  with ops.name_scope(name, "tile_batch", flat_t + [multiplier]):
+    return nest.map_structure(lambda t_: _tile_batch(t_, multiplier), t)
+
+
+def _check_maybe(t):
+  if isinstance(t, tensor_array_ops.TensorArray):
+    raise TypeError(
+        "TensorArray state is not supported by BeamSearchDecoder: %s" % t.name)
+  if t.shape.ndims is None:
+    raise ValueError(
+        "Expected tensor (%s) to have known rank, but ndims == None." % t)
 
 
 class BeamSearchDecoder(decoder.Decoder):
@@ -278,7 +296,7 @@ class BeamSearchDecoder(decoder.Decoder):
       A reshaped version of t with dimension [batch_size * beam_width, s].
     """
     if isinstance(s, ops.Tensor):
-      s = tensor_util.constant_value_as_shape(s)
+      s = tensor_shape.as_shape(tensor_util.constant_value(s))
     else:
       s = tensor_shape.TensorShape(s)
     t_shape = array_ops.shape(t)
@@ -312,7 +330,7 @@ class BeamSearchDecoder(decoder.Decoder):
         are known statically).
     """
     if isinstance(s, ops.Tensor):
-      s = tensor_util.constant_value_as_shape(s)
+      s = tensor_shape.TensorShape(tensor_util.constant_value(s))
     else:
       s = tensor_shape.TensorShape(s)
     t_shape = array_ops.shape(t)
@@ -351,13 +369,7 @@ class BeamSearchDecoder(decoder.Decoder):
       TypeError: If t is an instance of TensorArray.
       ValueError: If the rank of t is not statically known.
     """
-    if isinstance(t, tensor_array_ops.TensorArray):
-      raise TypeError(
-          "TensorArray state is not supported by BeamSearchDecoder: %s"
-          % t.name)
-    if t.shape.ndims is None:
-      raise ValueError(
-          "Expected tensor (%s) to have known rank, but ndims == None." % t)
+    _check_maybe(t)
     if t.shape.ndims >= 1:
       return self._split_batch_beams(t, s)
     else:
@@ -380,13 +392,7 @@ class BeamSearchDecoder(decoder.Decoder):
       TypeError: If t is an instance of TensorArray.
       ValueError:  If the rank of t is not statically known.
     """
-    if isinstance(t, tensor_array_ops.TensorArray):
-      raise TypeError(
-          "TensorArray state is not supported by BeamSearchDecoder: %s"
-          % t.name)
-    if t.shape.ndims is None:
-      raise ValueError(
-          "Expected tensor (%s) to have known rank, but ndims == None." % t)
+    _check_maybe(t)
     if t.shape.ndims >= 2:
       return self._merge_batch_beams(t, s)
     else:
@@ -417,7 +423,6 @@ class BeamSearchDecoder(decoder.Decoder):
           self._maybe_merge_batch_beams,
           cell_state, self._cell.state_size)
       cell_outputs, next_cell_state = self._cell(inputs, cell_state)
-
       cell_outputs = nest.map_structure(
           lambda out: self._split_batch_beams(out, out.shape[1:]), cell_outputs)
       next_cell_state = nest.map_structure(
@@ -430,11 +435,13 @@ class BeamSearchDecoder(decoder.Decoder):
       beam_search_output, beam_search_state = _beam_search_step(
           time=time,
           logits=cell_outputs,
+          next_cell_state=next_cell_state,
           beam_state=state,
           batch_size=batch_size,
           beam_width=beam_width,
           end_token=end_token,
           length_penalty_weight=length_penalty_weight)
+
       finished = beam_search_state.finished
       sample_ids = beam_search_output.predicted_ids
       next_inputs = control_flow_ops.cond(
@@ -444,8 +451,8 @@ class BeamSearchDecoder(decoder.Decoder):
     return (beam_search_output, beam_search_state, next_inputs, finished)
 
 
-def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
-                      end_token, length_penalty_weight):
+def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
+                      beam_width, end_token, length_penalty_weight):
   """Performs a single step of Beam Search Decoding.
 
   Args:
@@ -454,6 +461,8 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
       continuations.
     logits: Logits at the current time step. A tensor of shape
       `[batch_size, beam_width, vocab_size]`
+    next_cell_state: The next state from the cell, e.g. an instance of
+      AttentionWrapperState if the cell is attentional.
     beam_state: Current state of the beam search.
       An instance of `BeamSearchDecoderState`.
     batch_size: The batch size for this input.
@@ -520,10 +529,9 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
   next_beam_probs = _tensor_gather_helper(
       gather_indices=word_indices,
       gather_from=total_probs,
-      range_input=batch_size,
+      batch_size=batch_size,
       range_size=beam_width * vocab_size,
-      final_shape=[static_batch_size, beam_width])
-
+      gather_shape=[-1])
   next_word_ids = math_ops.to_int32(word_indices % vocab_size)
   next_beam_ids = math_ops.to_int32(word_indices / vocab_size)
 
@@ -531,9 +539,9 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
   previously_finished = _tensor_gather_helper(
       gather_indices=next_beam_ids,
       gather_from=previously_finished,
-      range_input=batch_size,
+      batch_size=batch_size,
       range_size=beam_width,
-      final_shape=[static_batch_size, beam_width])
+      gather_shape=[-1])
   next_finished = math_ops.logical_or(previously_finished,
                                       math_ops.equal(next_word_ids, end_token))
 
@@ -547,13 +555,28 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
   next_prediction_len = _tensor_gather_helper(
       gather_indices=next_beam_ids,
       gather_from=beam_state.lengths,
-      range_input=batch_size,
+      batch_size=batch_size,
       range_size=beam_width,
-      final_shape=[static_batch_size, beam_width])
+      gather_shape=[-1])
   next_prediction_len += lengths_to_add
 
+  # Pick out the cell_states according to the next_beam_ids. We use a
+  # different gather_shape here because the cell_state tensors, i.e.
+  # the tensors that would be gathered from, all have dimension
+  # greater than two and we need to preserve those dimensions.
+  # pylint: disable=g-long-lambda
+  next_cell_state = nest.map_structure(
+      lambda gather_from: _maybe_tensor_gather_helper(
+          gather_indices=next_beam_ids,
+          gather_from=gather_from,
+          batch_size=batch_size,
+          range_size=beam_width,
+          gather_shape=[batch_size * beam_width, -1]),
+      next_cell_state)
+  # pylint: enable=g-long-lambda
+
   next_state = BeamSearchDecoderState(
-      cell_state=beam_state.cell_state,
+      cell_state=next_cell_state,
       log_probs=next_beam_probs,
       lengths=next_prediction_len,
       finished=next_finished)
@@ -639,12 +662,74 @@ def _mask_probs(probs, eos_token, finished):
   return finished_examples + non_finished_examples
 
 
-def _tensor_gather_helper(gather_indices, gather_from, range_input, range_size,
-                          final_shape):
-  range_ = array_ops.expand_dims(math_ops.range(range_input) * range_size, 1)
+def _maybe_tensor_gather_helper(gather_indices, gather_from, batch_size,
+                                range_size, gather_shape):
+  """Maybe applies _tensor_gather_helper.
+
+  This applies _tensor_gather_helper when the gather_from dims is at least as
+  big as the length of gather_shape. This is used in conjunction with nest so
+  that we don't apply _tensor_gather_helper to inapplicable values like scalars.
+
+  Args:
+    gather_indices: The tensor indices that we use to gather.
+    gather_from: The tensor that we are gathering from.
+    batch_size: The batch size.
+    range_size: The number of values in each range. Likely equal to beam_width.
+    gather_shape: What we should reshape gather_from to in order to preserve the
+      correct values. An example is when gather_from is the attention from an
+      AttentionWrapperState with shape [batch_size, beam_width, attention_size].
+      There, we want to preserve the attention_size elements, so gather_shape is
+      [batch_size * beam_width, -1]. Then, upon reshape, we still have the
+      attention_size as desired.
+
+  Returns:
+    output: Gathered tensor of shape tf.shape(gather_from)[:1+len(gather_shape)]
+      or the original tensor if its dimensions are too small.
+  """
+  _check_maybe(gather_from)
+  if gather_from.shape.ndims >= len(gather_shape):
+    return _tensor_gather_helper(
+        gather_indices=gather_indices,
+        gather_from=gather_from,
+        batch_size=batch_size,
+        range_size=range_size,
+        gather_shape=gather_shape)
+  else:
+    return gather_from
+
+
+def _tensor_gather_helper(gather_indices, gather_from, batch_size,
+                          range_size, gather_shape):
+  """Helper for gathering the right indices from the tensor.
+
+  This works by reshaping gather_from to gather_shape (e.g. [-1]) and then
+  gathering from that according to the gather_indices, which are offset by
+  the right amounts in order to preserve the batch order.
+
+  Args:
+    gather_indices: The tensor indices that we use to gather.
+    gather_from: The tensor that we are gathering from.
+    batch_size: The input batch size.
+    range_size: The number of values in each range. Likely equal to beam_width.
+    gather_shape: What we should reshape gather_from to in order to preserve the
+      correct values. An example is when gather_from is the attention from an
+      AttentionWrapperState with shape [batch_size, beam_width, attention_size].
+      There, we want to preserve the attention_size elements, so gather_shape is
+      [batch_size * beam_width, -1]. Then, upon reshape, we still have the
+      attention_size as desired.
+
+  Returns:
+    output: Gathered tensor of shape tf.shape(gather_from)[:1+len(gather_shape)]
+  """
+  range_ = array_ops.expand_dims(math_ops.range(batch_size) * range_size, 1)
   gather_indices = array_ops.reshape(gather_indices + range_, [-1])
   output = array_ops.gather(
-      array_ops.reshape(gather_from, [-1]), gather_indices)
+      array_ops.reshape(gather_from, gather_shape), gather_indices)
+  final_shape = array_ops.shape(gather_from)[:1 + len(gather_shape)]
+  static_batch_size = tensor_util.constant_value(batch_size)
+  final_static_shape = (tensor_shape.TensorShape([static_batch_size])
+                        .concatenate(
+                            gather_from.shape[1:1 + len(gather_shape)]))
   output = array_ops.reshape(output, final_shape)
-  output.set_shape(final_shape)
+  output.set_shape(final_static_shape)
   return output
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index bdd7d7ca73..bee7547935 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -41,6 +41,7 @@ __all__ = [
     "Helper",
     "TrainingHelper",
     "GreedyEmbeddingHelper",
+    "SampleEmbeddingHelper",
     "CustomHelper",
     "ScheduledEmbeddingTrainingHelper",
     "ScheduledOutputTrainingHelper",
@@ -512,3 +513,42 @@ class GreedyEmbeddingHelper(Helper):
         lambda: self._start_inputs,
         lambda: self._embedding_fn(sample_ids))
     return (finished, next_inputs, state)
+
+
+class SampleEmbeddingHelper(GreedyEmbeddingHelper):
+  """A helper for use during inference.
+
+  Uses sampling (from a distribution) instead of argmax and passes the
+  result through an embedding layer to get the next input.
+  """
+
+  def __init__(self, embedding, start_tokens, end_token, seed=None):
+    """Initializer.
+
+    Args:
+      embedding: A callable that takes a vector tensor of `ids` (argmax ids),
+        or the `params` argument for `embedding_lookup`. The returned tensor
+        will be passed to the decoder input.
+      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
+      end_token: `int32` scalar, the token that marks end of decoding.
+      seed: The sampling seed.
+
+    Raises:
+      ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not a
+        scalar.
+    """
+    super(SampleEmbeddingHelper, self).__init__(
+        embedding, start_tokens, end_token)
+    self._seed = seed
+
+  def sample(self, time, outputs, state, name=None):
+    """sample for SampleEmbeddingHelper."""
+    del time, state  # unused by sample_fn
+    # Outputs are logits, we sample instead of argmax (greedy).
+    if not isinstance(outputs, ops.Tensor):
+      raise TypeError("Expected outputs to be a single Tensor, got: %s" %
+                      type(outputs))
+    sample_id_sampler = categorical.Categorical(logits=outputs)
+    sample_ids = sample_id_sampler.sample(seed=self._seed)
+
+    return sample_ids
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 343d6faf5a..284195f8fc 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1556,8 +1556,6 @@ tf_cuda_library(
         "graph/graph_constructor.cc",
         "graph/graph_def_builder.cc",
         "graph/graph_partition.cc",
-        "graph/mkl_layout_pass.cc",
-        "graph/mkl_tfconversion_pass.cc",
         "graph/node_builder.cc",
         "graph/optimizer_cse.cc",
         "graph/subgraph.cc",
@@ -1619,6 +1617,8 @@ tf_cuda_library(
         "common_runtime/threadpool_device.cc",
         "common_runtime/threadpool_device_factory.cc",
         "graph/gradients.cc",
+        "graph/mkl_layout_pass.cc",
+        "graph/mkl_tfconversion_pass.cc",
         "graph/quantize_training.cc",
         "public/session.h",
         "public/session_options.h",
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index e2ad18f33b..d5dba27f45 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -24,6 +24,8 @@ limitations under the License.
 #include <stdlib.h>
 #include <string.h>
 #include <algorithm>
+#include <map>
+#include <tuple>
 #include <vector>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -174,6 +176,63 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
   TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
 };
 
+// This factory helps to ensure that different GPU device objects that refer to
+// the same physical device and stream group id use the same stream group
+// object (and therefore the same CUDA streams). This is necessary since there
+// is a single memory allocator per device (see ProcessState::GetGPUAllocator)
+// and allocators must not be shared across streams.
+class BaseGPUDevice::StreamGroupFactory {
+ public:
+  // Returns the unique stream group for use with the stream defined by
+  // {gpu_id, stream_group_within_gpu}, creating it if it does not yet exist.
+  // This function is thread safe.
+  BaseGPUDevice::StreamGroup* GetOrCreate(int gpu_id,
+                                          int stream_group_within_gpu,
+                                          gpu::StreamExecutor* executor) {
+    mutex_lock guard(lock_);
+    StreamGroup* group = &streams_[key_type(gpu_id, stream_group_within_gpu)];
+    if (!group->compute) {
+      group->compute = new gpu::Stream(executor);
+      group->compute->Init();
+      VLOG(2) << "Created stream[" << stream_group_within_gpu
+              << "] = " << group->compute;
+
+      group->host_to_device = new gpu::Stream(executor);
+      group->host_to_device->Init();
+      VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu
+              << "] = " << group->host_to_device;
+
+      group->device_to_host = new gpu::Stream(executor);
+      group->device_to_host->Init();
+      VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu
+              << "] = " << group->device_to_host;
+
+      group->device_to_device = new gpu::Stream(executor);
+      group->device_to_device->Init();
+      VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu
+              << "] = " << group->device_to_host;
+    }
+    return group;
+  }
+
+  // Returns a reference to the StreamGroupFactory singleton. Note that this is
+  // never destroyed, so the objects it owns are never deleted.
+  static StreamGroupFactory& Global() {
+    static StreamGroupFactory* instance = new StreamGroupFactory();
+    return *instance;
+  }
+
+ private:
+  mutex lock_;
+  using key_type = std::tuple<int, int>;
+  std::map<key_type, StreamGroup> streams_;
+
+  // StreamGroupFactory cannot be created directly; Call
+  // StreamGroupFactory::Global() to get the global instance.
+  StreamGroupFactory() = default;
+  TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory);
+};
+
 BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                              Bytes memory_limit, const DeviceLocality& locality,
                              int gpu_id, const string& physical_device_desc,
@@ -193,12 +252,6 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
 BaseGPUDevice::~BaseGPUDevice() {
   delete gpu_device_info_;
   for (auto ctx : device_contexts_) ctx->Unref();
-  for (auto& stream_group : streams_) {
-    delete stream_group.compute;
-    delete stream_group.host_to_device;
-    delete stream_group.device_to_host;
-    delete stream_group.device_to_device;
-  }
 }
 
 Status BaseGPUDevice::Init(const SessionOptions& options) {
@@ -217,27 +270,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
 
   // Create the specified number of GPU streams
   for (int i = 0; i < max_streams_; i++) {
-    auto stream = new gpu::Stream(executor_);
-    stream->Init();
-    VLOG(2) << "Created stream[" << i << "] = " << stream;
-
-    auto host_to_device_stream = new gpu::Stream(executor_);
-    host_to_device_stream->Init();
-    VLOG(2) << "Created host_to_device_stream[" << i
-            << "] = " << host_to_device_stream;
-
-    auto device_to_host_stream = new gpu::Stream(executor_);
-    device_to_host_stream->Init();
-    VLOG(2) << "Created device_to_host_stream[" << i
-            << "] = " << device_to_host_stream;
-
-    auto device_to_device_stream = new gpu::Stream(executor_);
-    device_to_device_stream->Init();
-    VLOG(2) << "Created device_to_device_stream[" << i
-            << "] = " << device_to_device_stream;
-
-    streams_.push_back({stream, host_to_device_stream, device_to_host_stream,
-                        device_to_device_stream});
+    streams_.push_back(
+        StreamGroupFactory::Global().GetOrCreate(gpu_id_, i, executor_));
 
     size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
     void* scratch_buffer = gpu_allocator_->AllocateRaw(
@@ -259,12 +293,12 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
           "Failed to memcopy into scratch buffer for device ", gpu_id_);
     }
 
-    device_contexts_.push_back(
-        new GPUDeviceContext(i, stream, host_to_device_stream,
-                             device_to_host_stream, device_to_device_stream));
+    device_contexts_.push_back(new GPUDeviceContext(
+        i, streams_.back()->compute, streams_.back()->host_to_device,
+        streams_.back()->device_to_host, streams_.back()->device_to_device));
   }
   gpu_device_info_ = new GpuDeviceInfo;
-  gpu_device_info_->stream = streams_[0].compute;
+  gpu_device_info_->stream = streams_[0]->compute;
   gpu_device_info_->default_context = device_contexts_[0];
   gpu_device_info_->event_mgr = em_.get();
   gpu_device_info_->gpu_id = gpu_id_;
@@ -511,7 +545,7 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
       static_cast<ConcretePerOpGpuDevice*>(device);
   DCHECK(concrete_device);
   const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
-      streams_[stream_id].compute->implementation()->CudaStreamMemberHack());
+      streams_[stream_id]->compute->implementation()->CudaStreamMemberHack());
   concrete_device->Reinitialize(context, cuda_stream, gpu_id_, allocator,
                                 scratch_[stream_id]);
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 370b3cc4f6..08c58867ee 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -20,7 +20,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
 #define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
 
+#include <memory>
+#include <string>
+#include <unordered_map>
 #include <vector>
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
@@ -96,12 +100,14 @@ class BaseGPUDevice : public LocalDevice {
 
  private:
   struct StreamGroup {
-    gpu::Stream* compute;
-    gpu::Stream* host_to_device;
-    gpu::Stream* device_to_host;
-    gpu::Stream* device_to_device;
+    gpu::Stream* compute = nullptr;
+    gpu::Stream* host_to_device = nullptr;
+    gpu::Stream* device_to_host = nullptr;
+    gpu::Stream* device_to_device = nullptr;
   };
-  gtl::InlinedVector<StreamGroup, 4> streams_;
+  class StreamGroupFactory;
+
+  gtl::InlinedVector<StreamGroup*, 4> streams_;
   gtl::InlinedVector<char*, 4> scratch_;
   std::vector<GPUDeviceContext*> device_contexts_;
   GpuDeviceInfo* gpu_device_info_ = nullptr;
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index b2eaaa3492..70e66280fd 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -5216,6 +5216,7 @@ tf_kernel_library(
     srcs = ["iterator_ops.cc"],
     deps = [
         ":dataset",
+        ":ops_util",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
diff --git a/tensorflow/core/kernels/iterator_ops.cc b/tensorflow/core/kernels/iterator_ops.cc
index 7f1560e9f7..a6d9ddd086 100644
--- a/tensorflow/core/kernels/iterator_ops.cc
+++ b/tensorflow/core/kernels/iterator_ops.cc
@@ -18,7 +18,10 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
 
@@ -282,38 +285,54 @@ class OneShotIteratorOp : public OpKernel {
   IteratorResource* iterator_resource_ = nullptr;
 };
 
-class IteratorGetNextOp : public OpKernel {
+class IteratorGetNextOp : public AsyncOpKernel {
  public:
-  explicit IteratorGetNextOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  // TODO(mrry): Convert this to an async op, because
-  // `iterator->GetNext()` could trigger long-running operations
-  // (e.g. a QueueDequeue or a remote read).
-  void Compute(OpKernelContext* ctx) override {
+  explicit IteratorGetNextOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        thread_pool_(new thread::ThreadPool(
+            ctx->env(), ThreadOptions(),
+            strings::StrCat("iterator_get_next_thread_",
+                            SanitizeThreadSuffix(def().name())),
+            1 /* num_threads */, false /* low_latency_hint */)) {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     IteratorResource* iterator;
     OP_REQUIRES_OK(ctx,
                    LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
-    core::ScopedUnref unref_iterator(iterator);
-
-    std::vector<Tensor> components;
-    bool end_of_sequence;
-
-    IteratorContext::Params params;
-    params.env = ctx->env();
-    params.step_id = ctx->step_id();
-    params.resource_manager = ctx->resource_manager();
-    params.runner = *(ctx->runner());
-    IteratorContext iter_ctx(std::move(params));
 
-    OP_REQUIRES_OK(ctx,
-                   iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
-    OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence"));
+    // The call to `iterator->GetNext()` may block and depend on an
+    // inter-op thread pool thread, so we issue the call from the
+    // owned thread pool.
+    thread_pool_->Schedule([this, ctx, iterator, done]() {
+      core::ScopedUnref unref_iterator(iterator);
+
+      std::vector<Tensor> components;
+      bool end_of_sequence;
+
+      IteratorContext::Params params;
+      params.env = ctx->env();
+      params.step_id = ctx->step_id();
+      params.resource_manager = ctx->resource_manager();
+      params.runner = *(ctx->runner());
+      IteratorContext iter_ctx(std::move(params));
+
+      OP_REQUIRES_OK_ASYNC(
+          ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+          done);
+      OP_REQUIRES_ASYNC(ctx, !end_of_sequence,
+                        errors::OutOfRange("End of sequence"), done);
+
+      for (int i = 0; i < components.size(); ++i) {
+        // TODO(mrry): Check that the shapes match the shape attrs.
+        ctx->set_output(i, components[i]);
+      }
 
-    for (int i = 0; i < components.size(); ++i) {
-      // TODO(mrry): Check that the shapes match the shape attrs.
-      ctx->set_output(i, components[i]);
-    }
+      done();
+    });
   }
+
+ private:
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
 };
 
 class IteratorDisposeOp : public OpKernel {
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index bd7556658a..2e1a62d8ba 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/dynamic_annotations.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
@@ -101,7 +102,7 @@ Status SetupFlowControlInputs(OpKernelContext* ctx, bool set_output) {
 class TensorArrayCreationOp : public OpKernel {
  public:
   explicit TensorArrayCreationOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context), device_type_(context->device_type()) {}
 
   void Compute(OpKernelContext* ctx) override {
     Tensor tensor_array_output_handle;
@@ -133,6 +134,12 @@ class TensorArrayCreationOp : public OpKernel {
       // Create the flow output.
       Tensor* flow;
       OP_REQUIRES_OK(ctx, ctx->allocate_output(1, TensorShape({}), &flow));
+      if (device_type_ == DEVICE_CPU) {
+        // Value doesn't matter, but this makes msan not complaint about
+        // copying an uninitialized value. To do this on GPU would require
+        // a kernel launch or a host->device memcpy, so we avoid that.
+        flow->flat<float>()(0) = 0;
+      }
     }
   }
 
@@ -140,6 +147,9 @@ class TensorArrayCreationOp : public OpKernel {
   virtual Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm,
                                    Tensor* tensor_array_output_handle,
                                    TensorArray** output_tensor_array) = 0;
+
+ private:
+  const DeviceType device_type_;
 };
 
 // A per-run local tensor array. The tensor array uses a "per-step" resource
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index 5c2bda4770..334444a4a2 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -238,6 +238,33 @@ bool IsFullSlice(const TensorSlice& slice_spec,
   }
 }
 
+Status CorruptFileError(const Status& in_status, const string& filename,
+                        const string& detail) {
+  if (in_status.ok()) {
+    return errors::Internal("Unable to read file (", filename,
+                            "). Perhaps the file is corrupt or was produced by "
+                            "a newer version of TensorFlow with format changes "
+                            "(",
+                            detail, ")");
+  }
+  return Status(
+      in_status.code(),
+      strings::StrCat("Unable to read file (", filename,
+                      "). Perhaps the file is corrupt or was produced by a "
+                      "newer version of TensorFlow with format changes (",
+                      detail, "): ", in_status.error_message()));
+}
+
+table::Options TableBuilderOptions() {
+  table::Options o;
+  // Compressed tables cannot be read by TensorFlow releases prior to 1.1.
+  // To smoothen the transition, compressed writes are disabled for now
+  // (version 1.2) with the intention that they will be enabled again at
+  // some point (perhaps the 1.3 release?).
+  o.compression = table::kNoCompression;
+  return o;
+}
+
 }  // namespace
 
 BundleWriter::BundleWriter(Env* env, StringPiece prefix)
@@ -425,7 +452,7 @@ static Status MergeOneBundle(Env* env, StringPiece prefix,
 
   table::Table* table = nullptr;
   TF_RETURN_IF_ERROR(
-      table::Table::Open(table::Options(), file.get(), file_size, &table));
+      table::Table::Open(TableBuilderOptions(), file.get(), file_size, &table));
   std::unique_ptr<table::Table> table_deleter(table);
   std::unique_ptr<table::Iterator> iter(table->NewIterator());
 
@@ -433,11 +460,13 @@ static Status MergeOneBundle(Env* env, StringPiece prefix,
   // Process header.
   {
     iter->Seek(kHeaderEntryKey);
-    CHECK(iter->Valid()) << "File: " << filename
-                         << ", iterator status: " << iter->status();
+    if (!iter->Valid()) {
+      return CorruptFileError(iter->status(), filename,
+                              "failed to seek to header entry");
+    }
     BundleHeaderProto header;
-    TF_CHECK_OK(ParseEntryProto(iter->key(), iter->value(), &header));
-    CHECK_GE(header.num_shards(), 0);
+    Status s = ParseEntryProto(iter->key(), iter->value(), &header);
+    if (!s.ok()) return CorruptFileError(s, filename, "unable to parse header");
 
     merge_state->num_shards += header.num_shards();
     if (!merge_state->seen_first_bundle) {
@@ -536,7 +565,7 @@ Status MergeBundles(Env* env, gtl::ArraySlice<string> prefixes,
   TF_RETURN_IF_ERROR(
       env->NewWritableFile(MetaFilename(merged_prefix), &merged_metadata));
   {
-    table::TableBuilder builder(table::Options(), merged_metadata.get());
+    table::TableBuilder builder(TableBuilderOptions(), merged_metadata.get());
     // Header entry.
     BundleHeaderProto header;
     header.set_num_shards(merge.num_shards);
@@ -584,10 +613,17 @@ BundleReader::BundleReader(Env* env, StringPiece prefix)
 
   // Reads "num_shards_" from the first entry.
   iter_->Seek(kHeaderEntryKey);
-  CHECK(iter_->Valid()) << "File: " << filename
-                        << ", iterator status: " << iter_->status();
+  if (!iter_->Valid()) {
+    status_ = CorruptFileError(iter_->status(), filename,
+                               "failed to seek to header entry");
+    return;
+  }
   BundleHeaderProto header;
-  TF_CHECK_OK(ParseEntryProto(iter_->key(), iter_->value(), &header));
+  status_ = ParseEntryProto(iter_->key(), iter_->value(), &header);
+  if (!status_.ok()) {
+    status_ = CorruptFileError(status_, filename, "unable to parse header");
+    return;
+  }
   num_shards_ = header.num_shards();
   if ((header.endianness() == BundleHeaderProto::BIG && port::kLittleEndian) ||
       (header.endianness() == BundleHeaderProto::LITTLE &&
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index 0438d95bc4..dbbc2de811 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -18,15 +18,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
+import itertools
+import threading
 
 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.ops.gen_array_ops import _broadcast_gradient_args
 from tensorflow.python.platform import test
 
@@ -219,5 +225,50 @@ class BroadcastSimpleTest(test.TestCase):
     self._compareGpu(x, y + 0.1, np.floor_divide, math_ops.floordiv)
 
 
-if __name__ == "__main__":
+class GpuMultiSessionMemoryTest(test_util.TensorFlowTestCase):
+  """Tests concurrent sessions executing on the same GPU."""
+
+  def _run_session(self, session, results):
+    n_iterations = 500
+    with session as s:
+      data = variables.Variable(1.0)
+      with ops.device('/gpu:0'):
+        random_seed.set_random_seed(1)
+        matrix1 = variables.Variable(
+            random_ops.truncated_normal([1024, 1]), name='matrix1')
+        matrix2 = variables.Variable(
+            random_ops.truncated_normal([1, 1024]), name='matrix2')
+        x1 = math_ops.multiply(data, matrix1, name='x1')
+        x3 = math_ops.matmul(x1, math_ops.matmul(matrix2, matrix1))
+        x4 = math_ops.matmul(array_ops.transpose(x3), x3, name='x4')
+        s.run(variables.global_variables_initializer())
+
+        for _ in xrange(n_iterations):
+          value = s.run(x4)
+          results.add(value.flat[0])
+          if len(results) != 1:
+            break
+
+  def testConcurrentSessions(self):
+    n_threads = 4
+    threads = []
+    results = []
+    for _ in xrange(n_threads):
+      session = self.test_session(graph=ops.Graph(), use_gpu=True)
+      results.append(set())
+      args = (session, results[-1])
+      threads.append(threading.Thread(target=self._run_session, args=args))
+
+    for thread in threads:
+      thread.start()
+    for thread in threads:
+      thread.join()
+
+    flat_results = set([x for x in itertools.chain(*results)])
+    self.assertEqual(1,
+                     len(flat_results),
+                     'Expected single value, got %r' % flat_results)
+
+
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 5b0f318efe..9941c97c30 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -53,6 +53,16 @@ def _make_converter(tf_dtype):
 
 class TensorArrayTest(test.TestCase):
 
+  @classmethod
+  def setUpClass(cls):
+    super(TensorArrayTest, cls).setUpClass()
+    cls._workers, _ = test.create_local_cluster(num_workers=3, num_ps=0)
+
+  @classmethod
+  def tearDownClass(cls):
+    super(TensorArrayTest, cls).tearDownClass()
+    session_lib.Session.reset(cls._workers[0].target)
+
   def testTensorArrayWriteRead(self):
     with self.test_session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
@@ -1225,8 +1235,7 @@ class TensorArrayTest(test.TestCase):
       ta = ta.split([1.0, 2.0], [1, 1])
     flows.append(ta.flow)
 
-    workers, _ = test.create_local_cluster(num_workers=3, num_ps=0)
-    session = session_lib.Session(workers[0].target)
+    session = session_lib.Session(self._workers[0].target)
 
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -1250,13 +1259,12 @@ class TensorArrayTest(test.TestCase):
 
     def _body(i, ta_i):
       with ops.device("/job:worker/task:1/cpu:0"):
-        return i + 1, ta_i.write(i, 0.0)
+        return i + 1, ta_i.write(i, constant_op.constant(0.0))
 
     _, ta_out = control_flow_ops.while_loop(
         lambda i, ta: i < 2, _body, loop_vars=[0, ta])
 
-    workers, _ = test.create_local_cluster(num_workers=3, num_ps=0)
-    session = session_lib.Session(workers[0].target)
+    session = session_lib.Session(self._workers[0].target)
 
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -1274,6 +1282,36 @@ class TensorArrayTest(test.TestCase):
         self.assertFalse(
             [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
 
+  def testTensorArrayDisabledColocateWithFirstWriteCall(self):
+    with ops.device("/job:worker/task:0/cpu:0"):
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32, size=2, colocate_with_first_write_call=False)
+
+    def _body(i, ta_i):
+      with ops.device("/job:worker/task:1/cpu:0"):
+        return i + 1, ta_i.write(i, constant_op.constant(0.0))
+
+    _, ta_out = control_flow_ops.while_loop(
+        lambda i, ta: i < 2, _body, loop_vars=[0, ta])
+
+    session = session_lib.Session(self._workers[0].target)
+
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+
+    session.run(ta_out.flow, options=run_options, run_metadata=run_metadata)
+    self.assertTrue(run_metadata.HasField("step_stats"))
+    dev_stats = {d.device: list(d.node_stats)
+                 for d in run_metadata.step_stats.dev_stats}
+    for d in dev_stats:
+      if "/task:0/" in d and "cpu" in d:  # Skip any GPU node stats
+        self.assertTrue(
+            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+      else:
+        self.assertFalse(
+            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+
   def testTensorArrayIdentity(self):
     with self.test_session(use_gpu=True) as session:
       ta0 = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2,
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 8410f12f3e..a37308f702 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -23,9 +23,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import copy
 import functools
 import re
+
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
 import six
@@ -650,10 +652,10 @@ def _to_list(x):
   return [x]
 
 
-def _add_elements_to_collection(elements, collections):
+def _add_elements_to_collection(elements, collection_list):
   elements = _to_list(elements)
-  collections = _to_list(collections)
-  for name in collections:
+  collection_list = _to_list(collection_list)
+  for name in collection_list:
     collection = ops.get_collection_ref(name)
     collection_set = set(collection)
     for element in elements:
@@ -666,6 +668,13 @@ def _object_list_uid(object_list):
   return ', '.join([str(abs(id(x))) for x in object_list])
 
 
+# A global dictionary mapping graph objects to an index of counters used
+# for various layer names in each graph.
+# Allows to give unique autogenerated names to layers, in a graph-specific way.
+PER_GRAPH_LAYER_NAME_UIDS = collections.defaultdict(
+    lambda: collections.defaultdict(int))
+
+
 def _unique_layer_name(name):
   """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
 
@@ -684,14 +693,7 @@ def _unique_layer_name(name):
     dense_2
   ```
   """
-  layer_name_uids_collection = ops.get_collection('LAYER_NAME_UIDS')
-  if not layer_name_uids_collection:
-    layer_name_uids = {}
-    ops.add_to_collection('LAYER_NAME_UIDS', layer_name_uids)
-  else:
-    layer_name_uids = layer_name_uids_collection[0]
-  if name not in layer_name_uids:
-    layer_name_uids[name] = 1
-  else:
-    layer_name_uids[name] += 1
+  graph = ops.get_default_graph()
+  layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS[graph]
+  layer_name_uids[name] += 1
   return name + '_' + str(layer_name_uids[name])
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index b61168695a..49dcd2370c 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -149,39 +149,13 @@ class _Conv(base.Layer):
     self.built = True
 
   def call(self, inputs):
-    if (self.data_format == 'channels_first' and
-        not framework.test_util.gpu_device_name()):
-      # `nn.convolution` is not implemented on CPU for `channels_first` format.
-      # In cases where we are most likely running on CPU using `channels_first`,
-      # we reshape the inputs to use `channels_last` (and reshape them back
-      # afterwards). This is a temporary fix; a better solution would be a fix
-      # at the op level.
-      # TODO(chollet): remove this when `nn.convolution` is feature-complete.
-      data_format = 'channels_last'
-      if self.rank == 1:
-        inputs = array_ops.transpose(inputs, (0, 2, 1))
-      elif self.rank == 2:
-        inputs = array_ops.transpose(inputs, (0, 2, 3, 1))
-      elif self.rank == 3:
-        inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
-    else:
-      data_format = self.data_format
     outputs = nn.convolution(
         input=inputs,
         filter=self.kernel,
         dilation_rate=self.dilation_rate,
         strides=self.strides,
         padding=self.padding.upper(),
-        data_format=utils.convert_data_format(data_format,
-                                              self.rank + 2))
-    if (self.data_format == 'channels_first' and
-        not framework.test_util.gpu_device_name()):
-      if self.rank == 1:
-        outputs = array_ops.transpose(outputs, (0, 2, 1))
-      elif self.rank == 2:
-        outputs = array_ops.transpose(outputs, (0, 3, 1, 2))
-      elif self.rank == 3:
-        outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
+        data_format=utils.convert_data_format(self.data_format, self.rank + 2))
 
     if self.bias is not None:
       if self.data_format == 'channels_first':
@@ -202,18 +176,10 @@ class _Conv(base.Layer):
                                          [outputs_shape[0], outputs_shape[1],
                                           outputs_shape[2] * outputs_shape[3],
                                           outputs_shape[4]])
-          outputs_4d = nn.bias_add(
-              outputs_4d,
-              self.bias,
-              data_format=utils.convert_data_format(self.data_format, 4))
+          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
           outputs = array_ops.reshape(outputs_4d, outputs_shape)
       else:
-        outputs = nn.bias_add(
-            outputs,
-            self.bias,
-            data_format=utils.convert_data_format(self.data_format, 4))
-        # Note that we passed rank=4 because bias_add will only accept
-        # NHWC and NCWH even if the rank of the inputs is 3 or 5.
+        outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
 
     if self.activation is not None:
       return self.activation(outputs)
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 6cd644b642..e903afa0a8 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -262,16 +262,7 @@ class _Pooling2D(base.Layer):
     self.input_spec = base.InputSpec(ndim=4)
 
   def call(self, inputs):
-    if (self.data_format == 'channels_first' and
-        not framework.test_util.gpu_device_name()):
-      # `nn.convolution` is not implemented on CPU for `channels_first` format.
-      # TODO(chollet): remove this when `nn.convolution` is feature-complete.
-      data_format = 'channels_last'
-      inputs = array_ops.transpose(inputs, (0, 2, 3, 1))
-    else:
-      data_format = self.data_format
-
-    if data_format == 'channels_last':
+    if self.data_format == 'channels_last':
       pool_shape = (1,) + self.pool_size + (1,)
       strides = (1,) + self.strides + (1,)
     else:
@@ -282,11 +273,7 @@ class _Pooling2D(base.Layer):
         ksize=pool_shape,
         strides=strides,
         padding=self.padding.upper(),
-        data_format=utils.convert_data_format(data_format, 4))
-
-    if (self.data_format == 'channels_first' and
-        not framework.test_util.gpu_device_name()):
-      outputs = array_ops.transpose(outputs, (0, 3, 1, 2))
+        data_format=utils.convert_data_format(self.data_format, 4))
     return outputs
 
   def _compute_output_shape(self, input_shape):
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 96ace6e79b..c98e74fe6b 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -437,10 +437,14 @@ def _convert_tensorarray_to_flow(tensor_or_tensor_array):
 
 
 def _make_tensor_array(ta, t_or_flow):
+  # pylint: disable=protected-access
   new_ta = tensor_array_ops.TensorArray(
       dtype=ta.dtype, handle=ta.handle, flow=t_or_flow,
-      infer_shape=ta._infer_shape)
-  new_ta._element_shape = ta._element_shape  # pylint: disable=protected-access
+      infer_shape=ta._infer_shape,
+      colocate_with_first_write_call=ta._colocate_with_first_write_call)
+  new_ta._colocate_with = ta._colocate_with
+  new_ta._element_shape = ta._element_shape
+  # pylint: enable=protected-access
   return new_ta
 
 
diff --git a/tensorflow/python/ops/tensor_array_grad.py b/tensorflow/python/ops/tensor_array_grad.py
index 0e7d1880ce..1f70d69548 100644
--- a/tensorflow/python/ops/tensor_array_grad.py
+++ b/tensorflow/python/ops/tensor_array_grad.py
@@ -99,9 +99,9 @@ def _TensorArrayReadGrad(op, grad):
   flow = op.inputs[2]
   dtype = op.get_attr("dtype")
   grad_source = _GetGradSource(grad)
-  g = tensor_array_ops.TensorArray(
-      dtype=dtype, handle=handle, flow=flow).grad(
-          source=grad_source, flow=flow)
+  g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+                                    colocate_with_first_write_call=False)
+       .grad(source=grad_source, flow=flow))
   w_g = g.write(index, grad)
   return [None, None, w_g.flow]
 
@@ -125,9 +125,9 @@ def _TensorArrayWriteGrad(op, flow):
   index = op.inputs[1]
   dtype = op.get_attr("T")
   grad_source = _GetGradSource(flow)
-  g = tensor_array_ops.TensorArray(
-      dtype=dtype, handle=handle, flow=flow).grad(
-          source=grad_source, flow=flow)
+  g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+                                    colocate_with_first_write_call=False)
+       .grad(source=grad_source, flow=flow))
   grad = g.read(index)
   return [None, None, grad, flow]
 
@@ -156,9 +156,9 @@ def _TensorArrayGatherGrad(op, grad):
   flow = op.inputs[2]
   dtype = op.get_attr("dtype")
   grad_source = _GetGradSource(grad)
-  g = tensor_array_ops.TensorArray(
-      dtype=dtype, handle=handle, flow=flow).grad(
-          source=grad_source, flow=flow)
+  g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+                                    colocate_with_first_write_call=False)
+       .grad(source=grad_source, flow=flow))
   u_g = g.scatter(indices, grad)
   return [None, None, u_g.flow]
 
@@ -180,9 +180,9 @@ def _TensorArrayScatterGrad(op, flow):
   indices = op.inputs[1]
   dtype = op.get_attr("T")
   grad_source = _GetGradSource(flow)
-  g = tensor_array_ops.TensorArray(
-      dtype=dtype, handle=handle, flow=flow).grad(
-          source=grad_source, flow=flow)
+  g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+                                    colocate_with_first_write_call=False)
+       .grad(source=grad_source, flow=flow))
   grad = g.gather(indices)
   return [None, None, grad, flow]
 
@@ -211,9 +211,9 @@ def _TensorArrayConcatGrad(op, grad, unused_lengths_grad):
   lengths = op.outputs[1]
   dtype = op.get_attr("dtype")
   grad_source = _GetGradSource(grad)
-  g = tensor_array_ops.TensorArray(
-      dtype=dtype, handle=handle, flow=flow).grad(
-          source=grad_source, flow=flow)
+  g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+                                    colocate_with_first_write_call=False)
+       .grad(source=grad_source, flow=flow))
   u_g = g.split(grad, lengths=lengths)
   # handle, flow_in
   return [None, u_g.flow]
@@ -235,9 +235,9 @@ def _TensorArraySplitGrad(op, flow):
   handle = op.inputs[0]
   dtype = op.get_attr("T")
   grad_source = _GetGradSource(flow)
-  g = tensor_array_ops.TensorArray(
-      dtype=dtype, handle=handle, flow=flow).grad(
-          source=grad_source, flow=flow)
+  g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+                                    colocate_with_first_write_call=False)
+       .grad(source=grad_source, flow=flow))
   grad = g.concat()
   # handle, value, lengths, flow_in
   return [None, grad, None, flow]
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index 8b119f5842..7a6abc8e61 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -54,6 +54,7 @@ class TensorArray(object):
                flow=None,
                infer_shape=True,
                element_shape=None,
+               colocate_with_first_write_call=True,
                name=None):
     """Construct a new TensorArray or wrap an existing TensorArray handle.
 
@@ -85,6 +86,11 @@ class TensorArray(object):
       element_shape: (optional, default: None) A `TensorShape` object specifying
         the shape constraints of each of the elements of the TensorArray.
         Need not be fully defined.
+      colocate_with_first_write_call: If `True`, the TensorArray will be
+        colocated on the same device as the the Tensor used on its first write
+        (write operations include `write`, `unstack`, and `split`).  If `False`,
+        the TensorArray will be placed on the device determined by the
+        device context available during its initialization.
       name: A name for the operation (optional).
 
     Raises:
@@ -120,7 +126,11 @@ class TensorArray(object):
     # Used to keep track of what tensors the TensorArray should be
     # colocated with.  We choose to colocate the TensorArray with the
     # first tensor written to it.
-    self._colocate_with = []
+    self._colocate_with_first_write_call = colocate_with_first_write_call
+    if colocate_with_first_write_call:
+      self._colocate_with = []
+    else:
+      self._colocate_with = None
 
     # Record the current static shape for the array elements. The element
     # shape is defined either by `element_shape` or the shape of the tensor
@@ -142,8 +152,8 @@ class TensorArray(object):
         # Construct the TensorArray with an empty device.  The first
         # write into the TensorArray from a Tensor with a set device
         # will retroactively set the device value of this op.
-        with ops.device(None), ops.colocate_with(None, ignore_existing=True):
-          self._handle, self._flow = gen_data_flow_ops._tensor_array_v3(
+        def create():
+          return gen_data_flow_ops._tensor_array_v3(
               dtype=dtype,
               size=size,
               element_shape=element_shape,
@@ -151,6 +161,11 @@ class TensorArray(object):
               clear_after_read=clear_after_read,
               tensor_array_name=tensor_array_name,
               name=scope)
+        if colocate_with_first_write_call:
+          with ops.device(None), ops.colocate_with(None, ignore_existing=True):
+            self._handle, self._flow = create()
+        else:
+          self._handle, self._flow = create()
 
   @property
   def flow(self):
@@ -200,10 +215,13 @@ class TensorArray(object):
     If no internal colocation group is set, colocate with `value` and set
     the internal colocation group to be value.
     """
-    if not self._colocate_with:
-      self._colocate_with.append(value)
-    with ops.colocate_with(self._colocate_with[0]):
+    if not self._colocate_with_first_write_call:
       yield
+    else:
+      if not self._colocate_with:
+        self._colocate_with.append(value)
+      with ops.colocate_with(self._colocate_with[0]):
+        yield
 
   def identity(self):
     """Returns a TensorArray with the same content and properties.
@@ -214,8 +232,10 @@ class TensorArray(object):
       Use this object all for subsequent operations.
     """
     flow = array_ops.identity(self._flow)
-    ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow,
-                     infer_shape=self._infer_shape)
+    ta = TensorArray(
+        dtype=self._dtype, handle=self._handle, flow=flow,
+        infer_shape=self._infer_shape,
+        colocate_with_first_write_call=self._colocate_with_first_write_call)
     ta._element_shape = self._element_shape
     ta._colocate_with = self._colocate_with
     return ta
@@ -237,7 +257,8 @@ class TensorArray(object):
             dtype=self._dtype,
             handle=g_handle,
             flow=flow,
-            infer_shape=self._infer_shape)
+            infer_shape=self._infer_shape,
+            colocate_with_first_write_call=False)
         g._element_shape = self._element_shape
         return g
 
@@ -286,7 +307,9 @@ class TensorArray(object):
             value=value,
             flow_in=self._flow,
             name=name)
-      ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out)
+      ta = TensorArray(
+          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
       ta._colocate_with = self._colocate_with
@@ -416,7 +439,9 @@ class TensorArray(object):
             value=value,
             flow_in=self._flow,
             name=name)
-      ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out)
+      ta = TensorArray(
+          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
       ta._colocate_with = self._colocate_with
@@ -456,7 +481,9 @@ class TensorArray(object):
             lengths=lengths_64,
             flow_in=self._flow,
             name=name)
-      ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out)
+      ta = TensorArray(
+          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
       ta._colocate_with = self._colocate_with
diff --git a/tensorflow/python/training/queue_runner_impl.py b/tensorflow/python/training/queue_runner_impl.py
index d713e222ae..4e58602a6f 100644
--- a/tensorflow/python/training/queue_runner_impl.py
+++ b/tensorflow/python/training/queue_runner_impl.py
@@ -22,6 +22,7 @@ import threading
 import weakref
 
 from tensorflow.core.protobuf import queue_runner_pb2
+from tensorflow.python.client import session
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
@@ -401,6 +402,10 @@ def start_queue_runners(sess=None, coord=None, daemon=True, start=True,
     collection: A `GraphKey` specifying the graph collection to
       get the queue runners from.  Defaults to `GraphKeys.QUEUE_RUNNERS`.
 
+  Raises:
+    ValueError: if `sess` is None and there isn't any default session.
+    TypeError: if `sess` is not a `tf.Session` object.
+
   Returns:
     A list of threads.
   """
@@ -410,6 +415,15 @@ def start_queue_runners(sess=None, coord=None, daemon=True, start=True,
       raise ValueError("Cannot start queue runners: No default session is "
                        "registered. Use `with sess.as_default()` or pass an "
                        "explicit session to tf.start_queue_runners(sess=sess)")
+
+  if not isinstance(sess, session.SessionInterface):
+    # Following check is due to backward compatibility. (b/62061352)
+    if sess.__class__.__name__ in [
+        "MonitoredSession", "SingularMonitoredSession"]:
+      return []
+    raise TypeError("sess must be a `tf.Session` object. "
+                    "Given class: {}".format(sess.__class__))
+
   with sess.graph.as_default():
     threads = []
     for qr in ops.get_collection(collection):
diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py
index 5b00ac9fc3..51c0eecf46 100644
--- a/tensorflow/python/training/queue_runner_test.py
+++ b/tensorflow/python/training/queue_runner_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import coordinator
+from tensorflow.python.training import monitored_session
 from tensorflow.python.training import queue_runner_impl
 
 
@@ -247,6 +248,33 @@ class QueueRunnerTest(test.TestCase):
       # The variable should be 3.
       self.assertEqual(3, var.eval())
 
+  def testStartQueueRunnersRaisesIfNotASession(self):
+    zero64 = constant_op.constant(0, dtype=dtypes.int64)
+    var = variables.Variable(zero64)
+    count_up_to = var.count_up_to(3)
+    queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
+    init_op = variables.global_variables_initializer()
+    qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
+    queue_runner_impl.add_queue_runner(qr)
+    with self.test_session():
+      init_op.run()
+      with self.assertRaisesRegexp(TypeError, "tf.Session"):
+        queue_runner_impl.start_queue_runners("NotASession")
+
+  def testStartQueueRunnersIgnoresMonitoredSession(self):
+    zero64 = constant_op.constant(0, dtype=dtypes.int64)
+    var = variables.Variable(zero64)
+    count_up_to = var.count_up_to(3)
+    queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
+    init_op = variables.global_variables_initializer()
+    qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
+    queue_runner_impl.add_queue_runner(qr)
+    with self.test_session():
+      init_op.run()
+      threads = queue_runner_impl.start_queue_runners(
+          monitored_session.MonitoredSession())
+      self.assertFalse(threads)
+
   def testStartQueueRunnersNonDefaultGraph(self):
     # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
     graph = ops.Graph()
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index 88df3351e6..05c99856d2 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -17,14 +17,52 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import functools
+import itertools
 import traceback
 import types
 
+import six  # pylint: disable=unused-import
+
+from backports import weakref  # pylint: disable=g-bad-import-order
+
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import tf_decorator
 
 
+class _RefInfoField(
+    collections.namedtuple(
+        '_RefInfoField', ('type_', 'repr_', 'creation_stack', 'object_used'))):
+  pass
+
+
+# Thread-safe up to int32max/2 thanks to python's GIL; and may be safe even for
+# higher values in Python 3.4+.  We don't expect to ever count higher than this.
+# https://mail.python.org/pipermail/python-list/2005-April/342279.html
+_REF_ITER = itertools.count()
+
+# Dictionary mapping id(obj) => _RefInfoField.
+_REF_INFO = {}
+
+
+def _deleted(obj_id, fatal_error):
+  obj = _REF_INFO[obj_id]
+  del _REF_INFO[obj_id]
+  if not obj.object_used:
+    if fatal_error:
+      logger = tf_logging.fatal
+    else:
+      logger = tf_logging.error
+    logger(
+        '==================================\n'
+        'Object was never used (type %s):\n%s\nIf you want to mark it as '
+        'used call its "mark_used()" method.\nIt was originally created '
+        'here:\n%s\n'
+        '==================================' %
+        (obj.type_, obj.repr_, obj.creation_stack))
+
+
 def _add_should_use_warning(x, fatal_error=False):
   """Wraps object x so that if it is never used, a warning is logged.
 
@@ -39,14 +77,14 @@ def _add_should_use_warning(x, fatal_error=False):
   """
   if x is None:  # special corner case where x is None
     return x
-  has_been_used = getattr(x, '_tf_object_has_been_used', None)
-  if has_been_used is not None:
-    x._tf_object_has_been_used = has_been_used  # pylint: disable=protected-access
+  if hasattr(x, '_tf_ref_id'):  # this is already a TFShouldUseWarningWrapper
     return x
 
   def override_method(method):
     def fn(self, *args, **kwargs):
-      self._tf_object_has_been_used = True  # pylint: disable=protected-access
+      # pylint: disable=protected-access
+      _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+          object_used=True)
       return method(self, *args, **kwargs)
     return fn
 
@@ -55,38 +93,36 @@ def _add_should_use_warning(x, fatal_error=False):
 
     def __init__(self, true_self):
       self.__dict__ = true_self.__dict__
-      stack = [x.strip() for x in traceback.format_stack()]
+      stack = [s.strip() for s in traceback.format_stack()]
       # Remove top three stack entries from adding the wrapper
-      self._tf_object_creation_stack = '\n'.join(stack[:-3])
-      self._tf_object_has_been_used = False
+      self.creation_stack = '\n'.join(stack[:-3])
+      self._tf_ref_id = next(_REF_ITER)
+      _REF_INFO[self._tf_ref_id] = _RefInfoField(
+          type_=type(x),
+          repr_=repr(x),
+          creation_stack=stack,
+          object_used=False)
+
+      # Create a finalizer for self, which will be called when self is
+      # garbage collected.  Can't add self as the args because the
+      # loop will break garbage collection.  We keep track of
+      # ourselves via python ids.
+      weakref.finalize(self, _deleted, self._tf_ref_id, fatal_error)
 
     # Not sure why this pylint warning is being used; this is not an
     # old class form.
     # pylint: disable=super-on-old-class
     def __getattribute__(self, name):
-      if name != '_tf_object_has_been_used':
-        self._tf_object_has_been_used = True
+      if name == '_tf_ref_id':
+        return super(TFShouldUseWarningWrapper, self).__getattribute__(name)
+      if self._tf_ref_id in _REF_INFO:
+        _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+            object_used=True)
       return super(TFShouldUseWarningWrapper, self).__getattribute__(name)
 
-    def __del__(self):
-      if not self._tf_object_has_been_used:
-        if fatal_error:
-          logger = tf_logging.fatal
-        else:
-          logger = tf_logging.error
-        logger(
-            '==================================\n'
-            'Object was never used (type %s):\n%s\nIf you want to mark it as '
-            'used call its "mark_used()" method.\nIt was originally created '
-            'here:\n%s\n'
-            '==================================' %
-            (type(x), x, self._tf_object_creation_stack))
-
-      if hasattr(super(TFShouldUseWarningWrapper, self), '__del__'):
-        return super(TFShouldUseWarningWrapper, self).__del__()
-
     def mark_used(self, *args, **kwargs):
-      self._tf_object_has_been_used = True
+      _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+          object_used=True)
       if hasattr(super(TFShouldUseWarningWrapper, self), 'mark_used'):
         return super(TFShouldUseWarningWrapper, self).mark_used(*args, **kwargs)
     # pylint: enable=super-on-old-class
@@ -102,7 +138,8 @@ def _add_should_use_warning(x, fatal_error=False):
 
   wrapped = TFShouldUseWarningWrapper(x)
   wrapped.__doc__ = x.__doc__  # functools.wraps fails on some objects.
-  wrapped._tf_object_has_been_used = False   # pylint: disable=protected-access
+  ref_id = wrapped._tf_ref_id  # pylint: disable=protected-access
+  _REF_INFO[ref_id] = _REF_INFO[ref_id]._replace(object_used=False)
   return wrapped
 
 
diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py
index 71d48e3dde..c826874400 100644
--- a/tensorflow/python/util/tf_should_use_test.py
+++ b/tensorflow/python/util/tf_should_use_test.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import gc
 import sys
 
 from tensorflow.python.framework import constant_op
@@ -45,7 +46,7 @@ def reroute_error(captured):
 class TfShouldUseTest(test.TestCase):
 
   def testAddShouldUseWarningWhenNotUsed(self):
-    c = constant_op.constant(0, name='blah')
+    c = constant_op.constant(0, name='blah0')
     captured = []
     with reroute_error(captured):
       def in_this_function():
@@ -53,44 +54,52 @@ class TfShouldUseTest(test.TestCase):
         del h
       in_this_function()
     self.assertIn('Object was never used', '\n'.join(captured))
-    self.assertIn('blah:0', '\n'.join(captured))
+    self.assertIn('blah0:0', '\n'.join(captured))
     self.assertIn('in_this_function', '\n'.join(captured))
+    gc.collect()
+    self.assertFalse(gc.garbage)
 
-  def _testAddShouldUseWarningWhenUsed(self, fn):
-    c = constant_op.constant(0, name='blah')
+  def _testAddShouldUseWarningWhenUsed(self, fn, name):
+    c = constant_op.constant(0, name=name)
     captured = []
     with reroute_error(captured):
       h = tf_should_use._add_should_use_warning(c)
       fn(h)
       del h
     self.assertNotIn('Object was never used', '\n'.join(captured))
-    self.assertNotIn('blah:0', '\n'.join(captured))
+    self.assertNotIn('%s:0' % name, '\n'.join(captured))
 
   def testAddShouldUseWarningWhenUsedWithAdd(self):
     def add(h):
       _ = h + 1
-    self._testAddShouldUseWarningWhenUsed(add)
+    self._testAddShouldUseWarningWhenUsed(add, name='blah_add')
+    gc.collect()
+    self.assertFalse(gc.garbage)
 
   def testAddShouldUseWarningWhenUsedWithGetName(self):
     def get_name(h):
       _ = h.name
-    self._testAddShouldUseWarningWhenUsed(get_name)
+    self._testAddShouldUseWarningWhenUsed(get_name, name='blah_get_name')
+    gc.collect()
+    self.assertFalse(gc.garbage)
 
   def testShouldUseResult(self):
     @tf_should_use.should_use_result
     def return_const(value):
-      return constant_op.constant(value, name='blah')
+      return constant_op.constant(value, name='blah2')
     captured = []
     with reroute_error(captured):
       return_const(0.0)
     self.assertIn('Object was never used', '\n'.join(captured))
-    self.assertIn('blah:0', '\n'.join(captured))
+    self.assertIn('blah2:0', '\n'.join(captured))
     self.assertIn('return_const', '\n'.join(captured))
+    gc.collect()
+    self.assertFalse(gc.garbage)
 
   def testShouldUseResultWhenNotReallyUsed(self):
     @tf_should_use.should_use_result
     def return_const(value):
-      return constant_op.constant(value, name='blah')
+      return constant_op.constant(value, name='blah3')
     captured = []
     with reroute_error(captured):
       with self.test_session():
@@ -100,8 +109,10 @@ class TfShouldUseTest(test.TestCase):
         v = constant_op.constant(1.0, name='meh')
         v.eval()
     self.assertIn('Object was never used', '\n'.join(captured))
-    self.assertIn('blah:0', '\n'.join(captured))
+    self.assertIn('blah3:0', '\n'.join(captured))
     self.assertIn('return_const', '\n'.join(captured))
+    gc.collect()
+    self.assertFalse(gc.garbage)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt
index a0fad4df52..ed088c41ed 100644
--- a/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt
@@ -16,7 +16,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'size\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'handle\', \'flow\', \'infer_shape\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'dtype\', \'size\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'handle\', \'flow\', \'infer_shape\', \'element_shape\', \'colocate_with_first_write_call\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'True\', \'None\'], "
   }
   member_method {
     name: "close"
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index b8f9fc8453..8768852dc7 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -85,3 +85,6 @@ pip2 install mock
 
 pip2 install portpicker
 pip3 install portpicker
+
+pip2 install backports.weakref==1.0rc1
+pip3 install backports.weakref==1.0rc1
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index e7e2d256cd..edfc4e3a98 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -89,3 +89,6 @@ pip3.5 install wheel==0.29.0
 pip3.5 install portpicker
 
 pip3.5 install werkzeug
+
+pip3.5 install backports.weakref==1.0rc1
+
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
index f124012edc..b4f9cc8476 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
@@ -22,12 +22,14 @@ CALL "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat"
 :: Turn echo back on, above script turns it off.
 ECHO ON
 
-:: Some common variables to be shared between runs.
-SET CMAKE_EXE="C:\Program Files\cmake\bin\cmake.exe"
-SET SWIG_EXE="C:\swigwin-3.0.10\swig.exe"
-SET PY_EXE="C:\Program Files\Anaconda3\python.exe"
-SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib"
-SET CUDNN_HOME="c:\tools\cuda"
+:: Set environment variables to be shared between runs. Do not override if they
+:: are set already.
+
+IF DEFINED CMAKE_EXE (ECHO CMAKE_EXE is set to %CMAKE_EXE%) ELSE (SET CMAKE_EXE="C:\Program Files\cmake\bin\cmake.exe")
+IF DEFINED SWIG_EXE (ECHO SWIG_EXE is set to %SWIG_EXE%) ELSE (SET SWIG_EXE="C:\swigwin-3.0.10\swig.exe")
+IF DEFINED PY_EXE (ECHO PY_EXE is set to %PY_EXE%) ELSE (SET PY_EXE="C:\Program Files\Anaconda3\python.exe")
+IF DEFINED PY_LIB (ECHO PY_LIB is set to %PY_LIB%) ELSE (SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib")
+IF DEFINED CUDNN_HOME (ECHO CUDNN_HOME is set to %CUDNN_HOME%) ELSE (SET CUDNN_HOME="c:\tools\cuda")
 
 SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
 SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
index 9307ebb66b..ba2d939b5f 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -22,7 +22,7 @@ CD %BUILD_DIR%
 SET BUILD_CC_TESTS=OFF
 SET BUILD_PYTHON_TESTS=ON
 
-SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe"
+IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe")
 
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index a85a220270..a1676203c7 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -39,6 +39,7 @@ REQUIRED_PACKAGES = [
     'html5lib == 0.9999999',  # identical to 1.0b8
     'markdown == 2.2.0',
     'bleach == 1.5.0',
+    'backports.weakref == 1.0rc1',
 ]
 
 project_name = 'tensorflow'