aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--README.md4
-rw-r--r--RELEASE.md10
-rw-r--r--tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py3
-rw-r--r--tensorflow/contrib/keras/python/keras/backend.py14
-rw-r--r--tensorflow/contrib/keras/python/keras/layers/convolutional_test.py25
-rw-r--r--tensorflow/contrib/keras/python/keras/layers/pooling_test.py21
-rw-r--r--tensorflow/contrib/layers/BUILD1
-rw-r--r--tensorflow/contrib/layers/python/layers/feature_column.py16
-rw-r--r--tensorflow/contrib/layers/python/layers/feature_column_test.py36
-rw-r--r--tensorflow/contrib/seq2seq/__init__.py1
-rw-r--r--tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py73
-rw-r--r--tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py72
-rw-r--r--tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py23
-rw-r--r--tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py31
-rw-r--r--tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py189
-rw-r--r--tensorflow/contrib/seq2seq/python/ops/helper.py40
-rw-r--r--tensorflow/core/BUILD4
-rw-r--r--tensorflow/core/common_runtime/gpu/gpu_device.cc98
-rw-r--r--tensorflow/core/common_runtime/gpu/gpu_device.h16
-rw-r--r--tensorflow/core/kernels/BUILD1
-rw-r--r--tensorflow/core/kernels/iterator_ops.cc69
-rw-r--r--tensorflow/core/kernels/tensor_array_ops.cc12
-rw-r--r--tensorflow/core/util/tensor_bundle/tensor_bundle.cc54
-rw-r--r--tensorflow/python/kernel_tests/basic_gpu_test.py57
-rw-r--r--tensorflow/python/kernel_tests/tensor_array_ops_test.py48
-rw-r--r--tensorflow/python/layers/base.py28
-rw-r--r--tensorflow/python/layers/convolutional.py40
-rw-r--r--tensorflow/python/layers/pooling.py17
-rw-r--r--tensorflow/python/ops/control_flow_ops.py8
-rw-r--r--tensorflow/python/ops/tensor_array_grad.py36
-rw-r--r--tensorflow/python/ops/tensor_array_ops.py51
-rw-r--r--tensorflow/python/training/queue_runner_impl.py14
-rw-r--r--tensorflow/python/training/queue_runner_test.py28
-rw-r--r--tensorflow/python/util/tf_should_use.py93
-rw-r--r--tensorflow/python/util/tf_should_use_test.py33
-rw-r--r--tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt2
-rwxr-xr-xtensorflow/tools/ci_build/install/install_pip_packages.sh3
-rwxr-xr-xtensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh3
-rw-r--r--tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat14
-rw-r--r--tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat2
-rw-r--r--tensorflow/tools/pip_package/setup.py1
41 files changed, 921 insertions, 370 deletions
diff --git a/README.md b/README.md
index a88e72880b..b8cb4ba8ba 100644
--- a/README.md
+++ b/README.md
@@ -38,8 +38,8 @@ People who are a little more adventurous can also try our nightly binaries:
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc1-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc1-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
diff --git a/RELEASE.md b/RELEASE.md
index 5b35f91477..64d9069e61 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,6 +1,7 @@
# Release 1.2.0
## Major Features and Improvements
+* Python 3.6 support on Windows.
* Added `tf.layers.conv3d_transpose` layer for spatio temporal deconvolution.
* Added `tf.Session.make_callable()`, which provides a lower overhead means of running a similar step multiple times.
* Added ibverbs-based RDMA support to contrib (courtesy @junshi15 from Yahoo).
@@ -48,6 +49,15 @@
be replaced by calling `embedding_lookup` or `layers.dense` as pre- or post-
processing of the rnn. For RNN decoding, this functionality has been replaced
with an alternative API in `tf.contrib.seq2seq`.
+* Intel MKL Integration (https://software.intel.com/en-us/articles/tensorflow-optimizations-on-modern-intel-architecture). Intel developed a number of
+ optimized deep learning primitives: In addition to matrix multiplication and
+ convolution, these building blocks include:
+ Direct batched convolution
+ Pooling: maximum, minimum, average
+ Normalization: LRN, batch normalization
+ Activation: rectified linear unit (ReLU)
+ Data manipulation: multi-dimensional transposition (conversion), split,
+ concat, sum and scale.
## Deprecations
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index 2f7f8ebbae..68cd3623c0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -150,7 +150,8 @@ class MapDatasetTest(test.TestCase):
results.append(sess.run(get_next))
except errors.OutOfRangeError:
return
- threads = [self.checkedThread(target=iterator_thread) for _ in range(8)]
+ threads = [self.checkedThread(target=iterator_thread)
+ for _ in range(64)]
for t in threads:
t.start()
for t in threads:
diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py
index 905ef13e14..ed2b251b31 100644
--- a/tensorflow/contrib/keras/python/keras/backend.py
+++ b/tensorflow/contrib/keras/python/keras/backend.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes as dtypes_module
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.layers import base as tf_base_layers
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import control_flow_ops
@@ -261,16 +262,9 @@ def get_uid(prefix=''):
2
```
"""
- layer_name_uids_collection = ops.get_collection('LAYER_NAME_UIDS')
- if not layer_name_uids_collection:
- layer_name_uids = {}
- ops.add_to_collection('LAYER_NAME_UIDS', layer_name_uids)
- else:
- layer_name_uids = layer_name_uids_collection[0]
- if prefix not in layer_name_uids:
- layer_name_uids[prefix] = 1
- else:
- layer_name_uids[prefix] += 1
+ graph = ops.get_default_graph()
+ layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph]
+ layer_name_uids[prefix] += 1
return layer_name_uids[prefix]
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
index 3b7f31a3e9..131637f03f 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
@@ -113,17 +113,20 @@ class Conv2DTest(test.TestCase):
if padding == 'same' and strides != (1, 1):
continue
- with self.test_session():
- testing_utils.layer_test(
- keras.layers.Conv2D,
- kwargs={
- 'filters': filters,
- 'kernel_size': kernel_size,
- 'padding': padding,
- 'strides': strides,
- 'data_format': 'channels_first'
- },
- input_shape=(num_samples, stack_size, num_row, num_col))
+ with self.test_session(use_gpu=True):
+ # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+ # TODO(b/62340061): Support channels_first on CPU.
+ if test.is_gpu_available(cuda_only=True):
+ testing_utils.layer_test(
+ keras.layers.Conv2D,
+ kwargs={
+ 'filters': filters,
+ 'kernel_size': kernel_size,
+ 'padding': padding,
+ 'strides': strides,
+ 'data_format': 'channels_first'
+ },
+ input_shape=(num_samples, stack_size, num_row, num_col))
def test_convolution_2d_regularization(self):
# regularizers
diff --git a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
index 76eaf50138..6808348414 100644
--- a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
@@ -100,15 +100,18 @@ class Pooling2DTest(test.TestCase):
'padding': 'valid',
'pool_size': (3, 3)},
input_shape=(3, 5, 6, 4))
- testing_utils.layer_test(
- keras.layers.AveragePooling2D,
- kwargs={
- 'strides': (1, 1),
- 'padding': 'valid',
- 'pool_size': (2, 2),
- 'data_format': 'channels_first'
- },
- input_shape=(3, 4, 5, 6))
+ # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+ # TODO(b/62340061): Support channels_first on CPU.
+ if test.is_gpu_available(cuda_only=True):
+ testing_utils.layer_test(
+ keras.layers.AveragePooling2D,
+ kwargs={
+ 'strides': (1, 1),
+ 'padding': 'valid',
+ 'pool_size': (2, 2),
+ 'data_format': 'channels_first'
+ },
+ input_shape=(3, 4, 5, 6))
class Pooling3DTest(test.TestCase):
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 03af377149..c7cae56049 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -237,6 +237,7 @@ py_test(
"//tensorflow/python:training",
"//tensorflow/python:variable_scope",
"//tensorflow/python:variables",
+ "//tensorflow/python/feature_column",
"//third_party/py/numpy",
],
)
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index e1a27335ab..68159fe9b9 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -476,7 +476,7 @@ class _SparseColumn(
return self._do_transform(input_tensor)
@property
- def _parse_example_config(self):
+ def _parse_example_spec(self):
return self.config
@property
@@ -802,7 +802,7 @@ class _WeightedSparseColumn(
inputs.get(self.sparse_id_column), inputs.get(self.weight_column_name))
@property
- def _parse_example_config(self):
+ def _parse_example_spec(self):
return self.config
@property
@@ -960,7 +960,7 @@ class _OneHotColumn(
return self._to_dnn_input_layer(inputs.get(self.sparse_id_column))
@property
- def _parse_example_config(self):
+ def _parse_example_spec(self):
return self.config
@@ -1110,7 +1110,7 @@ class _EmbeddingColumn(
return inputs.get(self.sparse_id_column)
@property
- def _parse_example_config(self):
+ def _parse_example_spec(self):
return self.config
@@ -1467,7 +1467,7 @@ class _ScatteredEmbeddingColumn(
return inputs.get(self.column_name)
@property
- def _parse_example_config(self):
+ def _parse_example_spec(self):
return self.config
@@ -1817,7 +1817,7 @@ class _RealValuedColumn(
self._normalized_input_tensor(inputs.get(self.name)))
@property
- def _parse_example_config(self):
+ def _parse_example_spec(self):
return self.config
@@ -2081,7 +2081,7 @@ class _BucketizedColumn(
_LazyBuilderByColumnsToTensor(columns_to_tensors))
@property
- def _parse_example_config(self):
+ def _parse_example_spec(self):
return self.config
@property
@@ -2311,7 +2311,7 @@ class _CrossedColumn(
_LazyBuilderByColumnsToTensor(columns_to_tensors))
@property
- def _parse_example_config(self):
+ def _parse_example_spec(self):
return self.config
@property
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index b6a8b6bdda..ce3bc23cf6 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -27,6 +27,7 @@ import numpy as np
from tensorflow.contrib.layers.python.layers import feature_column as fc
from tensorflow.contrib.layers.python.layers import feature_column_ops
+from tensorflow.python.feature_column import feature_column as fc_core
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -697,11 +698,6 @@ class FeatureColumnTest(test.TestCase):
"str_id_weights_column")
real_valued_col1 = fc.real_valued_column("real_valued_column1")
real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
- real_valued_col3 = fc._real_valued_var_len_column(
- "real_valued_column3", is_sparse=True)
- real_valued_col4 = fc._real_valued_var_len_column(
- "real_valued_column4", dtype=dtypes.int64, default_value=0,
- is_sparse=False)
bucketized_col1 = fc.bucketized_column(
fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
bucketized_col2 = fc.bucketized_column(
@@ -717,8 +713,8 @@ class FeatureColumnTest(test.TestCase):
feature_columns = set([
sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
int64_sparse_id_col, real_valued_col1, real_valued_col2,
- real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
- cross_col, one_hot_col, scattered_embedding_col
+ bucketized_col1, bucketized_col2, cross_col, one_hot_col,
+ scattered_embedding_col
])
expected_config = {
"sparse_column":
@@ -739,11 +735,6 @@ class FeatureColumnTest(test.TestCase):
"real_valued_column2":
parsing_ops.FixedLenFeature(
[5], dtype=dtypes.float32),
- "real_valued_column3":
- parsing_ops.VarLenFeature(dtype=dtypes.float32),
- "real_valued_column4":
- parsing_ops.FixedLenSequenceFeature(
- [], dtype=dtypes.int64, allow_missing=True, default_value=0),
"real_valued_column_for_bucketization1":
parsing_ops.FixedLenFeature(
[1], dtype=dtypes.float32),
@@ -763,6 +754,10 @@ class FeatureColumnTest(test.TestCase):
config = fc.create_feature_spec_for_parsing(feature_columns)
self.assertDictEqual(expected_config, config)
+ # Tests that contrib feature columns work with core library:
+ config_core = fc_core.make_parse_example_spec(feature_columns)
+ self.assertDictEqual(expected_config, config_core)
+
# Test that the same config is parsed out if we pass a dictionary.
feature_columns_dict = {
str(i): val
@@ -771,6 +766,23 @@ class FeatureColumnTest(test.TestCase):
config = fc.create_feature_spec_for_parsing(feature_columns_dict)
self.assertDictEqual(expected_config, config)
+ def testCreateFeatureSpec_ExperimentalColumns(self):
+ real_valued_col0 = fc._real_valued_var_len_column(
+ "real_valued_column0", is_sparse=True)
+ real_valued_col1 = fc._real_valued_var_len_column(
+ "real_valued_column1", dtype=dtypes.int64, default_value=0,
+ is_sparse=False)
+ feature_columns = set([real_valued_col0, real_valued_col1])
+ expected_config = {
+ "real_valued_column0": parsing_ops.VarLenFeature(dtype=dtypes.float32),
+ "real_valued_column1":
+ parsing_ops.FixedLenSequenceFeature(
+ [], dtype=dtypes.int64, allow_missing=True, default_value=0),
+ }
+
+ config = fc.create_feature_spec_for_parsing(feature_columns)
+ self.assertDictEqual(expected_config, config)
+
def testCreateFeatureSpec_RealValuedColumnWithDefaultValue(self):
real_valued_col1 = fc.real_valued_column(
"real_valued_column1", default_value=2)
diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py
index dc159b93a3..d36d7e16de 100644
--- a/tensorflow/contrib/seq2seq/__init__.py
+++ b/tensorflow/contrib/seq2seq/__init__.py
@@ -47,6 +47,7 @@ _allowed_symbols = [
"FinalBeamSearchDecoderOutput",
"gather_tree",
"GreedyEmbeddingHelper",
+ "SampleEmbeddingHelper",
"ScheduledEmbeddingTrainingHelper",
"ScheduledOutputTrainingHelper",
"TrainingHelper",
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index ea34333360..99e51589c9 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.seq2seq.python.ops import attention_wrapper as wrapper
from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
from tensorflow.contrib.seq2seq.python.ops import basic_decoder
from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import variables
@@ -64,7 +65,8 @@ class AttentionWrapperTest(test.TestCase):
def assertAllCloseOrEqual(self, x, y, **kwargs):
if isinstance(x, np.ndarray) or isinstance(x, float):
- return super(AttentionWrapperTest, self).assertAllClose(x, y, **kwargs)
+ return super(AttentionWrapperTest, self).assertAllClose(
+ x, y, atol=1e-4, **kwargs)
else:
self.assertAllEqual(x, y, **kwargs)
@@ -84,7 +86,7 @@ class AttentionWrapperTest(test.TestCase):
expected_final_alignment_history=None,
attention_layer_size=6,
name=''):
- encoder_sequence_length = [3, 2, 3, 1, 0]
+ encoder_sequence_length = [3, 2, 3, 1, 1]
decoder_sequence_length = [2, 0, 1, 2, 3]
batch_size = 5
encoder_max_time = 8
@@ -98,10 +100,14 @@ class AttentionWrapperTest(test.TestCase):
else:
attention_depth = encoder_output_depth
- decoder_inputs = np.random.randn(batch_size, decoder_max_time,
- input_depth).astype(np.float32)
- encoder_outputs = np.random.randn(batch_size, encoder_max_time,
- encoder_output_depth).astype(np.float32)
+ decoder_inputs = array_ops.placeholder_with_default(
+ np.random.randn(batch_size, decoder_max_time,
+ input_depth).astype(np.float32),
+ shape=(None, None, input_depth))
+ encoder_outputs = array_ops.placeholder_with_default(
+ np.random.randn(batch_size, encoder_max_time,
+ encoder_output_depth).astype(np.float32),
+ shape=(None, None, encoder_output_depth))
attention_mechanism = create_attention_mechanism(
num_units=attention_mechanism_depth,
@@ -152,7 +158,7 @@ class AttentionWrapperTest(test.TestCase):
# Remove the history from final_state for purposes of the
# remainder of the tests.
final_state = final_state._replace(alignment_history=()) # pylint: disable=protected-access
- self.assertEqual((None, batch_size, encoder_max_time),
+ self.assertEqual((None, batch_size, None),
tuple(state_alignment_history.get_shape().as_list()))
else:
state_alignment_history = ()
@@ -190,16 +196,17 @@ class AttentionWrapperTest(test.TestCase):
expected_final_output = BasicDecoderOutput(
rnn_output=ResultSummary(
- shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00083043973),
- sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
+ shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052250605),
+ sample_id=ResultSummary(
+ shape=(5, 3), dtype=dtype('int32'), mean=1.4))
expected_final_state = AttentionWrapperState(
cell_state=LSTMStateTuple(
c=ResultSummary(
- shape=(5, 9), dtype=dtype('float32'), mean=-0.0039763632),
+ shape=(5, 9), dtype=dtype('float32'), mean=-0.0040092287),
h=ResultSummary(
- shape=(5, 9), dtype=dtype('float32'), mean=-0.0019849765)),
+ shape=(5, 9), dtype=dtype('float32'), mean=-0.0020015112)),
attention=ResultSummary(
- shape=(5, 6), dtype=dtype('float32'), mean=-0.00081052497),
+ shape=(5, 6), dtype=dtype('float32'), mean=-0.0052052638),
time=3,
alignments=ResultSummary(
shape=(5, 8), dtype=dtype('float32'), mean=0.125),
@@ -221,17 +228,17 @@ class AttentionWrapperTest(test.TestCase):
expected_final_output = BasicDecoderOutput(
rnn_output=ResultSummary(
- shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00040482997),
+ shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00597103),
sample_id=ResultSummary(
- shape=(5, 3), dtype=dtype('int32'), mean=1.8666666666666667))
+ shape=(5, 3), dtype=dtype('int32'), mean=1.4))
expected_final_state = AttentionWrapperState(
cell_state=LSTMStateTuple(
c=ResultSummary(
- shape=(5, 9), dtype=dtype('float32'), mean=-0.0039785588),
+ shape=(5, 9), dtype=dtype('float32'), mean=-0.0040052128),
h=ResultSummary(
- shape=(5, 9), dtype=dtype('float32'), mean=-0.0019861322)),
+ shape=(5, 9), dtype=dtype('float32'), mean=-0.0019996136)),
attention=ResultSummary(
- shape=(5, 6), dtype=dtype('float32'), mean=-0.00038488387),
+ shape=(5, 6), dtype=dtype('float32'), mean=-0.00595117),
time=3,
alignments=ResultSummary(
shape=(5, 8), dtype=dtype('float32'), mean=0.125),
@@ -248,16 +255,17 @@ class AttentionWrapperTest(test.TestCase):
expected_final_output = BasicDecoderOutput(
rnn_output=ResultSummary(
- shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00084602338),
- sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
+ shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
+ sample_id=ResultSummary(
+ shape=(5, 3), dtype=dtype('int32'), mean=1.4666666666666666))
expected_final_state = AttentionWrapperState(
cell_state=LSTMStateTuple(
c=ResultSummary(
- shape=(5, 9), dtype=dtype('float32'), mean=-0.0039764317),
+ shape=(5, 9), dtype=dtype('float32'), mean=-0.004009536),
h=ResultSummary(
- shape=(5, 9), dtype=dtype('float32'), mean=-0.0019850098)),
+ shape=(5, 9), dtype=dtype('float32'), mean=-0.0020016613)),
attention=ResultSummary(
- shape=(5, 6), dtype=dtype('float32'), mean=-0.00080144603),
+ shape=(5, 6), dtype=dtype('float32'), mean=-0.0051812846),
time=3,
alignments=ResultSummary(
shape=(5, 8), dtype=dtype('float32'), mean=0.125),
@@ -276,16 +284,17 @@ class AttentionWrapperTest(test.TestCase):
expected_final_output = BasicDecoderOutput(
rnn_output=ResultSummary(
- shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00084602338),
- sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
+ shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
+ sample_id=ResultSummary(
+ shape=(5, 3), dtype=dtype('int32'), mean=1.4666666666666666))
expected_final_state = AttentionWrapperState(
cell_state=LSTMStateTuple(
c=ResultSummary(
- shape=(5, 9), dtype=dtype('float32'), mean=-0.0039764317),
+ shape=(5, 9), dtype=dtype('float32'), mean=-0.004009536),
h=ResultSummary(
- shape=(5, 9), dtype=dtype('float32'), mean=-0.0019850098)),
+ shape=(5, 9), dtype=dtype('float32'), mean=-0.0020016613)),
attention=ResultSummary(
- shape=(5, 6), dtype=dtype('float32'), mean=-0.00080144603),
+ shape=(5, 6), dtype=dtype('float32'), mean=-0.0051812846),
time=3,
alignments=ResultSummary(
shape=(5, 8), dtype=dtype('float32'), mean=0.125),
@@ -303,17 +312,17 @@ class AttentionWrapperTest(test.TestCase):
expected_final_output = BasicDecoderOutput(
rnn_output=ResultSummary(
- shape=(5, 3, 10), dtype=dtype('float32'), mean=0.019546926),
+ shape=(5, 3, 10), dtype=dtype('float32'), mean=0.117389656),
sample_id=ResultSummary(
- shape=(5, 3), dtype=dtype('int32'), mean=2.7999999999999998))
+ shape=(5, 3), dtype=dtype('int32'), mean=4.5999999999999996))
expected_final_state = AttentionWrapperState(
cell_state=LSTMStateTuple(
c=ResultSummary(
- shape=(5, 9), dtype=dtype('float32'), mean=-0.0041728448),
+ shape=(5, 9), dtype=dtype('float32'), mean=-0.0063607907),
h=ResultSummary(
- shape=(5, 9), dtype=dtype('float32'), mean=-0.002085865)),
+ shape=(5, 9), dtype=dtype('float32'), mean=-0.00323448)),
attention=ResultSummary(
- shape=(5, 10), dtype=dtype('float32'), mean=0.019546915),
+ shape=(5, 10), dtype=dtype('float32'), mean=0.117389656,),
time=3,
alignments=ResultSummary(
shape=(5, 8), dtype=dtype('float32'), mean=0.125),
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
index 600adea189..cb12bc9450 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
@@ -27,8 +27,10 @@ from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import core as layers_core
+from tensorflow.python.ops import init_ops
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import variables
+from tensorflow.python.ops import variable_scope
from tensorflow.python.platform import test
# pylint: enable=g-import-not-at-top
@@ -189,6 +191,76 @@ class BasicDecoderTest(test.TestCase):
self.assertAllEqual(expected_step_next_inputs,
sess_results["step_next_inputs"])
+ def testStepWithSampleEmbeddingHelper(self):
+ batch_size = 5
+ vocabulary_size = 7
+ cell_depth = vocabulary_size # cell's logits must match vocabulary size
+ input_depth = 10
+ np.random.seed(0)
+ start_tokens = np.random.randint(0, vocabulary_size, size=batch_size)
+ end_token = 1
+
+ with self.test_session(use_gpu=True) as sess:
+ with variable_scope.variable_scope(
+ "testStepWithSampleEmbeddingHelper",
+ initializer=init_ops.constant_initializer(0.01)):
+ embeddings = np.random.randn(vocabulary_size,
+ input_depth).astype(np.float32)
+ cell = rnn_cell.LSTMCell(vocabulary_size)
+ helper = helper_py.SampleEmbeddingHelper(embeddings, start_tokens,
+ end_token, seed=0)
+ my_decoder = basic_decoder.BasicDecoder(
+ cell=cell,
+ helper=helper,
+ initial_state=cell.zero_state(
+ dtype=dtypes.float32, batch_size=batch_size))
+ output_size = my_decoder.output_size
+ output_dtype = my_decoder.output_dtype
+ self.assertEqual(
+ basic_decoder.BasicDecoderOutput(cell_depth,
+ tensor_shape.TensorShape([])),
+ output_size)
+ self.assertEqual(
+ basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+ output_dtype)
+
+ (first_finished, first_inputs, first_state) = my_decoder.initialize()
+ (step_outputs, step_state, step_next_inputs,
+ step_finished) = my_decoder.step(
+ constant_op.constant(0), first_inputs, first_state)
+ batch_size_t = my_decoder.batch_size
+
+ self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+ self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+ self.assertTrue(
+ isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+ self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+ self.assertEqual((batch_size,), step_outputs[1].get_shape())
+ self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+ self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+ self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+ self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+ sess.run(variables.global_variables_initializer())
+ sess_results = sess.run({
+ "batch_size": batch_size_t,
+ "first_finished": first_finished,
+ "first_inputs": first_inputs,
+ "first_state": first_state,
+ "step_outputs": step_outputs,
+ "step_state": step_state,
+ "step_next_inputs": step_next_inputs,
+ "step_finished": step_finished
+ })
+
+ sample_ids = sess_results["step_outputs"].sample_id
+ expected_step_finished = (sample_ids == end_token)
+ expected_step_next_inputs = embeddings[sample_ids]
+ self.assertAllEqual(expected_step_finished,
+ sess_results["step_finished"])
+ self.assertAllEqual(expected_step_next_inputs,
+ sess_results["step_next_inputs"])
+
def testStepWithScheduledEmbeddingTrainingHelper(self):
sequence_length = [3, 4, 3, 1, 0]
batch_size = 5
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
index 873a39154f..3d0627467a 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -141,6 +141,7 @@ class TestBeamStep(test.TestCase):
outputs, next_beam_state = beam_search_decoder._beam_search_step(
time=2,
logits=logits,
+ next_cell_state=dummy_cell_state,
beam_state=beam_state,
batch_size=ops.convert_to_tensor(self.batch_size),
beam_width=self.beam_width,
@@ -195,6 +196,7 @@ class TestBeamStep(test.TestCase):
outputs, next_beam_state = beam_search_decoder._beam_search_step(
time=2,
logits=logits,
+ next_cell_state=dummy_cell_state,
beam_state=beam_state,
batch_size=ops.convert_to_tensor(self.batch_size),
beam_width=self.beam_width,
@@ -224,8 +226,8 @@ class TestBeamStep(test.TestCase):
class BeamSearchDecoderTest(test.TestCase):
def _testDynamicDecodeRNN(self, time_major, has_attention):
- encoder_sequence_length = [3, 2, 3, 1, 0]
- decoder_sequence_length = [2, 0, 1, 2, 3]
+ encoder_sequence_length = np.array([3, 2, 3, 1, 1])
+ decoder_sequence_length = np.array([2, 0, 1, 2, 3])
batch_size = 5
decoder_max_time = 4
input_depth = 7
@@ -240,11 +242,15 @@ class BeamSearchDecoderTest(test.TestCase):
beam_width = 3
with self.test_session() as sess:
+ batch_size_tensor = constant_op.constant(batch_size)
embedding = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
cell = rnn_cell.LSTMCell(cell_depth)
+ initial_state = cell.zero_state(batch_size, dtypes.float32)
if has_attention:
- inputs = np.random.randn(batch_size, decoder_max_time,
- input_depth).astype(np.float32)
+ inputs = array_ops.placeholder_with_default(
+ np.random.randn(batch_size, decoder_max_time,
+ input_depth).astype(np.float32),
+ shape=(None, None, input_depth))
tiled_inputs = beam_search_decoder.tile_batch(
inputs, multiplier=beam_width)
tiled_sequence_length = beam_search_decoder.tile_batch(
@@ -253,17 +259,22 @@ class BeamSearchDecoderTest(test.TestCase):
num_units=attention_depth,
memory=tiled_inputs,
memory_sequence_length=tiled_sequence_length)
+ initial_state = beam_search_decoder.tile_batch(
+ initial_state, multiplier=beam_width)
cell = attention_wrapper.AttentionWrapper(
cell=cell,
attention_mechanism=attention_mechanism,
attention_layer_size=attention_depth,
alignment_history=False)
cell_state = cell.zero_state(
- dtype=dtypes.float32, batch_size=batch_size * beam_width)
+ dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width)
+ if has_attention:
+ cell_state = cell_state.clone(
+ cell_state=initial_state)
bsd = beam_search_decoder.BeamSearchDecoder(
cell=cell,
embedding=embedding,
- start_tokens=batch_size * [start_token],
+ start_tokens=array_ops.fill([batch_size_tensor], start_token),
end_token=end_token,
initial_state=cell_state,
beam_width=beam_width,
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 64ffb6ca40..637175674b 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -112,6 +112,18 @@ def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined):
return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), memory)
+def _maybe_mask_score(score, memory_sequence_length, score_mask_value):
+ if memory_sequence_length is None:
+ return score
+ message = ("All values in memory_sequence_length must be greater than zero.")
+ with ops.control_dependencies(
+ [check_ops.assert_positive(memory_sequence_length, message=message)]):
+ score_mask = array_ops.sequence_mask(
+ memory_sequence_length, maxlen=array_ops.shape(score)[1])
+ score_mask_values = score_mask_value * array_ops.ones_like(score)
+ return array_ops.where(score_mask, score, score_mask_values)
+
+
class _BaseAttentionMechanism(AttentionMechanism):
"""A base AttentionMechanism class providing common functionality.
@@ -127,6 +139,7 @@ class _BaseAttentionMechanism(AttentionMechanism):
memory_sequence_length=None,
memory_layer=None,
check_inner_dims_defined=True,
+ score_mask_value=float("-inf"),
name=None):
"""Construct base AttentionMechanism class.
@@ -149,6 +162,9 @@ class _BaseAttentionMechanism(AttentionMechanism):
check_inner_dims_defined: Python boolean. If `True`, the `memory`
argument's shape is checked to ensure all but the two outermost
dimensions are fully defined.
+ score_mask_value: (optional): The mask value for score before passing into
+ `probability_fn`. The default is -inf. Only used if
+ `memory_sequence_length` is not None.
name: Name to use when creating ops.
"""
if (query_layer is not None
@@ -164,7 +180,10 @@ class _BaseAttentionMechanism(AttentionMechanism):
if not callable(probability_fn):
raise TypeError("probability_fn must be callable, saw type: %s" %
type(probability_fn).__name__)
- self._probability_fn = probability_fn
+ self._probability_fn = lambda score, prev: ( # pylint:disable=g-long-lambda
+ probability_fn(
+ _maybe_mask_score(score, memory_sequence_length, score_mask_value),
+ prev))
with ops.name_scope(
name, "BaseAttentionMechanismInit", nest.flatten(memory)):
self._values = _prepare_memory(
@@ -245,6 +264,7 @@ class LuongAttention(_BaseAttentionMechanism):
memory_sequence_length=None,
scale=False,
probability_fn=None,
+ score_mask_value=float("-inf"),
name="LuongAttention"):
"""Construct the AttentionMechanism mechanism.
@@ -260,6 +280,9 @@ class LuongAttention(_BaseAttentionMechanism):
probabilities. The default is @{tf.nn.softmax}. Other options include
@{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
Its signature should be: `probabilities = probability_fn(score)`.
+ score_mask_value: (optional): The mask value for score before passing into
+ `probability_fn`. The default is -inf. Only used if
+ `memory_sequence_length` is not None.
name: Name to use when creating ops.
"""
# For LuongAttention, we only transform the memory layer; thus
@@ -274,6 +297,7 @@ class LuongAttention(_BaseAttentionMechanism):
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
+ score_mask_value=score_mask_value,
name=name)
self._num_units = num_units
self._scale = scale
@@ -362,6 +386,7 @@ class BahdanauAttention(_BaseAttentionMechanism):
memory_sequence_length=None,
normalize=False,
probability_fn=None,
+ score_mask_value=float("-inf"),
name="BahdanauAttention"):
"""Construct the Attention mechanism.
@@ -377,6 +402,9 @@ class BahdanauAttention(_BaseAttentionMechanism):
probabilities. The default is @{tf.nn.softmax}. Other options include
@{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
Its signature should be: `probabilities = probability_fn(score)`.
+ score_mask_value: (optional): The mask value for score before passing into
+ `probability_fn`. The default is -inf. Only used if
+ `memory_sequence_length` is not None.
name: Name to use when creating ops.
"""
if probability_fn is None:
@@ -390,6 +418,7 @@ class BahdanauAttention(_BaseAttentionMechanism):
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
+ score_mask_value=score_mask_value,
name=name)
self._num_units = num_units
self._normalize = normalize
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index c9be517fad..d86275f864 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -72,10 +72,30 @@ class FinalBeamSearchDecoderOutput(
pass
+def _tile_batch(t, multiplier):
+ """Core single-tensor implementation of tile_batch."""
+ t = ops.convert_to_tensor(t, name="t")
+ shape_t = array_ops.shape(t)
+ if t.shape.ndims is None or t.shape.ndims < 1:
+ raise ValueError("t must have statically known rank")
+ tiling = [1] * (t.shape.ndims + 1)
+ tiling[1] = multiplier
+ tiled_static_batch_size = (
+ t.shape[0].value * multiplier if t.shape[0].value is not None else None)
+ tiled = array_ops.tile(array_ops.expand_dims(t, 1), tiling)
+ tiled = array_ops.reshape(
+ tiled, array_ops.concat(([shape_t[0] * multiplier], shape_t[1:]), 0))
+ tiled.set_shape(
+ tensor_shape.TensorShape(
+ [tiled_static_batch_size]).concatenate(t.shape[1:]))
+ return tiled
+
+
def tile_batch(t, multiplier, name=None):
- """Tile the batch dimension of tensor t.
+ """Tile the batch dimension of a (possibly nested structure of) tensor(s) t.
- This function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of
+ For each tensor t in a (possibly nested structure) of tensors,
+ this function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of
minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape
`[batch_size * multiplier, s0, s1, ...]` composed of minibatch entries
`t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is repeated
@@ -87,27 +107,25 @@ def tile_batch(t, multiplier, name=None):
name: Name scope for any created operations.
Returns:
- A `Tensor` shaped `[batch_size * multiplier, ...]`.
+ A (possibly nested structure of) `Tensor` shaped
+ `[batch_size * multiplier, ...]`.
Raises:
- ValueError: if `t` does not have a statically known rank or it's < 1.
+ ValueError: if tensor(s) `t` do not have a statically known rank or
+ the rank is < 1.
"""
- with ops.name_scope(name, "tile_batch", [t, multiplier]):
- t = ops.convert_to_tensor(t, name="t")
- shape_t = array_ops.shape(t)
- if t.shape.ndims is None or t.shape.ndims < 1:
- raise ValueError("t must have statically known rank")
- tiling = [1] * (t.shape.ndims + 1)
- tiling[1] = multiplier
- tiled_static_batch_size = (
- t.shape[0].value * multiplier if t.shape[0].value is not None else None)
- tiled = array_ops.tile(array_ops.expand_dims(t, 1), tiling)
- tiled = array_ops.reshape(
- tiled, array_ops.concat(([shape_t[0] * multiplier], shape_t[1:]), 0))
- tiled.set_shape(
- tensor_shape.TensorShape(
- [tiled_static_batch_size]).concatenate(t.shape[1:]))
- return tiled
+ flat_t = nest.flatten(t)
+ with ops.name_scope(name, "tile_batch", flat_t + [multiplier]):
+ return nest.map_structure(lambda t_: _tile_batch(t_, multiplier), t)
+
+
+def _check_maybe(t):
+ if isinstance(t, tensor_array_ops.TensorArray):
+ raise TypeError(
+ "TensorArray state is not supported by BeamSearchDecoder: %s" % t.name)
+ if t.shape.ndims is None:
+ raise ValueError(
+ "Expected tensor (%s) to have known rank, but ndims == None." % t)
class BeamSearchDecoder(decoder.Decoder):
@@ -278,7 +296,7 @@ class BeamSearchDecoder(decoder.Decoder):
A reshaped version of t with dimension [batch_size * beam_width, s].
"""
if isinstance(s, ops.Tensor):
- s = tensor_util.constant_value_as_shape(s)
+ s = tensor_shape.as_shape(tensor_util.constant_value(s))
else:
s = tensor_shape.TensorShape(s)
t_shape = array_ops.shape(t)
@@ -312,7 +330,7 @@ class BeamSearchDecoder(decoder.Decoder):
are known statically).
"""
if isinstance(s, ops.Tensor):
- s = tensor_util.constant_value_as_shape(s)
+ s = tensor_shape.TensorShape(tensor_util.constant_value(s))
else:
s = tensor_shape.TensorShape(s)
t_shape = array_ops.shape(t)
@@ -351,13 +369,7 @@ class BeamSearchDecoder(decoder.Decoder):
TypeError: If t is an instance of TensorArray.
ValueError: If the rank of t is not statically known.
"""
- if isinstance(t, tensor_array_ops.TensorArray):
- raise TypeError(
- "TensorArray state is not supported by BeamSearchDecoder: %s"
- % t.name)
- if t.shape.ndims is None:
- raise ValueError(
- "Expected tensor (%s) to have known rank, but ndims == None." % t)
+ _check_maybe(t)
if t.shape.ndims >= 1:
return self._split_batch_beams(t, s)
else:
@@ -380,13 +392,7 @@ class BeamSearchDecoder(decoder.Decoder):
TypeError: If t is an instance of TensorArray.
ValueError: If the rank of t is not statically known.
"""
- if isinstance(t, tensor_array_ops.TensorArray):
- raise TypeError(
- "TensorArray state is not supported by BeamSearchDecoder: %s"
- % t.name)
- if t.shape.ndims is None:
- raise ValueError(
- "Expected tensor (%s) to have known rank, but ndims == None." % t)
+ _check_maybe(t)
if t.shape.ndims >= 2:
return self._merge_batch_beams(t, s)
else:
@@ -417,7 +423,6 @@ class BeamSearchDecoder(decoder.Decoder):
self._maybe_merge_batch_beams,
cell_state, self._cell.state_size)
cell_outputs, next_cell_state = self._cell(inputs, cell_state)
-
cell_outputs = nest.map_structure(
lambda out: self._split_batch_beams(out, out.shape[1:]), cell_outputs)
next_cell_state = nest.map_structure(
@@ -430,11 +435,13 @@ class BeamSearchDecoder(decoder.Decoder):
beam_search_output, beam_search_state = _beam_search_step(
time=time,
logits=cell_outputs,
+ next_cell_state=next_cell_state,
beam_state=state,
batch_size=batch_size,
beam_width=beam_width,
end_token=end_token,
length_penalty_weight=length_penalty_weight)
+
finished = beam_search_state.finished
sample_ids = beam_search_output.predicted_ids
next_inputs = control_flow_ops.cond(
@@ -444,8 +451,8 @@ class BeamSearchDecoder(decoder.Decoder):
return (beam_search_output, beam_search_state, next_inputs, finished)
-def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
- end_token, length_penalty_weight):
+def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
+ beam_width, end_token, length_penalty_weight):
"""Performs a single step of Beam Search Decoding.
Args:
@@ -454,6 +461,8 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
continuations.
logits: Logits at the current time step. A tensor of shape
`[batch_size, beam_width, vocab_size]`
+ next_cell_state: The next state from the cell, e.g. an instance of
+ AttentionWrapperState if the cell is attentional.
beam_state: Current state of the beam search.
An instance of `BeamSearchDecoderState`.
batch_size: The batch size for this input.
@@ -520,10 +529,9 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
next_beam_probs = _tensor_gather_helper(
gather_indices=word_indices,
gather_from=total_probs,
- range_input=batch_size,
+ batch_size=batch_size,
range_size=beam_width * vocab_size,
- final_shape=[static_batch_size, beam_width])
-
+ gather_shape=[-1])
next_word_ids = math_ops.to_int32(word_indices % vocab_size)
next_beam_ids = math_ops.to_int32(word_indices / vocab_size)
@@ -531,9 +539,9 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
previously_finished = _tensor_gather_helper(
gather_indices=next_beam_ids,
gather_from=previously_finished,
- range_input=batch_size,
+ batch_size=batch_size,
range_size=beam_width,
- final_shape=[static_batch_size, beam_width])
+ gather_shape=[-1])
next_finished = math_ops.logical_or(previously_finished,
math_ops.equal(next_word_ids, end_token))
@@ -547,13 +555,28 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
next_prediction_len = _tensor_gather_helper(
gather_indices=next_beam_ids,
gather_from=beam_state.lengths,
- range_input=batch_size,
+ batch_size=batch_size,
range_size=beam_width,
- final_shape=[static_batch_size, beam_width])
+ gather_shape=[-1])
next_prediction_len += lengths_to_add
+ # Pick out the cell_states according to the next_beam_ids. We use a
+ # different gather_shape here because the cell_state tensors, i.e.
+ # the tensors that would be gathered from, all have dimension
+ # greater than two and we need to preserve those dimensions.
+ # pylint: disable=g-long-lambda
+ next_cell_state = nest.map_structure(
+ lambda gather_from: _maybe_tensor_gather_helper(
+ gather_indices=next_beam_ids,
+ gather_from=gather_from,
+ batch_size=batch_size,
+ range_size=beam_width,
+ gather_shape=[batch_size * beam_width, -1]),
+ next_cell_state)
+ # pylint: enable=g-long-lambda
+
next_state = BeamSearchDecoderState(
- cell_state=beam_state.cell_state,
+ cell_state=next_cell_state,
log_probs=next_beam_probs,
lengths=next_prediction_len,
finished=next_finished)
@@ -639,12 +662,74 @@ def _mask_probs(probs, eos_token, finished):
return finished_examples + non_finished_examples
-def _tensor_gather_helper(gather_indices, gather_from, range_input, range_size,
- final_shape):
- range_ = array_ops.expand_dims(math_ops.range(range_input) * range_size, 1)
+def _maybe_tensor_gather_helper(gather_indices, gather_from, batch_size,
+ range_size, gather_shape):
+ """Maybe applies _tensor_gather_helper.
+
+ This applies _tensor_gather_helper when the rank of gather_from is at least as
+ big as the length of gather_shape. This is used in conjunction with nest so
+ that we don't apply _tensor_gather_helper to inapplicable values like scalars.
+
+ Args:
+ gather_indices: The tensor indices that we use to gather.
+ gather_from: The tensor that we are gathering from.
+ batch_size: The batch size.
+ range_size: The number of values in each range. Likely equal to beam_width.
+ gather_shape: What we should reshape gather_from to in order to preserve the
+ correct values. An example is when gather_from is the attention from an
+ AttentionWrapperState with shape [batch_size, beam_width, attention_size].
+ There, we want to preserve the attention_size elements, so gather_shape is
+ [batch_size * beam_width, -1]. Then, upon reshape, we still have the
+ attention_size as desired.
+
+ Returns:
+ output: Gathered tensor of shape tf.shape(gather_from)[:1+len(gather_shape)]
+ or the original tensor if its dimensions are too small.
+ """
+ _check_maybe(gather_from)
+ if gather_from.shape.ndims >= len(gather_shape):
+ return _tensor_gather_helper(
+ gather_indices=gather_indices,
+ gather_from=gather_from,
+ batch_size=batch_size,
+ range_size=range_size,
+ gather_shape=gather_shape)
+ else:
+ return gather_from
+
+
+def _tensor_gather_helper(gather_indices, gather_from, batch_size,
+ range_size, gather_shape):
+ """Helper for gathering the right indices from the tensor.
+
+ This works by reshaping gather_from to gather_shape (e.g. [-1]) and then
+ gathering from that according to the gather_indices, which are offset by
+ the right amounts in order to preserve the batch order.
+
+ Args:
+ gather_indices: The tensor indices that we use to gather.
+ gather_from: The tensor that we are gathering from.
+ batch_size: The input batch size.
+ range_size: The number of values in each range. Likely equal to beam_width.
+ gather_shape: What we should reshape gather_from to in order to preserve the
+ correct values. An example is when gather_from is the attention from an
+ AttentionWrapperState with shape [batch_size, beam_width, attention_size].
+ There, we want to preserve the attention_size elements, so gather_shape is
+ [batch_size * beam_width, -1]. Then, upon reshape, we still have the
+ attention_size as desired.
+
+ Returns:
+ output: Gathered tensor of shape tf.shape(gather_from)[:1+len(gather_shape)]
+ """
+ range_ = array_ops.expand_dims(math_ops.range(batch_size) * range_size, 1)
gather_indices = array_ops.reshape(gather_indices + range_, [-1])
output = array_ops.gather(
- array_ops.reshape(gather_from, [-1]), gather_indices)
+ array_ops.reshape(gather_from, gather_shape), gather_indices)
+ final_shape = array_ops.shape(gather_from)[:1 + len(gather_shape)]
+ static_batch_size = tensor_util.constant_value(batch_size)
+ final_static_shape = (tensor_shape.TensorShape([static_batch_size])
+ .concatenate(
+ gather_from.shape[1:1 + len(gather_shape)]))
output = array_ops.reshape(output, final_shape)
- output.set_shape(final_shape)
+ output.set_shape(final_static_shape)
return output
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index bdd7d7ca73..bee7547935 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -41,6 +41,7 @@ __all__ = [
"Helper",
"TrainingHelper",
"GreedyEmbeddingHelper",
+ "SampleEmbeddingHelper",
"CustomHelper",
"ScheduledEmbeddingTrainingHelper",
"ScheduledOutputTrainingHelper",
@@ -512,3 +513,42 @@ class GreedyEmbeddingHelper(Helper):
lambda: self._start_inputs,
lambda: self._embedding_fn(sample_ids))
return (finished, next_inputs, state)
+
+
+class SampleEmbeddingHelper(GreedyEmbeddingHelper):
+ """A helper for use during inference.
+
+ Uses sampling (from a distribution) instead of argmax and passes the
+ result through an embedding layer to get the next input.
+ """
+
+ def __init__(self, embedding, start_tokens, end_token, seed=None):
+ """Initializer.
+
+ Args:
+ embedding: A callable that takes a vector tensor of `ids` (argmax ids),
+ or the `params` argument for `embedding_lookup`. The returned tensor
+ will be passed to the decoder input.
+ start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
+ end_token: `int32` scalar, the token that marks end of decoding.
+ seed: The sampling seed.
+
+ Raises:
+ ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not a
+ scalar.
+ """
+ super(SampleEmbeddingHelper, self).__init__(
+ embedding, start_tokens, end_token)
+ self._seed = seed
+
+ def sample(self, time, outputs, state, name=None):
+ """sample for SampleEmbeddingHelper."""
+ del time, state # unused by sample_fn
+ # Outputs are logits, we sample instead of argmax (greedy).
+ if not isinstance(outputs, ops.Tensor):
+ raise TypeError("Expected outputs to be a single Tensor, got: %s" %
+ type(outputs))
+ sample_id_sampler = categorical.Categorical(logits=outputs)
+ sample_ids = sample_id_sampler.sample(seed=self._seed)
+
+ return sample_ids
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 343d6faf5a..284195f8fc 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1556,8 +1556,6 @@ tf_cuda_library(
"graph/graph_constructor.cc",
"graph/graph_def_builder.cc",
"graph/graph_partition.cc",
- "graph/mkl_layout_pass.cc",
- "graph/mkl_tfconversion_pass.cc",
"graph/node_builder.cc",
"graph/optimizer_cse.cc",
"graph/subgraph.cc",
@@ -1619,6 +1617,8 @@ tf_cuda_library(
"common_runtime/threadpool_device.cc",
"common_runtime/threadpool_device_factory.cc",
"graph/gradients.cc",
+ "graph/mkl_layout_pass.cc",
+ "graph/mkl_tfconversion_pass.cc",
"graph/quantize_training.cc",
"public/session.h",
"public/session_options.h",
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index e2ad18f33b..d5dba27f45 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -24,6 +24,8 @@ limitations under the License.
#include <stdlib.h>
#include <string.h>
#include <algorithm>
+#include <map>
+#include <tuple>
#include <vector>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -174,6 +176,63 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
};
+// This factory helps to ensure that different GPU device objects that refer to
+// the same physical device and stream group id use the same stream group
+// object (and therefore the same CUDA streams). This is necessary since there
+// is a single memory allocator per device (see ProcessState::GetGPUAllocator)
+// and allocators must not be shared across streams.
+class BaseGPUDevice::StreamGroupFactory {
+ public:
+ // Returns the unique stream group for use with the stream defined by
+ // {gpu_id, stream_group_within_gpu}, creating it if it does not yet exist.
+ // This function is thread safe.
+ BaseGPUDevice::StreamGroup* GetOrCreate(int gpu_id,
+ int stream_group_within_gpu,
+ gpu::StreamExecutor* executor) {
+ mutex_lock guard(lock_);
+ StreamGroup* group = &streams_[key_type(gpu_id, stream_group_within_gpu)];
+ if (!group->compute) {
+ group->compute = new gpu::Stream(executor);
+ group->compute->Init();
+ VLOG(2) << "Created stream[" << stream_group_within_gpu
+ << "] = " << group->compute;
+
+ group->host_to_device = new gpu::Stream(executor);
+ group->host_to_device->Init();
+ VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu
+ << "] = " << group->host_to_device;
+
+ group->device_to_host = new gpu::Stream(executor);
+ group->device_to_host->Init();
+ VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu
+ << "] = " << group->device_to_host;
+
+ group->device_to_device = new gpu::Stream(executor);
+ group->device_to_device->Init();
+ VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu
+ << "] = " << group->device_to_device;
+ }
+ return group;
+ }
+
+ // Returns a reference to the StreamGroupFactory singleton. Note that this is
+ // never destroyed, so the objects it owns are never deleted.
+ static StreamGroupFactory& Global() {
+ static StreamGroupFactory* instance = new StreamGroupFactory();
+ return *instance;
+ }
+
+ private:
+ mutex lock_;
+ using key_type = std::tuple<int, int>;
+ std::map<key_type, StreamGroup> streams_;
+
+ // StreamGroupFactory cannot be created directly; Call
+ // StreamGroupFactory::Global() to get the global instance.
+ StreamGroupFactory() = default;
+ TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory);
+};
+
BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
Bytes memory_limit, const DeviceLocality& locality,
int gpu_id, const string& physical_device_desc,
@@ -193,12 +252,6 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
BaseGPUDevice::~BaseGPUDevice() {
delete gpu_device_info_;
for (auto ctx : device_contexts_) ctx->Unref();
- for (auto& stream_group : streams_) {
- delete stream_group.compute;
- delete stream_group.host_to_device;
- delete stream_group.device_to_host;
- delete stream_group.device_to_device;
- }
}
Status BaseGPUDevice::Init(const SessionOptions& options) {
@@ -217,27 +270,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
// Create the specified number of GPU streams
for (int i = 0; i < max_streams_; i++) {
- auto stream = new gpu::Stream(executor_);
- stream->Init();
- VLOG(2) << "Created stream[" << i << "] = " << stream;
-
- auto host_to_device_stream = new gpu::Stream(executor_);
- host_to_device_stream->Init();
- VLOG(2) << "Created host_to_device_stream[" << i
- << "] = " << host_to_device_stream;
-
- auto device_to_host_stream = new gpu::Stream(executor_);
- device_to_host_stream->Init();
- VLOG(2) << "Created device_to_host_stream[" << i
- << "] = " << device_to_host_stream;
-
- auto device_to_device_stream = new gpu::Stream(executor_);
- device_to_device_stream->Init();
- VLOG(2) << "Created device_to_device_stream[" << i
- << "] = " << device_to_device_stream;
-
- streams_.push_back({stream, host_to_device_stream, device_to_host_stream,
- device_to_device_stream});
+ streams_.push_back(
+ StreamGroupFactory::Global().GetOrCreate(gpu_id_, i, executor_));
size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
void* scratch_buffer = gpu_allocator_->AllocateRaw(
@@ -259,12 +293,12 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
"Failed to memcopy into scratch buffer for device ", gpu_id_);
}
- device_contexts_.push_back(
- new GPUDeviceContext(i, stream, host_to_device_stream,
- device_to_host_stream, device_to_device_stream));
+ device_contexts_.push_back(new GPUDeviceContext(
+ i, streams_.back()->compute, streams_.back()->host_to_device,
+ streams_.back()->device_to_host, streams_.back()->device_to_device));
}
gpu_device_info_ = new GpuDeviceInfo;
- gpu_device_info_->stream = streams_[0].compute;
+ gpu_device_info_->stream = streams_[0]->compute;
gpu_device_info_->default_context = device_contexts_[0];
gpu_device_info_->event_mgr = em_.get();
gpu_device_info_->gpu_id = gpu_id_;
@@ -511,7 +545,7 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
static_cast<ConcretePerOpGpuDevice*>(device);
DCHECK(concrete_device);
const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
- streams_[stream_id].compute->implementation()->CudaStreamMemberHack());
+ streams_[stream_id]->compute->implementation()->CudaStreamMemberHack());
concrete_device->Reinitialize(context, cuda_stream, gpu_id_, allocator,
scratch_[stream_id]);
}
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 370b3cc4f6..08c58867ee 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -20,7 +20,11 @@ limitations under the License.
#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
+#include <memory>
+#include <string>
+#include <unordered_map>
#include <vector>
+
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
@@ -96,12 +100,14 @@ class BaseGPUDevice : public LocalDevice {
private:
struct StreamGroup {
- gpu::Stream* compute;
- gpu::Stream* host_to_device;
- gpu::Stream* device_to_host;
- gpu::Stream* device_to_device;
+ gpu::Stream* compute = nullptr;
+ gpu::Stream* host_to_device = nullptr;
+ gpu::Stream* device_to_host = nullptr;
+ gpu::Stream* device_to_device = nullptr;
};
- gtl::InlinedVector<StreamGroup, 4> streams_;
+ class StreamGroupFactory;
+
+ gtl::InlinedVector<StreamGroup*, 4> streams_;
gtl::InlinedVector<char*, 4> scratch_;
std::vector<GPUDeviceContext*> device_contexts_;
GpuDeviceInfo* gpu_device_info_ = nullptr;
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index b2eaaa3492..70e66280fd 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -5216,6 +5216,7 @@ tf_kernel_library(
srcs = ["iterator_ops.cc"],
deps = [
":dataset",
+ ":ops_util",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:dataset_ops_op_lib",
"//tensorflow/core:framework",
diff --git a/tensorflow/core/kernels/iterator_ops.cc b/tensorflow/core/kernels/iterator_ops.cc
index 7f1560e9f7..a6d9ddd086 100644
--- a/tensorflow/core/kernels/iterator_ops.cc
+++ b/tensorflow/core/kernels/iterator_ops.cc
@@ -18,7 +18,10 @@ limitations under the License.
#include "tensorflow/core/framework/partial_tensor_shape.h"
#include "tensorflow/core/framework/resource_op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/strcat.h"
namespace tensorflow {
@@ -282,38 +285,54 @@ class OneShotIteratorOp : public OpKernel {
IteratorResource* iterator_resource_ = nullptr;
};
-class IteratorGetNextOp : public OpKernel {
+class IteratorGetNextOp : public AsyncOpKernel {
public:
- explicit IteratorGetNextOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
- // TODO(mrry): Convert this to an async op, because
- // `iterator->GetNext()` could trigger long-running operations
- // (e.g. a QueueDequeue or a remote read).
- void Compute(OpKernelContext* ctx) override {
+ explicit IteratorGetNextOp(OpKernelConstruction* ctx)
+ : AsyncOpKernel(ctx),
+ thread_pool_(new thread::ThreadPool(
+ ctx->env(), ThreadOptions(),
+ strings::StrCat("iterator_get_next_thread_",
+ SanitizeThreadSuffix(def().name())),
+ 1 /* num_threads */, false /* low_latency_hint */)) {}
+
+ void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
IteratorResource* iterator;
OP_REQUIRES_OK(ctx,
LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
- core::ScopedUnref unref_iterator(iterator);
-
- std::vector<Tensor> components;
- bool end_of_sequence;
-
- IteratorContext::Params params;
- params.env = ctx->env();
- params.step_id = ctx->step_id();
- params.resource_manager = ctx->resource_manager();
- params.runner = *(ctx->runner());
- IteratorContext iter_ctx(std::move(params));
- OP_REQUIRES_OK(ctx,
- iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
- OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence"));
+ // The call to `iterator->GetNext()` may block and depend on an
+ // inter-op thread pool thread, so we issue the call from the
+ // owned thread pool.
+ thread_pool_->Schedule([this, ctx, iterator, done]() {
+ core::ScopedUnref unref_iterator(iterator);
+
+ std::vector<Tensor> components;
+ bool end_of_sequence;
+
+ IteratorContext::Params params;
+ params.env = ctx->env();
+ params.step_id = ctx->step_id();
+ params.resource_manager = ctx->resource_manager();
+ params.runner = *(ctx->runner());
+ IteratorContext iter_ctx(std::move(params));
+
+ OP_REQUIRES_OK_ASYNC(
+ ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+ done);
+ OP_REQUIRES_ASYNC(ctx, !end_of_sequence,
+ errors::OutOfRange("End of sequence"), done);
+
+ for (int i = 0; i < components.size(); ++i) {
+ // TODO(mrry): Check that the shapes match the shape attrs.
+ ctx->set_output(i, components[i]);
+ }
- for (int i = 0; i < components.size(); ++i) {
- // TODO(mrry): Check that the shapes match the shape attrs.
- ctx->set_output(i, components[i]);
- }
+ done();
+ });
}
+
+ private:
+ std::unique_ptr<thread::ThreadPool> thread_pool_;
};
class IteratorDisposeOp : public OpKernel {
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index bd7556658a..2e1a62d8ba 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -36,6 +36,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/dynamic_annotations.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"
@@ -101,7 +102,7 @@ Status SetupFlowControlInputs(OpKernelContext* ctx, bool set_output) {
class TensorArrayCreationOp : public OpKernel {
public:
explicit TensorArrayCreationOp(OpKernelConstruction* context)
- : OpKernel(context) {}
+ : OpKernel(context), device_type_(context->device_type()) {}
void Compute(OpKernelContext* ctx) override {
Tensor tensor_array_output_handle;
@@ -133,6 +134,12 @@ class TensorArrayCreationOp : public OpKernel {
// Create the flow output.
Tensor* flow;
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, TensorShape({}), &flow));
+ if (device_type_ == DEVICE_CPU) {
+ // Value doesn't matter, but this makes msan not complain about
+ // copying an uninitialized value. To do this on GPU would require
+ // a kernel launch or a host->device memcpy, so we avoid that.
+ flow->flat<float>()(0) = 0;
+ }
}
}
@@ -140,6 +147,9 @@ class TensorArrayCreationOp : public OpKernel {
virtual Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm,
Tensor* tensor_array_output_handle,
TensorArray** output_tensor_array) = 0;
+
+ private:
+ const DeviceType device_type_;
};
// A per-run local tensor array. The tensor array uses a "per-step" resource
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index 5c2bda4770..334444a4a2 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -238,6 +238,33 @@ bool IsFullSlice(const TensorSlice& slice_spec,
}
}
+Status CorruptFileError(const Status& in_status, const string& filename,
+ const string& detail) {
+ if (in_status.ok()) {
+ return errors::Internal("Unable to read file (", filename,
+ "). Perhaps the file is corrupt or was produced by "
+ "a newer version of TensorFlow with format changes "
+ "(",
+ detail, ")");
+ }
+ return Status(
+ in_status.code(),
+ strings::StrCat("Unable to read file (", filename,
+ "). Perhaps the file is corrupt or was produced by a "
+ "newer version of TensorFlow with format changes (",
+ detail, "): ", in_status.error_message()));
+}
+
+table::Options TableBuilderOptions() {
+ table::Options o;
+ // Compressed tables cannot be read by TensorFlow releases prior to 1.1.
+ // To smoothen the transition, compressed writes are disabled for now
+ // (version 1.2) with the intention that they will be enabled again at
+ // some point (perhaps the 1.3 release?).
+ o.compression = table::kNoCompression;
+ return o;
+}
+
} // namespace
BundleWriter::BundleWriter(Env* env, StringPiece prefix)
@@ -425,7 +452,7 @@ static Status MergeOneBundle(Env* env, StringPiece prefix,
table::Table* table = nullptr;
TF_RETURN_IF_ERROR(
- table::Table::Open(table::Options(), file.get(), file_size, &table));
+ table::Table::Open(TableBuilderOptions(), file.get(), file_size, &table));
std::unique_ptr<table::Table> table_deleter(table);
std::unique_ptr<table::Iterator> iter(table->NewIterator());
@@ -433,11 +460,13 @@ static Status MergeOneBundle(Env* env, StringPiece prefix,
// Process header.
{
iter->Seek(kHeaderEntryKey);
- CHECK(iter->Valid()) << "File: " << filename
- << ", iterator status: " << iter->status();
+ if (!iter->Valid()) {
+ return CorruptFileError(iter->status(), filename,
+ "failed to seek to header entry");
+ }
BundleHeaderProto header;
- TF_CHECK_OK(ParseEntryProto(iter->key(), iter->value(), &header));
- CHECK_GE(header.num_shards(), 0);
+ Status s = ParseEntryProto(iter->key(), iter->value(), &header);
+ if (!s.ok()) return CorruptFileError(s, filename, "unable to parse header");
merge_state->num_shards += header.num_shards();
if (!merge_state->seen_first_bundle) {
@@ -536,7 +565,7 @@ Status MergeBundles(Env* env, gtl::ArraySlice<string> prefixes,
TF_RETURN_IF_ERROR(
env->NewWritableFile(MetaFilename(merged_prefix), &merged_metadata));
{
- table::TableBuilder builder(table::Options(), merged_metadata.get());
+ table::TableBuilder builder(TableBuilderOptions(), merged_metadata.get());
// Header entry.
BundleHeaderProto header;
header.set_num_shards(merge.num_shards);
@@ -584,10 +613,17 @@ BundleReader::BundleReader(Env* env, StringPiece prefix)
// Reads "num_shards_" from the first entry.
iter_->Seek(kHeaderEntryKey);
- CHECK(iter_->Valid()) << "File: " << filename
- << ", iterator status: " << iter_->status();
+ if (!iter_->Valid()) {
+ status_ = CorruptFileError(iter_->status(), filename,
+ "failed to seek to header entry");
+ return;
+ }
BundleHeaderProto header;
- TF_CHECK_OK(ParseEntryProto(iter_->key(), iter_->value(), &header));
+ status_ = ParseEntryProto(iter_->key(), iter_->value(), &header);
+ if (!status_.ok()) {
+ status_ = CorruptFileError(status_, filename, "unable to parse header");
+ return;
+ }
num_shards_ = header.num_shards();
if ((header.endianness() == BundleHeaderProto::BIG && port::kLittleEndian) ||
(header.endianness() == BundleHeaderProto::LITTLE &&
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index 0438d95bc4..dbbc2de811 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -18,15 +18,21 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-import math
+import itertools
+import threading
import numpy as np
+from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gradient_checker
from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
from tensorflow.python.ops.gen_array_ops import _broadcast_gradient_args
from tensorflow.python.platform import test
@@ -219,5 +225,50 @@ class BroadcastSimpleTest(test.TestCase):
self._compareGpu(x, y + 0.1, np.floor_divide, math_ops.floordiv)
-if __name__ == "__main__":
+class GpuMultiSessionMemoryTest(test_util.TensorFlowTestCase):
+ """Tests concurrent sessions executing on the same GPU."""
+
+ def _run_session(self, session, results):
+ n_iterations = 500
+ with session as s:
+ data = variables.Variable(1.0)
+ with ops.device('/gpu:0'):
+ random_seed.set_random_seed(1)
+ matrix1 = variables.Variable(
+ random_ops.truncated_normal([1024, 1]), name='matrix1')
+ matrix2 = variables.Variable(
+ random_ops.truncated_normal([1, 1024]), name='matrix2')
+ x1 = math_ops.multiply(data, matrix1, name='x1')
+ x3 = math_ops.matmul(x1, math_ops.matmul(matrix2, matrix1))
+ x4 = math_ops.matmul(array_ops.transpose(x3), x3, name='x4')
+ s.run(variables.global_variables_initializer())
+
+ for _ in xrange(n_iterations):
+ value = s.run(x4)
+ results.add(value.flat[0])
+ if len(results) != 1:
+ break
+
+ def testConcurrentSessions(self):
+ n_threads = 4
+ threads = []
+ results = []
+ for _ in xrange(n_threads):
+ session = self.test_session(graph=ops.Graph(), use_gpu=True)
+ results.append(set())
+ args = (session, results[-1])
+ threads.append(threading.Thread(target=self._run_session, args=args))
+
+ for thread in threads:
+ thread.start()
+ for thread in threads:
+ thread.join()
+
+ flat_results = set([x for x in itertools.chain(*results)])
+ self.assertEqual(1,
+ len(flat_results),
+ 'Expected single value, got %r' % flat_results)
+
+
+if __name__ == '__main__':
test.main()
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 5b0f318efe..9941c97c30 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -53,6 +53,16 @@ def _make_converter(tf_dtype):
class TensorArrayTest(test.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ super(TensorArrayTest, cls).setUpClass()
+ cls._workers, _ = test.create_local_cluster(num_workers=3, num_ps=0)
+
+ @classmethod
+ def tearDownClass(cls):
+ super(TensorArrayTest, cls).tearDownClass()
+ session_lib.Session.reset(cls._workers[0].target)
+
def testTensorArrayWriteRead(self):
with self.test_session(use_gpu=True) as session:
ta = tensor_array_ops.TensorArray(
@@ -1225,8 +1235,7 @@ class TensorArrayTest(test.TestCase):
ta = ta.split([1.0, 2.0], [1, 1])
flows.append(ta.flow)
- workers, _ = test.create_local_cluster(num_workers=3, num_ps=0)
- session = session_lib.Session(workers[0].target)
+ session = session_lib.Session(self._workers[0].target)
run_options = config_pb2.RunOptions(
trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -1250,13 +1259,12 @@ class TensorArrayTest(test.TestCase):
def _body(i, ta_i):
with ops.device("/job:worker/task:1/cpu:0"):
- return i + 1, ta_i.write(i, 0.0)
+ return i + 1, ta_i.write(i, constant_op.constant(0.0))
_, ta_out = control_flow_ops.while_loop(
lambda i, ta: i < 2, _body, loop_vars=[0, ta])
- workers, _ = test.create_local_cluster(num_workers=3, num_ps=0)
- session = session_lib.Session(workers[0].target)
+ session = session_lib.Session(self._workers[0].target)
run_options = config_pb2.RunOptions(
trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -1274,6 +1282,36 @@ class TensorArrayTest(test.TestCase):
self.assertFalse(
[s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+ def testTensorArrayDisabledColocateWithFirstWriteCall(self):
+ with ops.device("/job:worker/task:0/cpu:0"):
+ ta = tensor_array_ops.TensorArray(
+ dtype=dtypes.float32, size=2, colocate_with_first_write_call=False)
+
+ def _body(i, ta_i):
+ with ops.device("/job:worker/task:1/cpu:0"):
+ return i + 1, ta_i.write(i, constant_op.constant(0.0))
+
+ _, ta_out = control_flow_ops.while_loop(
+ lambda i, ta: i < 2, _body, loop_vars=[0, ta])
+
+ session = session_lib.Session(self._workers[0].target)
+
+ run_options = config_pb2.RunOptions(
+ trace_level=config_pb2.RunOptions.FULL_TRACE)
+ run_metadata = config_pb2.RunMetadata()
+
+ session.run(ta_out.flow, options=run_options, run_metadata=run_metadata)
+ self.assertTrue(run_metadata.HasField("step_stats"))
+ dev_stats = {d.device: list(d.node_stats)
+ for d in run_metadata.step_stats.dev_stats}
+ for d in dev_stats:
+ if "/task:0/" in d and "cpu" in d: # Skip any GPU node stats
+ self.assertTrue(
+ [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+ else:
+ self.assertFalse(
+ [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+
def testTensorArrayIdentity(self):
with self.test_session(use_gpu=True) as session:
ta0 = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2,
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 8410f12f3e..a37308f702 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -23,9 +23,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import collections
import copy
import functools
import re
+
from six.moves import xrange # pylint: disable=redefined-builtin
import numpy as np
import six
@@ -650,10 +652,10 @@ def _to_list(x):
return [x]
-def _add_elements_to_collection(elements, collections):
+def _add_elements_to_collection(elements, collection_list):
elements = _to_list(elements)
- collections = _to_list(collections)
- for name in collections:
+ collection_list = _to_list(collection_list)
+ for name in collection_list:
collection = ops.get_collection_ref(name)
collection_set = set(collection)
for element in elements:
@@ -666,6 +668,13 @@ def _object_list_uid(object_list):
return ', '.join([str(abs(id(x))) for x in object_list])
+# A global dictionary mapping graph objects to an index of counters used
+# for various layer names in each graph.
+# Allows giving unique autogenerated names to layers, in a graph-specific way.
+PER_GRAPH_LAYER_NAME_UIDS = collections.defaultdict(
+ lambda: collections.defaultdict(int))
+
+
def _unique_layer_name(name):
"""Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
@@ -684,14 +693,7 @@ def _unique_layer_name(name):
dense_2
```
"""
- layer_name_uids_collection = ops.get_collection('LAYER_NAME_UIDS')
- if not layer_name_uids_collection:
- layer_name_uids = {}
- ops.add_to_collection('LAYER_NAME_UIDS', layer_name_uids)
- else:
- layer_name_uids = layer_name_uids_collection[0]
- if name not in layer_name_uids:
- layer_name_uids[name] = 1
- else:
- layer_name_uids[name] += 1
+ graph = ops.get_default_graph()
+ layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS[graph]
+ layer_name_uids[name] += 1
return name + '_' + str(layer_name_uids[name])
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index b61168695a..49dcd2370c 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -149,39 +149,13 @@ class _Conv(base.Layer):
self.built = True
def call(self, inputs):
- if (self.data_format == 'channels_first' and
- not framework.test_util.gpu_device_name()):
- # `nn.convolution` is not implemented on CPU for `channels_first` format.
- # In cases where we are most likely running on CPU using `channels_first`,
- # we reshape the inputs to use `channels_last` (and reshape them back
- # afterwards). This is a temporary fix; a better solution would be a fix
- # at the op level.
- # TODO(chollet): remove this when `nn.convolution` is feature-complete.
- data_format = 'channels_last'
- if self.rank == 1:
- inputs = array_ops.transpose(inputs, (0, 2, 1))
- elif self.rank == 2:
- inputs = array_ops.transpose(inputs, (0, 2, 3, 1))
- elif self.rank == 3:
- inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
- else:
- data_format = self.data_format
outputs = nn.convolution(
input=inputs,
filter=self.kernel,
dilation_rate=self.dilation_rate,
strides=self.strides,
padding=self.padding.upper(),
- data_format=utils.convert_data_format(data_format,
- self.rank + 2))
- if (self.data_format == 'channels_first' and
- not framework.test_util.gpu_device_name()):
- if self.rank == 1:
- outputs = array_ops.transpose(outputs, (0, 2, 1))
- elif self.rank == 2:
- outputs = array_ops.transpose(outputs, (0, 3, 1, 2))
- elif self.rank == 3:
- outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
+ data_format=utils.convert_data_format(self.data_format, self.rank + 2))
if self.bias is not None:
if self.data_format == 'channels_first':
@@ -202,18 +176,10 @@ class _Conv(base.Layer):
[outputs_shape[0], outputs_shape[1],
outputs_shape[2] * outputs_shape[3],
outputs_shape[4]])
- outputs_4d = nn.bias_add(
- outputs_4d,
- self.bias,
- data_format=utils.convert_data_format(self.data_format, 4))
+ outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
outputs = array_ops.reshape(outputs_4d, outputs_shape)
else:
- outputs = nn.bias_add(
- outputs,
- self.bias,
- data_format=utils.convert_data_format(self.data_format, 4))
- # Note that we passed rank=4 because bias_add will only accept
- # NHWC and NCWH even if the rank of the inputs is 3 or 5.
+ outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
if self.activation is not None:
return self.activation(outputs)
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 6cd644b642..e903afa0a8 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -262,16 +262,7 @@ class _Pooling2D(base.Layer):
self.input_spec = base.InputSpec(ndim=4)
def call(self, inputs):
- if (self.data_format == 'channels_first' and
- not framework.test_util.gpu_device_name()):
- # `nn.convolution` is not implemented on CPU for `channels_first` format.
- # TODO(chollet): remove this when `nn.convolution` is feature-complete.
- data_format = 'channels_last'
- inputs = array_ops.transpose(inputs, (0, 2, 3, 1))
- else:
- data_format = self.data_format
-
- if data_format == 'channels_last':
+ if self.data_format == 'channels_last':
pool_shape = (1,) + self.pool_size + (1,)
strides = (1,) + self.strides + (1,)
else:
@@ -282,11 +273,7 @@ class _Pooling2D(base.Layer):
ksize=pool_shape,
strides=strides,
padding=self.padding.upper(),
- data_format=utils.convert_data_format(data_format, 4))
-
- if (self.data_format == 'channels_first' and
- not framework.test_util.gpu_device_name()):
- outputs = array_ops.transpose(outputs, (0, 3, 1, 2))
+ data_format=utils.convert_data_format(self.data_format, 4))
return outputs
def _compute_output_shape(self, input_shape):
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 96ace6e79b..c98e74fe6b 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -437,10 +437,14 @@ def _convert_tensorarray_to_flow(tensor_or_tensor_array):
def _make_tensor_array(ta, t_or_flow):
+ # pylint: disable=protected-access
new_ta = tensor_array_ops.TensorArray(
dtype=ta.dtype, handle=ta.handle, flow=t_or_flow,
- infer_shape=ta._infer_shape)
- new_ta._element_shape = ta._element_shape # pylint: disable=protected-access
+ infer_shape=ta._infer_shape,
+ colocate_with_first_write_call=ta._colocate_with_first_write_call)
+ new_ta._colocate_with = ta._colocate_with
+ new_ta._element_shape = ta._element_shape
+ # pylint: enable=protected-access
return new_ta
diff --git a/tensorflow/python/ops/tensor_array_grad.py b/tensorflow/python/ops/tensor_array_grad.py
index 0e7d1880ce..1f70d69548 100644
--- a/tensorflow/python/ops/tensor_array_grad.py
+++ b/tensorflow/python/ops/tensor_array_grad.py
@@ -99,9 +99,9 @@ def _TensorArrayReadGrad(op, grad):
flow = op.inputs[2]
dtype = op.get_attr("dtype")
grad_source = _GetGradSource(grad)
- g = tensor_array_ops.TensorArray(
- dtype=dtype, handle=handle, flow=flow).grad(
- source=grad_source, flow=flow)
+ g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+ colocate_with_first_write_call=False)
+ .grad(source=grad_source, flow=flow))
w_g = g.write(index, grad)
return [None, None, w_g.flow]
@@ -125,9 +125,9 @@ def _TensorArrayWriteGrad(op, flow):
index = op.inputs[1]
dtype = op.get_attr("T")
grad_source = _GetGradSource(flow)
- g = tensor_array_ops.TensorArray(
- dtype=dtype, handle=handle, flow=flow).grad(
- source=grad_source, flow=flow)
+ g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+ colocate_with_first_write_call=False)
+ .grad(source=grad_source, flow=flow))
grad = g.read(index)
return [None, None, grad, flow]
@@ -156,9 +156,9 @@ def _TensorArrayGatherGrad(op, grad):
flow = op.inputs[2]
dtype = op.get_attr("dtype")
grad_source = _GetGradSource(grad)
- g = tensor_array_ops.TensorArray(
- dtype=dtype, handle=handle, flow=flow).grad(
- source=grad_source, flow=flow)
+ g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+ colocate_with_first_write_call=False)
+ .grad(source=grad_source, flow=flow))
u_g = g.scatter(indices, grad)
return [None, None, u_g.flow]
@@ -180,9 +180,9 @@ def _TensorArrayScatterGrad(op, flow):
indices = op.inputs[1]
dtype = op.get_attr("T")
grad_source = _GetGradSource(flow)
- g = tensor_array_ops.TensorArray(
- dtype=dtype, handle=handle, flow=flow).grad(
- source=grad_source, flow=flow)
+ g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+ colocate_with_first_write_call=False)
+ .grad(source=grad_source, flow=flow))
grad = g.gather(indices)
return [None, None, grad, flow]
@@ -211,9 +211,9 @@ def _TensorArrayConcatGrad(op, grad, unused_lengths_grad):
lengths = op.outputs[1]
dtype = op.get_attr("dtype")
grad_source = _GetGradSource(grad)
- g = tensor_array_ops.TensorArray(
- dtype=dtype, handle=handle, flow=flow).grad(
- source=grad_source, flow=flow)
+ g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+ colocate_with_first_write_call=False)
+ .grad(source=grad_source, flow=flow))
u_g = g.split(grad, lengths=lengths)
# handle, flow_in
return [None, u_g.flow]
@@ -235,9 +235,9 @@ def _TensorArraySplitGrad(op, flow):
handle = op.inputs[0]
dtype = op.get_attr("T")
grad_source = _GetGradSource(flow)
- g = tensor_array_ops.TensorArray(
- dtype=dtype, handle=handle, flow=flow).grad(
- source=grad_source, flow=flow)
+ g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow,
+ colocate_with_first_write_call=False)
+ .grad(source=grad_source, flow=flow))
grad = g.concat()
# handle, value, lengths, flow_in
return [None, grad, None, flow]
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index 8b119f5842..7a6abc8e61 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -54,6 +54,7 @@ class TensorArray(object):
flow=None,
infer_shape=True,
element_shape=None,
+ colocate_with_first_write_call=True,
name=None):
"""Construct a new TensorArray or wrap an existing TensorArray handle.
@@ -85,6 +86,11 @@ class TensorArray(object):
element_shape: (optional, default: None) A `TensorShape` object specifying
the shape constraints of each of the elements of the TensorArray.
Need not be fully defined.
+ colocate_with_first_write_call: If `True`, the TensorArray will be
+ colocated on the same device as the Tensor used on its first write
+ (write operations include `write`, `unstack`, and `split`). If `False`,
+ the TensorArray will be placed on the device determined by the
+ device context available during its initialization.
name: A name for the operation (optional).
Raises:
@@ -120,7 +126,11 @@ class TensorArray(object):
# Used to keep track of what tensors the TensorArray should be
# colocated with. We choose to colocate the TensorArray with the
# first tensor written to it.
- self._colocate_with = []
+ self._colocate_with_first_write_call = colocate_with_first_write_call
+ if colocate_with_first_write_call:
+ self._colocate_with = []
+ else:
+ self._colocate_with = None
# Record the current static shape for the array elements. The element
# shape is defined either by `element_shape` or the shape of the tensor
@@ -142,8 +152,8 @@ class TensorArray(object):
# Construct the TensorArray with an empty device. The first
# write into the TensorArray from a Tensor with a set device
# will retroactively set the device value of this op.
- with ops.device(None), ops.colocate_with(None, ignore_existing=True):
- self._handle, self._flow = gen_data_flow_ops._tensor_array_v3(
+ def create():
+ return gen_data_flow_ops._tensor_array_v3(
dtype=dtype,
size=size,
element_shape=element_shape,
@@ -151,6 +161,11 @@ class TensorArray(object):
clear_after_read=clear_after_read,
tensor_array_name=tensor_array_name,
name=scope)
+ if colocate_with_first_write_call:
+ with ops.device(None), ops.colocate_with(None, ignore_existing=True):
+ self._handle, self._flow = create()
+ else:
+ self._handle, self._flow = create()
@property
def flow(self):
@@ -200,10 +215,13 @@ class TensorArray(object):
If no internal colocation group is set, colocate with `value` and set
the internal colocation group to be value.
"""
- if not self._colocate_with:
- self._colocate_with.append(value)
- with ops.colocate_with(self._colocate_with[0]):
+ if not self._colocate_with_first_write_call:
yield
+ else:
+ if not self._colocate_with:
+ self._colocate_with.append(value)
+ with ops.colocate_with(self._colocate_with[0]):
+ yield
def identity(self):
"""Returns a TensorArray with the same content and properties.
@@ -214,8 +232,10 @@ class TensorArray(object):
Use this object all for subsequent operations.
"""
flow = array_ops.identity(self._flow)
- ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow,
- infer_shape=self._infer_shape)
+ ta = TensorArray(
+ dtype=self._dtype, handle=self._handle, flow=flow,
+ infer_shape=self._infer_shape,
+ colocate_with_first_write_call=self._colocate_with_first_write_call)
ta._element_shape = self._element_shape
ta._colocate_with = self._colocate_with
return ta
@@ -237,7 +257,8 @@ class TensorArray(object):
dtype=self._dtype,
handle=g_handle,
flow=flow,
- infer_shape=self._infer_shape)
+ infer_shape=self._infer_shape,
+ colocate_with_first_write_call=False)
g._element_shape = self._element_shape
return g
@@ -286,7 +307,9 @@ class TensorArray(object):
value=value,
flow_in=self._flow,
name=name)
- ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out)
+ ta = TensorArray(
+ dtype=self._dtype, handle=self._handle, flow=flow_out,
+ colocate_with_first_write_call=self._colocate_with_first_write_call)
ta._infer_shape = self._infer_shape
ta._element_shape = self._element_shape
ta._colocate_with = self._colocate_with
@@ -416,7 +439,9 @@ class TensorArray(object):
value=value,
flow_in=self._flow,
name=name)
- ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out)
+ ta = TensorArray(
+ dtype=self._dtype, handle=self._handle, flow=flow_out,
+ colocate_with_first_write_call=self._colocate_with_first_write_call)
ta._infer_shape = self._infer_shape
ta._element_shape = self._element_shape
ta._colocate_with = self._colocate_with
@@ -456,7 +481,9 @@ class TensorArray(object):
lengths=lengths_64,
flow_in=self._flow,
name=name)
- ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out)
+ ta = TensorArray(
+ dtype=self._dtype, handle=self._handle, flow=flow_out,
+ colocate_with_first_write_call=self._colocate_with_first_write_call)
ta._infer_shape = self._infer_shape
ta._element_shape = self._element_shape
ta._colocate_with = self._colocate_with
diff --git a/tensorflow/python/training/queue_runner_impl.py b/tensorflow/python/training/queue_runner_impl.py
index d713e222ae..4e58602a6f 100644
--- a/tensorflow/python/training/queue_runner_impl.py
+++ b/tensorflow/python/training/queue_runner_impl.py
@@ -22,6 +22,7 @@ import threading
import weakref
from tensorflow.core.protobuf import queue_runner_pb2
+from tensorflow.python.client import session
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.platform import tf_logging as logging
@@ -401,6 +402,10 @@ def start_queue_runners(sess=None, coord=None, daemon=True, start=True,
collection: A `GraphKey` specifying the graph collection to
get the queue runners from. Defaults to `GraphKeys.QUEUE_RUNNERS`.
+ Raises:
+ ValueError: if `sess` is None and there isn't any default session.
+ TypeError: if `sess` is not a `tf.Session` object.
+
Returns:
A list of threads.
"""
@@ -410,6 +415,15 @@ def start_queue_runners(sess=None, coord=None, daemon=True, start=True,
raise ValueError("Cannot start queue runners: No default session is "
"registered. Use `with sess.as_default()` or pass an "
"explicit session to tf.start_queue_runners(sess=sess)")
+
+ if not isinstance(sess, session.SessionInterface):
+ # Following check is due to backward compatibility. (b/62061352)
+ if sess.__class__.__name__ in [
+ "MonitoredSession", "SingularMonitoredSession"]:
+ return []
+ raise TypeError("sess must be a `tf.Session` object. "
+ "Given class: {}".format(sess.__class__))
+
with sess.graph.as_default():
threads = []
for qr in ops.get_collection(collection):
diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py
index 5b00ac9fc3..51c0eecf46 100644
--- a/tensorflow/python/training/queue_runner_test.py
+++ b/tensorflow/python/training/queue_runner_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.training import coordinator
+from tensorflow.python.training import monitored_session
from tensorflow.python.training import queue_runner_impl
@@ -247,6 +248,33 @@ class QueueRunnerTest(test.TestCase):
# The variable should be 3.
self.assertEqual(3, var.eval())
+ def testStartQueueRunnersRaisesIfNotASession(self):
+ zero64 = constant_op.constant(0, dtype=dtypes.int64)
+ var = variables.Variable(zero64)
+ count_up_to = var.count_up_to(3)
+ queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
+ init_op = variables.global_variables_initializer()
+ qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
+ queue_runner_impl.add_queue_runner(qr)
+ with self.test_session():
+ init_op.run()
+ with self.assertRaisesRegexp(TypeError, "tf.Session"):
+ queue_runner_impl.start_queue_runners("NotASession")
+
+ def testStartQueueRunnersIgnoresMonitoredSession(self):
+ zero64 = constant_op.constant(0, dtype=dtypes.int64)
+ var = variables.Variable(zero64)
+ count_up_to = var.count_up_to(3)
+ queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
+ init_op = variables.global_variables_initializer()
+ qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
+ queue_runner_impl.add_queue_runner(qr)
+ with self.test_session():
+ init_op.run()
+ threads = queue_runner_impl.start_queue_runners(
+ monitored_session.MonitoredSession())
+ self.assertFalse(threads)
+
def testStartQueueRunnersNonDefaultGraph(self):
# CountUpTo will raise OUT_OF_RANGE when it reaches the count.
graph = ops.Graph()
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index 88df3351e6..05c99856d2 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -17,14 +17,52 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import collections
import functools
+import itertools
import traceback
import types
+import six # pylint: disable=unused-import
+
+from backports import weakref # pylint: disable=g-bad-import-order
+
from tensorflow.python.platform import tf_logging
from tensorflow.python.util import tf_decorator
+class _RefInfoField(
+ collections.namedtuple(
+ '_RefInfoField', ('type_', 'repr_', 'creation_stack', 'object_used'))):
+ pass
+
+
+# Thread-safe up to int32max/2 thanks to python's GIL; and may be safe even for
+# higher values in Python 3.4+. We don't expect to ever count higher than this.
+# https://mail.python.org/pipermail/python-list/2005-April/342279.html
+_REF_ITER = itertools.count()
+
+# Dictionary mapping id(obj) => _RefInfoField.
+_REF_INFO = {}
+
+
+def _deleted(obj_id, fatal_error):
+ obj = _REF_INFO[obj_id]
+ del _REF_INFO[obj_id]
+ if not obj.object_used:
+ if fatal_error:
+ logger = tf_logging.fatal
+ else:
+ logger = tf_logging.error
+ logger(
+ '==================================\n'
+ 'Object was never used (type %s):\n%s\nIf you want to mark it as '
+ 'used call its "mark_used()" method.\nIt was originally created '
+ 'here:\n%s\n'
+ '==================================' %
+ (obj.type_, obj.repr_, obj.creation_stack))
+
+
def _add_should_use_warning(x, fatal_error=False):
"""Wraps object x so that if it is never used, a warning is logged.
@@ -39,14 +77,14 @@ def _add_should_use_warning(x, fatal_error=False):
"""
if x is None: # special corner case where x is None
return x
- has_been_used = getattr(x, '_tf_object_has_been_used', None)
- if has_been_used is not None:
- x._tf_object_has_been_used = has_been_used # pylint: disable=protected-access
+ if hasattr(x, '_tf_ref_id'): # this is already a TFShouldUseWarningWrapper
return x
def override_method(method):
def fn(self, *args, **kwargs):
- self._tf_object_has_been_used = True # pylint: disable=protected-access
+ # pylint: disable=protected-access
+ _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+ object_used=True)
return method(self, *args, **kwargs)
return fn
@@ -55,38 +93,36 @@ def _add_should_use_warning(x, fatal_error=False):
def __init__(self, true_self):
self.__dict__ = true_self.__dict__
- stack = [x.strip() for x in traceback.format_stack()]
+ stack = [s.strip() for s in traceback.format_stack()]
# Remove top three stack entries from adding the wrapper
- self._tf_object_creation_stack = '\n'.join(stack[:-3])
- self._tf_object_has_been_used = False
+ self.creation_stack = '\n'.join(stack[:-3])
+ self._tf_ref_id = next(_REF_ITER)
+ _REF_INFO[self._tf_ref_id] = _RefInfoField(
+ type_=type(x),
+ repr_=repr(x),
+ creation_stack=stack,
+ object_used=False)
+
+ # Create a finalizer for self, which will be called when self is
+ # garbage collected. Can't add self as the args because the
+ # loop will break garbage collection. We keep track of
+ # ourselves via python ids.
+ weakref.finalize(self, _deleted, self._tf_ref_id, fatal_error)
# Not sure why this pylint warning is being used; this is not an
# old class form.
# pylint: disable=super-on-old-class
def __getattribute__(self, name):
- if name != '_tf_object_has_been_used':
- self._tf_object_has_been_used = True
+ if name == '_tf_ref_id':
+ return super(TFShouldUseWarningWrapper, self).__getattribute__(name)
+ if self._tf_ref_id in _REF_INFO:
+ _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+ object_used=True)
return super(TFShouldUseWarningWrapper, self).__getattribute__(name)
- def __del__(self):
- if not self._tf_object_has_been_used:
- if fatal_error:
- logger = tf_logging.fatal
- else:
- logger = tf_logging.error
- logger(
- '==================================\n'
- 'Object was never used (type %s):\n%s\nIf you want to mark it as '
- 'used call its "mark_used()" method.\nIt was originally created '
- 'here:\n%s\n'
- '==================================' %
- (type(x), x, self._tf_object_creation_stack))
-
- if hasattr(super(TFShouldUseWarningWrapper, self), '__del__'):
- return super(TFShouldUseWarningWrapper, self).__del__()
-
def mark_used(self, *args, **kwargs):
- self._tf_object_has_been_used = True
+ _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+ object_used=True)
if hasattr(super(TFShouldUseWarningWrapper, self), 'mark_used'):
return super(TFShouldUseWarningWrapper, self).mark_used(*args, **kwargs)
# pylint: enable=super-on-old-class
@@ -102,7 +138,8 @@ def _add_should_use_warning(x, fatal_error=False):
wrapped = TFShouldUseWarningWrapper(x)
wrapped.__doc__ = x.__doc__ # functools.wraps fails on some objects.
- wrapped._tf_object_has_been_used = False # pylint: disable=protected-access
+ ref_id = wrapped._tf_ref_id # pylint: disable=protected-access
+ _REF_INFO[ref_id] = _REF_INFO[ref_id]._replace(object_used=False)
return wrapped
diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py
index 71d48e3dde..c826874400 100644
--- a/tensorflow/python/util/tf_should_use_test.py
+++ b/tensorflow/python/util/tf_should_use_test.py
@@ -20,6 +20,7 @@ from __future__ import division
from __future__ import print_function
import contextlib
+import gc
import sys
from tensorflow.python.framework import constant_op
@@ -45,7 +46,7 @@ def reroute_error(captured):
class TfShouldUseTest(test.TestCase):
def testAddShouldUseWarningWhenNotUsed(self):
- c = constant_op.constant(0, name='blah')
+ c = constant_op.constant(0, name='blah0')
captured = []
with reroute_error(captured):
def in_this_function():
@@ -53,44 +54,52 @@ class TfShouldUseTest(test.TestCase):
del h
in_this_function()
self.assertIn('Object was never used', '\n'.join(captured))
- self.assertIn('blah:0', '\n'.join(captured))
+ self.assertIn('blah0:0', '\n'.join(captured))
self.assertIn('in_this_function', '\n'.join(captured))
+ gc.collect()
+ self.assertFalse(gc.garbage)
- def _testAddShouldUseWarningWhenUsed(self, fn):
- c = constant_op.constant(0, name='blah')
+ def _testAddShouldUseWarningWhenUsed(self, fn, name):
+ c = constant_op.constant(0, name=name)
captured = []
with reroute_error(captured):
h = tf_should_use._add_should_use_warning(c)
fn(h)
del h
self.assertNotIn('Object was never used', '\n'.join(captured))
- self.assertNotIn('blah:0', '\n'.join(captured))
+ self.assertNotIn('%s:0' % name, '\n'.join(captured))
def testAddShouldUseWarningWhenUsedWithAdd(self):
def add(h):
_ = h + 1
- self._testAddShouldUseWarningWhenUsed(add)
+ self._testAddShouldUseWarningWhenUsed(add, name='blah_add')
+ gc.collect()
+ self.assertFalse(gc.garbage)
def testAddShouldUseWarningWhenUsedWithGetName(self):
def get_name(h):
_ = h.name
- self._testAddShouldUseWarningWhenUsed(get_name)
+ self._testAddShouldUseWarningWhenUsed(get_name, name='blah_get_name')
+ gc.collect()
+ self.assertFalse(gc.garbage)
def testShouldUseResult(self):
@tf_should_use.should_use_result
def return_const(value):
- return constant_op.constant(value, name='blah')
+ return constant_op.constant(value, name='blah2')
captured = []
with reroute_error(captured):
return_const(0.0)
self.assertIn('Object was never used', '\n'.join(captured))
- self.assertIn('blah:0', '\n'.join(captured))
+ self.assertIn('blah2:0', '\n'.join(captured))
self.assertIn('return_const', '\n'.join(captured))
+ gc.collect()
+ self.assertFalse(gc.garbage)
def testShouldUseResultWhenNotReallyUsed(self):
@tf_should_use.should_use_result
def return_const(value):
- return constant_op.constant(value, name='blah')
+ return constant_op.constant(value, name='blah3')
captured = []
with reroute_error(captured):
with self.test_session():
@@ -100,8 +109,10 @@ class TfShouldUseTest(test.TestCase):
v = constant_op.constant(1.0, name='meh')
v.eval()
self.assertIn('Object was never used', '\n'.join(captured))
- self.assertIn('blah:0', '\n'.join(captured))
+ self.assertIn('blah3:0', '\n'.join(captured))
self.assertIn('return_const', '\n'.join(captured))
+ gc.collect()
+ self.assertFalse(gc.garbage)
if __name__ == '__main__':
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt
index a0fad4df52..ed088c41ed 100644
--- a/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt
@@ -16,7 +16,7 @@ tf_class {
}
member_method {
name: "__init__"
- argspec: "args=[\'self\', \'dtype\', \'size\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'handle\', \'flow\', \'infer_shape\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+ argspec: "args=[\'self\', \'dtype\', \'size\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'handle\', \'flow\', \'infer_shape\', \'element_shape\', \'colocate_with_first_write_call\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'True\', \'None\'], "
}
member_method {
name: "close"
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index b8f9fc8453..8768852dc7 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -85,3 +85,6 @@ pip2 install mock
pip2 install portpicker
pip3 install portpicker
+
+pip2 install backports.weakref==1.0rc1
+pip3 install backports.weakref==1.0rc1
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index e7e2d256cd..edfc4e3a98 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -89,3 +89,6 @@ pip3.5 install wheel==0.29.0
pip3.5 install portpicker
pip3.5 install werkzeug
+
+pip3.5 install backports.weakref==1.0rc1
+
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
index f124012edc..b4f9cc8476 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
@@ -22,12 +22,14 @@ CALL "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat"
:: Turn echo back on, above script turns it off.
ECHO ON
-:: Some common variables to be shared between runs.
-SET CMAKE_EXE="C:\Program Files\cmake\bin\cmake.exe"
-SET SWIG_EXE="C:\swigwin-3.0.10\swig.exe"
-SET PY_EXE="C:\Program Files\Anaconda3\python.exe"
-SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib"
-SET CUDNN_HOME="c:\tools\cuda"
+:: Set environment variables to be shared between runs. Do not override if they
+:: are set already.
+
+IF DEFINED CMAKE_EXE (ECHO CMAKE_EXE is set to %CMAKE_EXE%) ELSE (SET CMAKE_EXE="C:\Program Files\cmake\bin\cmake.exe")
+IF DEFINED SWIG_EXE (ECHO SWIG_EXE is set to %SWIG_EXE%) ELSE (SET SWIG_EXE="C:\swigwin-3.0.10\swig.exe")
+IF DEFINED PY_EXE (ECHO PY_EXE is set to %PY_EXE%) ELSE (SET PY_EXE="C:\Program Files\Anaconda3\python.exe")
+IF DEFINED PY_LIB (ECHO PY_LIB is set to %PY_LIB%) ELSE (SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib")
+IF DEFINED CUDNN_HOME (ECHO CUDNN_HOME is set to %CUDNN_HOME%) ELSE (SET CUDNN_HOME="c:\tools\cuda")
SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
index 9307ebb66b..ba2d939b5f 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -22,7 +22,7 @@ CD %BUILD_DIR%
SET BUILD_CC_TESTS=OFF
SET BUILD_PYTHON_TESTS=ON
-SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe"
+IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe")
:: Run the CMAKE build to build the pip package.
CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index a85a220270..a1676203c7 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -39,6 +39,7 @@ REQUIRED_PACKAGES = [
'html5lib == 0.9999999', # identical to 1.0b8
'markdown == 2.2.0',
'bleach == 1.5.0',
+ 'backports.weakref == 1.0rc1',
]
project_name = 'tensorflow'