diff options
41 files changed, 921 insertions, 370 deletions
@@ -38,8 +38,8 @@ People who are a little more adventurous can also try our nightly binaries: * Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) * Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/)) * Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/)) -* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/)) -* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/)) +* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc1-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/)) +* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc1-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/)) * Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/)) diff --git a/RELEASE.md b/RELEASE.md index 5b35f91477..64d9069e61 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,7 @@ # Release 1.2.0 ## Major Features and Improvements +* Python 3.6 support on Windows. * Added `tf.layers.conv3d_transpose` layer for spatio temporal deconvolution. * Added `tf.Session.make_callable()`, which provides a lower overhead means of running a similar step multiple times. * Added ibverbs-based RDMA support to contrib (courtesy @junshi15 from Yahoo). @@ -48,6 +49,15 @@ be replaced by calling `embedding_lookup` or `layers.dense` as pre- or post- processing of the rnn. For RNN decoding, this functionality has been replaced with an alternative API in `tf.contrib.seq2seq`. +* Intel MKL Integration (https://software.intel.com/en-us/articles/tensorflow-optimizations-on-modern-intel-architecture). Intel developed a number of + optimized deep learning primitives: In addition to matrix multiplication and + convolution, these building blocks include: + Direct batched convolution + Pooling: maximum, minimum, average + Normalization: LRN, batch normalization + Activation: rectified linear unit (ReLU) + Data manipulation: multi-dimensional transposition (conversion), split, + concat, sum and scale. ## Deprecations diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py index 2f7f8ebbae..68cd3623c0 100644 --- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py @@ -150,7 +150,8 @@ class MapDatasetTest(test.TestCase): results.append(sess.run(get_next)) except errors.OutOfRangeError: return - threads = [self.checkedThread(target=iterator_thread) for _ in range(8)] + threads = [self.checkedThread(target=iterator_thread) + for _ in range(64)] for t in threads: t.start() for t in threads: diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py index 905ef13e14..ed2b251b31 100644 --- a/tensorflow/contrib/keras/python/keras/backend.py +++ b/tensorflow/contrib/keras/python/keras/backend.py @@ -33,6 +33,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes as dtypes_module from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.layers import base as tf_base_layers from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import control_flow_ops @@ -261,16 +262,9 @@ def get_uid(prefix=''): 2 ``` """ - layer_name_uids_collection = ops.get_collection('LAYER_NAME_UIDS') - if not layer_name_uids_collection: - layer_name_uids = {} - ops.add_to_collection('LAYER_NAME_UIDS', layer_name_uids) - else: - layer_name_uids = layer_name_uids_collection[0] - if prefix not in layer_name_uids: - layer_name_uids[prefix] = 1 - else: - layer_name_uids[prefix] += 1 + graph = ops.get_default_graph() + layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph] + layer_name_uids[prefix] += 1 return layer_name_uids[prefix] diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py index 3b7f31a3e9..131637f03f 100644 --- a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py +++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py @@ -113,17 +113,20 @@ class Conv2DTest(test.TestCase): if padding == 'same' and strides != (1, 1): continue - with self.test_session(): - testing_utils.layer_test( - keras.layers.Conv2D, - kwargs={ - 'filters': filters, - 'kernel_size': kernel_size, - 'padding': padding, - 'strides': strides, - 'data_format': 'channels_first' - }, - input_shape=(num_samples, stack_size, num_row, num_col)) + with self.test_session(use_gpu=True): + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. + if test.is_gpu_available(cuda_only=True): + testing_utils.layer_test( + keras.layers.Conv2D, + kwargs={ + 'filters': filters, + 'kernel_size': kernel_size, + 'padding': padding, + 'strides': strides, + 'data_format': 'channels_first' + }, + input_shape=(num_samples, stack_size, num_row, num_col)) def test_convolution_2d_regularization(self): # regularizers diff --git a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py index 76eaf50138..6808348414 100644 --- a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py +++ b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py @@ -100,15 +100,18 @@ class Pooling2DTest(test.TestCase): 'padding': 'valid', 'pool_size': (3, 3)}, input_shape=(3, 5, 6, 4)) - testing_utils.layer_test( - keras.layers.AveragePooling2D, - kwargs={ - 'strides': (1, 1), - 'padding': 'valid', - 'pool_size': (2, 2), - 'data_format': 'channels_first' - }, - input_shape=(3, 4, 5, 6)) + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. + if test.is_gpu_available(cuda_only=True): + testing_utils.layer_test( + keras.layers.AveragePooling2D, + kwargs={ + 'strides': (1, 1), + 'padding': 'valid', + 'pool_size': (2, 2), + 'data_format': 'channels_first' + }, + input_shape=(3, 4, 5, 6)) class Pooling3DTest(test.TestCase): diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD index 03af377149..c7cae56049 100644 --- a/tensorflow/contrib/layers/BUILD +++ b/tensorflow/contrib/layers/BUILD @@ -237,6 +237,7 @@ py_test( "//tensorflow/python:training", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", + "//tensorflow/python/feature_column", "//third_party/py/numpy", ], ) diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py index e1a27335ab..68159fe9b9 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column.py +++ b/tensorflow/contrib/layers/python/layers/feature_column.py @@ -476,7 +476,7 @@ class _SparseColumn( return self._do_transform(input_tensor) @property - def _parse_example_config(self): + def _parse_example_spec(self): return self.config @property @@ -802,7 +802,7 @@ class _WeightedSparseColumn( inputs.get(self.sparse_id_column), inputs.get(self.weight_column_name)) @property - def _parse_example_config(self): + def _parse_example_spec(self): return self.config @property @@ -960,7 +960,7 @@ class _OneHotColumn( return self._to_dnn_input_layer(inputs.get(self.sparse_id_column)) @property - def _parse_example_config(self): + def _parse_example_spec(self): return self.config @@ -1110,7 +1110,7 @@ class _EmbeddingColumn( return inputs.get(self.sparse_id_column) @property - def _parse_example_config(self): + def _parse_example_spec(self): return self.config @@ -1467,7 +1467,7 @@ class _ScatteredEmbeddingColumn( return inputs.get(self.column_name) @property - def _parse_example_config(self): + def _parse_example_spec(self): return self.config @@ -1817,7 +1817,7 @@ class _RealValuedColumn( self._normalized_input_tensor(inputs.get(self.name))) @property - def _parse_example_config(self): + def _parse_example_spec(self): return self.config @@ -2081,7 +2081,7 @@ class _BucketizedColumn( _LazyBuilderByColumnsToTensor(columns_to_tensors)) @property - def _parse_example_config(self): + def _parse_example_spec(self): return self.config @property @@ -2311,7 +2311,7 @@ class _CrossedColumn( _LazyBuilderByColumnsToTensor(columns_to_tensors)) @property - def _parse_example_config(self): + def _parse_example_spec(self): return self.config @property diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py index b6a8b6bdda..ce3bc23cf6 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column_test.py +++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py @@ -27,6 +27,7 @@ import numpy as np from tensorflow.contrib.layers.python.layers import feature_column as fc from tensorflow.contrib.layers.python.layers import feature_column_ops +from tensorflow.python.feature_column import feature_column as fc_core from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib @@ -697,11 +698,6 @@ class FeatureColumnTest(test.TestCase): "str_id_weights_column") real_valued_col1 = fc.real_valued_column("real_valued_column1") real_valued_col2 = fc.real_valued_column("real_valued_column2", 5) - real_valued_col3 = fc._real_valued_var_len_column( - "real_valued_column3", is_sparse=True) - real_valued_col4 = fc._real_valued_var_len_column( - "real_valued_column4", dtype=dtypes.int64, default_value=0, - is_sparse=False) bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) bucketized_col2 = fc.bucketized_column( @@ -717,8 +713,8 @@ class FeatureColumnTest(test.TestCase): feature_columns = set([ sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col, int64_sparse_id_col, real_valued_col1, real_valued_col2, - real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2, - cross_col, one_hot_col, scattered_embedding_col + bucketized_col1, bucketized_col2, cross_col, one_hot_col, + scattered_embedding_col ]) expected_config = { "sparse_column": @@ -739,11 +735,6 @@ class FeatureColumnTest(test.TestCase): "real_valued_column2": parsing_ops.FixedLenFeature( [5], dtype=dtypes.float32), - "real_valued_column3": - parsing_ops.VarLenFeature(dtype=dtypes.float32), - "real_valued_column4": - parsing_ops.FixedLenSequenceFeature( - [], dtype=dtypes.int64, allow_missing=True, default_value=0), "real_valued_column_for_bucketization1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), @@ -763,6 +754,10 @@ class FeatureColumnTest(test.TestCase): config = fc.create_feature_spec_for_parsing(feature_columns) self.assertDictEqual(expected_config, config) + # Tests that contrib feature columns work with core library: + config_core = fc_core.make_parse_example_spec(feature_columns) + self.assertDictEqual(expected_config, config_core) + # Test that the same config is parsed out if we pass a dictionary. feature_columns_dict = { str(i): val @@ -771,6 +766,23 @@ class FeatureColumnTest(test.TestCase): config = fc.create_feature_spec_for_parsing(feature_columns_dict) self.assertDictEqual(expected_config, config) + def testCreateFeatureSpec_ExperimentalColumns(self): + real_valued_col0 = fc._real_valued_var_len_column( + "real_valued_column0", is_sparse=True) + real_valued_col1 = fc._real_valued_var_len_column( + "real_valued_column1", dtype=dtypes.int64, default_value=0, + is_sparse=False) + feature_columns = set([real_valued_col0, real_valued_col1]) + expected_config = { + "real_valued_column0": parsing_ops.VarLenFeature(dtype=dtypes.float32), + "real_valued_column1": + parsing_ops.FixedLenSequenceFeature( + [], dtype=dtypes.int64, allow_missing=True, default_value=0), + } + + config = fc.create_feature_spec_for_parsing(feature_columns) + self.assertDictEqual(expected_config, config) + def testCreateFeatureSpec_RealValuedColumnWithDefaultValue(self): real_valued_col1 = fc.real_valued_column( "real_valued_column1", default_value=2) diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py index dc159b93a3..d36d7e16de 100644 --- a/tensorflow/contrib/seq2seq/__init__.py +++ b/tensorflow/contrib/seq2seq/__init__.py @@ -47,6 +47,7 @@ _allowed_symbols = [ "FinalBeamSearchDecoderOutput", "gather_tree", "GreedyEmbeddingHelper", + "SampleEmbeddingHelper", "ScheduledEmbeddingTrainingHelper", "ScheduledOutputTrainingHelper", "TrainingHelper", diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py index ea34333360..99e51589c9 100644 --- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py +++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py @@ -29,6 +29,7 @@ from tensorflow.contrib.seq2seq.python.ops import attention_wrapper as wrapper from tensorflow.contrib.seq2seq.python.ops import helper as helper_py from tensorflow.contrib.seq2seq.python.ops import basic_decoder from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import rnn_cell from tensorflow.python.ops import variables @@ -64,7 +65,8 @@ class AttentionWrapperTest(test.TestCase): def assertAllCloseOrEqual(self, x, y, **kwargs): if isinstance(x, np.ndarray) or isinstance(x, float): - return super(AttentionWrapperTest, self).assertAllClose(x, y, **kwargs) + return super(AttentionWrapperTest, self).assertAllClose( + x, y, atol=1e-4, **kwargs) else: self.assertAllEqual(x, y, **kwargs) @@ -84,7 +86,7 @@ class AttentionWrapperTest(test.TestCase): expected_final_alignment_history=None, attention_layer_size=6, name=''): - encoder_sequence_length = [3, 2, 3, 1, 0] + encoder_sequence_length = [3, 2, 3, 1, 1] decoder_sequence_length = [2, 0, 1, 2, 3] batch_size = 5 encoder_max_time = 8 @@ -98,10 +100,14 @@ class AttentionWrapperTest(test.TestCase): else: attention_depth = encoder_output_depth - decoder_inputs = np.random.randn(batch_size, decoder_max_time, - input_depth).astype(np.float32) - encoder_outputs = np.random.randn(batch_size, encoder_max_time, - encoder_output_depth).astype(np.float32) + decoder_inputs = array_ops.placeholder_with_default( + np.random.randn(batch_size, decoder_max_time, + input_depth).astype(np.float32), + shape=(None, None, input_depth)) + encoder_outputs = array_ops.placeholder_with_default( + np.random.randn(batch_size, encoder_max_time, + encoder_output_depth).astype(np.float32), + shape=(None, None, encoder_output_depth)) attention_mechanism = create_attention_mechanism( num_units=attention_mechanism_depth, @@ -152,7 +158,7 @@ class AttentionWrapperTest(test.TestCase): # Remove the history from final_state for purposes of the # remainder of the tests. final_state = final_state._replace(alignment_history=()) # pylint: disable=protected-access - self.assertEqual((None, batch_size, encoder_max_time), + self.assertEqual((None, batch_size, None), tuple(state_alignment_history.get_shape().as_list())) else: state_alignment_history = () @@ -190,16 +196,17 @@ class AttentionWrapperTest(test.TestCase): expected_final_output = BasicDecoderOutput( rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00083043973), - sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0)) + shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052250605), + sample_id=ResultSummary( + shape=(5, 3), dtype=dtype('int32'), mean=1.4)) expected_final_state = AttentionWrapperState( cell_state=LSTMStateTuple( c=ResultSummary( - shape=(5, 9), dtype=dtype('float32'), mean=-0.0039763632), + shape=(5, 9), dtype=dtype('float32'), mean=-0.0040092287), h=ResultSummary( - shape=(5, 9), dtype=dtype('float32'), mean=-0.0019849765)), + shape=(5, 9), dtype=dtype('float32'), mean=-0.0020015112)), attention=ResultSummary( - shape=(5, 6), dtype=dtype('float32'), mean=-0.00081052497), + shape=(5, 6), dtype=dtype('float32'), mean=-0.0052052638), time=3, alignments=ResultSummary( shape=(5, 8), dtype=dtype('float32'), mean=0.125), @@ -221,17 +228,17 @@ class AttentionWrapperTest(test.TestCase): expected_final_output = BasicDecoderOutput( rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00040482997), + shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00597103), sample_id=ResultSummary( - shape=(5, 3), dtype=dtype('int32'), mean=1.8666666666666667)) + shape=(5, 3), dtype=dtype('int32'), mean=1.4)) expected_final_state = AttentionWrapperState( cell_state=LSTMStateTuple( c=ResultSummary( - shape=(5, 9), dtype=dtype('float32'), mean=-0.0039785588), + shape=(5, 9), dtype=dtype('float32'), mean=-0.0040052128), h=ResultSummary( - shape=(5, 9), dtype=dtype('float32'), mean=-0.0019861322)), + shape=(5, 9), dtype=dtype('float32'), mean=-0.0019996136)), attention=ResultSummary( - shape=(5, 6), dtype=dtype('float32'), mean=-0.00038488387), + shape=(5, 6), dtype=dtype('float32'), mean=-0.00595117), time=3, alignments=ResultSummary( shape=(5, 8), dtype=dtype('float32'), mean=0.125), @@ -248,16 +255,17 @@ class AttentionWrapperTest(test.TestCase): expected_final_output = BasicDecoderOutput( rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00084602338), - sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0)) + shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386), + sample_id=ResultSummary( + shape=(5, 3), dtype=dtype('int32'), mean=1.4666666666666666)) expected_final_state = AttentionWrapperState( cell_state=LSTMStateTuple( c=ResultSummary( - shape=(5, 9), dtype=dtype('float32'), mean=-0.0039764317), + shape=(5, 9), dtype=dtype('float32'), mean=-0.004009536), h=ResultSummary( - shape=(5, 9), dtype=dtype('float32'), mean=-0.0019850098)), + shape=(5, 9), dtype=dtype('float32'), mean=-0.0020016613)), attention=ResultSummary( - shape=(5, 6), dtype=dtype('float32'), mean=-0.00080144603), + shape=(5, 6), dtype=dtype('float32'), mean=-0.0051812846), time=3, alignments=ResultSummary( shape=(5, 8), dtype=dtype('float32'), mean=0.125), @@ -276,16 +284,17 @@ class AttentionWrapperTest(test.TestCase): expected_final_output = BasicDecoderOutput( rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00084602338), - sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0)) + shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386), + sample_id=ResultSummary( + shape=(5, 3), dtype=dtype('int32'), mean=1.4666666666666666)) expected_final_state = AttentionWrapperState( cell_state=LSTMStateTuple( c=ResultSummary( - shape=(5, 9), dtype=dtype('float32'), mean=-0.0039764317), + shape=(5, 9), dtype=dtype('float32'), mean=-0.004009536), h=ResultSummary( - shape=(5, 9), dtype=dtype('float32'), mean=-0.0019850098)), + shape=(5, 9), dtype=dtype('float32'), mean=-0.0020016613)), attention=ResultSummary( - shape=(5, 6), dtype=dtype('float32'), mean=-0.00080144603), + shape=(5, 6), dtype=dtype('float32'), mean=-0.0051812846), time=3, alignments=ResultSummary( shape=(5, 8), dtype=dtype('float32'), mean=0.125), @@ -303,17 +312,17 @@ class AttentionWrapperTest(test.TestCase): expected_final_output = BasicDecoderOutput( rnn_output=ResultSummary( - shape=(5, 3, 10), dtype=dtype('float32'), mean=0.019546926), + shape=(5, 3, 10), dtype=dtype('float32'), mean=0.117389656), sample_id=ResultSummary( - shape=(5, 3), dtype=dtype('int32'), mean=2.7999999999999998)) + shape=(5, 3), dtype=dtype('int32'), mean=4.5999999999999996)) expected_final_state = AttentionWrapperState( cell_state=LSTMStateTuple( c=ResultSummary( - shape=(5, 9), dtype=dtype('float32'), mean=-0.0041728448), + shape=(5, 9), dtype=dtype('float32'), mean=-0.0063607907), h=ResultSummary( - shape=(5, 9), dtype=dtype('float32'), mean=-0.002085865)), + shape=(5, 9), dtype=dtype('float32'), mean=-0.00323448)), attention=ResultSummary( - shape=(5, 10), dtype=dtype('float32'), mean=0.019546915), + shape=(5, 10), dtype=dtype('float32'), mean=0.117389656,), time=3, alignments=ResultSummary( shape=(5, 8), dtype=dtype('float32'), mean=0.125), diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py index 600adea189..cb12bc9450 100644 --- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py +++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py @@ -27,8 +27,10 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_shape from tensorflow.python.layers import core as layers_core +from tensorflow.python.ops import init_ops from tensorflow.python.ops import rnn_cell from tensorflow.python.ops import variables +from tensorflow.python.ops import variable_scope from tensorflow.python.platform import test # pylint: enable=g-import-not-at-top @@ -189,6 +191,76 @@ class BasicDecoderTest(test.TestCase): self.assertAllEqual(expected_step_next_inputs, sess_results["step_next_inputs"]) + def testStepWithSampleEmbeddingHelper(self): + batch_size = 5 + vocabulary_size = 7 + cell_depth = vocabulary_size # cell's logits must match vocabulary size + input_depth = 10 + np.random.seed(0) + start_tokens = np.random.randint(0, vocabulary_size, size=batch_size) + end_token = 1 + + with self.test_session(use_gpu=True) as sess: + with variable_scope.variable_scope( + "testStepWithSampleEmbeddingHelper", + initializer=init_ops.constant_initializer(0.01)): + embeddings = np.random.randn(vocabulary_size, + input_depth).astype(np.float32) + cell = rnn_cell.LSTMCell(vocabulary_size) + helper = helper_py.SampleEmbeddingHelper(embeddings, start_tokens, + end_token, seed=0) + my_decoder = basic_decoder.BasicDecoder( + cell=cell, + helper=helper, + initial_state=cell.zero_state( + dtype=dtypes.float32, batch_size=batch_size)) + output_size = my_decoder.output_size + output_dtype = my_decoder.output_dtype + self.assertEqual( + basic_decoder.BasicDecoderOutput(cell_depth, + tensor_shape.TensorShape([])), + output_size) + self.assertEqual( + basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32), + output_dtype) + + (first_finished, first_inputs, first_state) = my_decoder.initialize() + (step_outputs, step_state, step_next_inputs, + step_finished) = my_decoder.step( + constant_op.constant(0), first_inputs, first_state) + batch_size_t = my_decoder.batch_size + + self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple)) + self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple)) + self.assertTrue( + isinstance(step_outputs, basic_decoder.BasicDecoderOutput)) + self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape()) + self.assertEqual((batch_size,), step_outputs[1].get_shape()) + self.assertEqual((batch_size, cell_depth), first_state[0].get_shape()) + self.assertEqual((batch_size, cell_depth), first_state[1].get_shape()) + self.assertEqual((batch_size, cell_depth), step_state[0].get_shape()) + self.assertEqual((batch_size, cell_depth), step_state[1].get_shape()) + + sess.run(variables.global_variables_initializer()) + sess_results = sess.run({ + "batch_size": batch_size_t, + "first_finished": first_finished, + "first_inputs": first_inputs, + "first_state": first_state, + "step_outputs": step_outputs, + "step_state": step_state, + "step_next_inputs": step_next_inputs, + "step_finished": step_finished + }) + + sample_ids = sess_results["step_outputs"].sample_id + expected_step_finished = (sample_ids == end_token) + expected_step_next_inputs = embeddings[sample_ids] + self.assertAllEqual(expected_step_finished, + sess_results["step_finished"]) + self.assertAllEqual(expected_step_next_inputs, + sess_results["step_next_inputs"]) + def testStepWithScheduledEmbeddingTrainingHelper(self): sequence_length = [3, 4, 3, 1, 0] batch_size = 5 diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py index 873a39154f..3d0627467a 100644 --- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py +++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py @@ -141,6 +141,7 @@ class TestBeamStep(test.TestCase): outputs, next_beam_state = beam_search_decoder._beam_search_step( time=2, logits=logits, + next_cell_state=dummy_cell_state, beam_state=beam_state, batch_size=ops.convert_to_tensor(self.batch_size), beam_width=self.beam_width, @@ -195,6 +196,7 @@ class TestBeamStep(test.TestCase): outputs, next_beam_state = beam_search_decoder._beam_search_step( time=2, logits=logits, + next_cell_state=dummy_cell_state, beam_state=beam_state, batch_size=ops.convert_to_tensor(self.batch_size), beam_width=self.beam_width, @@ -224,8 +226,8 @@ class TestBeamStep(test.TestCase): class BeamSearchDecoderTest(test.TestCase): def _testDynamicDecodeRNN(self, time_major, has_attention): - encoder_sequence_length = [3, 2, 3, 1, 0] - decoder_sequence_length = [2, 0, 1, 2, 3] + encoder_sequence_length = np.array([3, 2, 3, 1, 1]) + decoder_sequence_length = np.array([2, 0, 1, 2, 3]) batch_size = 5 decoder_max_time = 4 input_depth = 7 @@ -240,11 +242,15 @@ class BeamSearchDecoderTest(test.TestCase): beam_width = 3 with self.test_session() as sess: + batch_size_tensor = constant_op.constant(batch_size) embedding = np.random.randn(vocab_size, embedding_dim).astype(np.float32) cell = rnn_cell.LSTMCell(cell_depth) + initial_state = cell.zero_state(batch_size, dtypes.float32) if has_attention: - inputs = np.random.randn(batch_size, decoder_max_time, - input_depth).astype(np.float32) + inputs = array_ops.placeholder_with_default( + np.random.randn(batch_size, decoder_max_time, + input_depth).astype(np.float32), + shape=(None, None, input_depth)) tiled_inputs = beam_search_decoder.tile_batch( inputs, multiplier=beam_width) tiled_sequence_length = beam_search_decoder.tile_batch( @@ -253,17 +259,22 @@ class BeamSearchDecoderTest(test.TestCase): num_units=attention_depth, memory=tiled_inputs, memory_sequence_length=tiled_sequence_length) + initial_state = beam_search_decoder.tile_batch( + initial_state, multiplier=beam_width) cell = attention_wrapper.AttentionWrapper( cell=cell, attention_mechanism=attention_mechanism, attention_layer_size=attention_depth, alignment_history=False) cell_state = cell.zero_state( - dtype=dtypes.float32, batch_size=batch_size * beam_width) + dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width) + if has_attention: + cell_state = cell_state.clone( + cell_state=initial_state) bsd = beam_search_decoder.BeamSearchDecoder( cell=cell, embedding=embedding, - start_tokens=batch_size * [start_token], + start_tokens=array_ops.fill([batch_size_tensor], start_token), end_token=end_token, initial_state=cell_state, beam_width=beam_width, diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py index 64ffb6ca40..637175674b 100644 --- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py +++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py @@ -112,6 +112,18 @@ def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined): return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), memory) +def _maybe_mask_score(score, memory_sequence_length, score_mask_value): + if memory_sequence_length is None: + return score + message = ("All values in memory_sequence_length must greater than zero.") + with ops.control_dependencies( + [check_ops.assert_positive(memory_sequence_length, message=message)]): + score_mask = array_ops.sequence_mask( + memory_sequence_length, maxlen=array_ops.shape(score)[1]) + score_mask_values = score_mask_value * array_ops.ones_like(score) + return array_ops.where(score_mask, score, score_mask_values) + + class _BaseAttentionMechanism(AttentionMechanism): """A base AttentionMechanism class providing common functionality. @@ -127,6 +139,7 @@ class _BaseAttentionMechanism(AttentionMechanism): memory_sequence_length=None, memory_layer=None, check_inner_dims_defined=True, + score_mask_value=float("-inf"), name=None): """Construct base AttentionMechanism class. @@ -149,6 +162,9 @@ class _BaseAttentionMechanism(AttentionMechanism): check_inner_dims_defined: Python boolean. If `True`, the `memory` argument's shape is checked to ensure all but the two outermost dimensions are fully defined. + score_mask_value: (optional): The mask value for score before passing into + `probability_fn`. The default is -inf. Only used if + `memory_sequence_length` is not None. name: Name to use when creating ops. """ if (query_layer is not None @@ -164,7 +180,10 @@ class _BaseAttentionMechanism(AttentionMechanism): if not callable(probability_fn): raise TypeError("probability_fn must be callable, saw type: %s" % type(probability_fn).__name__) - self._probability_fn = probability_fn + self._probability_fn = lambda score, prev: ( # pylint:disable=g-long-lambda + probability_fn( + _maybe_mask_score(score, memory_sequence_length, score_mask_value), + prev)) with ops.name_scope( name, "BaseAttentionMechanismInit", nest.flatten(memory)): self._values = _prepare_memory( @@ -245,6 +264,7 @@ class LuongAttention(_BaseAttentionMechanism): memory_sequence_length=None, scale=False, probability_fn=None, + score_mask_value=float("-inf"), name="LuongAttention"): """Construct the AttentionMechanism mechanism. @@ -260,6 +280,9 @@ class LuongAttention(_BaseAttentionMechanism): probabilities. The default is @{tf.nn.softmax}. Other options include @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}. Its signature should be: `probabilities = probability_fn(score)`. + score_mask_value: (optional): The mask value for score before passing into + `probability_fn`. The default is -inf. Only used if + `memory_sequence_length` is not None. name: Name to use when creating ops. """ # For LuongAttention, we only transform the memory layer; thus @@ -274,6 +297,7 @@ class LuongAttention(_BaseAttentionMechanism): memory=memory, probability_fn=wrapped_probability_fn, memory_sequence_length=memory_sequence_length, + score_mask_value=score_mask_value, name=name) self._num_units = num_units self._scale = scale @@ -362,6 +386,7 @@ class BahdanauAttention(_BaseAttentionMechanism): memory_sequence_length=None, normalize=False, probability_fn=None, + score_mask_value=float("-inf"), name="BahdanauAttention"): """Construct the Attention mechanism. @@ -377,6 +402,9 @@ class BahdanauAttention(_BaseAttentionMechanism): probabilities. The default is @{tf.nn.softmax}. Other options include @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}. Its signature should be: `probabilities = probability_fn(score)`. + score_mask_value: (optional): The mask value for score before passing into + `probability_fn`. The default is -inf. Only used if + `memory_sequence_length` is not None. name: Name to use when creating ops. """ if probability_fn is None: @@ -390,6 +418,7 @@ class BahdanauAttention(_BaseAttentionMechanism): memory=memory, probability_fn=wrapped_probability_fn, memory_sequence_length=memory_sequence_length, + score_mask_value=score_mask_value, name=name) self._num_units = num_units self._normalize = normalize diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py index c9be517fad..d86275f864 100644 --- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py +++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py @@ -72,10 +72,30 @@ class FinalBeamSearchDecoderOutput( pass +def _tile_batch(t, multiplier): + """Core single-tensor implementation of tile_batch.""" + t = ops.convert_to_tensor(t, name="t") + shape_t = array_ops.shape(t) + if t.shape.ndims is None or t.shape.ndims < 1: + raise ValueError("t must have statically known rank") + tiling = [1] * (t.shape.ndims + 1) + tiling[1] = multiplier + tiled_static_batch_size = ( + t.shape[0].value * multiplier if t.shape[0].value is not None else None) + tiled = array_ops.tile(array_ops.expand_dims(t, 1), tiling) + tiled = array_ops.reshape( + tiled, array_ops.concat(([shape_t[0] * multiplier], shape_t[1:]), 0)) + tiled.set_shape( + tensor_shape.TensorShape( + [tiled_static_batch_size]).concatenate(t.shape[1:])) + return tiled + + def tile_batch(t, multiplier, name=None): - """Tile the batch dimension of tensor t. + """Tile the batch dimension of a (possibly nested structure of) tensor(s) t. - This function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of + For each tensor t in a (possibly nested structure) of tensors, + this function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape `[batch_size * multiplier, s0, s1, ...]` composed of minibatch entries `t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is repeated @@ -87,27 +107,25 @@ def tile_batch(t, multiplier, name=None): name: Name scope for any created operations. Returns: - A `Tensor` shaped `[batch_size * multiplier, ...]`. + A (possibly nested structure of) `Tensor` shaped + `[batch_size * multiplier, ...]`. Raises: - ValueError: if `t` does not have a statically known rank or it's < 1. + ValueError: if tensor(s) `t` do not have a statically known rank or + the rank is < 1. """ - with ops.name_scope(name, "tile_batch", [t, multiplier]): - t = ops.convert_to_tensor(t, name="t") - shape_t = array_ops.shape(t) - if t.shape.ndims is None or t.shape.ndims < 1: - raise ValueError("t must have statically known rank") - tiling = [1] * (t.shape.ndims + 1) - tiling[1] = multiplier - tiled_static_batch_size = ( - t.shape[0].value * multiplier if t.shape[0].value is not None else None) - tiled = array_ops.tile(array_ops.expand_dims(t, 1), tiling) - tiled = array_ops.reshape( - tiled, array_ops.concat(([shape_t[0] * multiplier], shape_t[1:]), 0)) - tiled.set_shape( - tensor_shape.TensorShape( - [tiled_static_batch_size]).concatenate(t.shape[1:])) - return tiled + flat_t = nest.flatten(t) + with ops.name_scope(name, "tile_batch", flat_t + [multiplier]): + return nest.map_structure(lambda t_: _tile_batch(t_, multiplier), t) + + +def _check_maybe(t): + if isinstance(t, tensor_array_ops.TensorArray): + raise TypeError( + "TensorArray state is not supported by BeamSearchDecoder: %s" % t.name) + if t.shape.ndims is None: + raise ValueError( + "Expected tensor (%s) to have known rank, but ndims == None." % t) class BeamSearchDecoder(decoder.Decoder): @@ -278,7 +296,7 @@ class BeamSearchDecoder(decoder.Decoder): A reshaped version of t with dimension [batch_size * beam_width, s]. """ if isinstance(s, ops.Tensor): - s = tensor_util.constant_value_as_shape(s) + s = tensor_shape.as_shape(tensor_util.constant_value(s)) else: s = tensor_shape.TensorShape(s) t_shape = array_ops.shape(t) @@ -312,7 +330,7 @@ class BeamSearchDecoder(decoder.Decoder): are known statically). """ if isinstance(s, ops.Tensor): - s = tensor_util.constant_value_as_shape(s) + s = tensor_shape.TensorShape(tensor_util.constant_value(s)) else: s = tensor_shape.TensorShape(s) t_shape = array_ops.shape(t) @@ -351,13 +369,7 @@ class BeamSearchDecoder(decoder.Decoder): TypeError: If t is an instance of TensorArray. ValueError: If the rank of t is not statically known. """ - if isinstance(t, tensor_array_ops.TensorArray): - raise TypeError( - "TensorArray state is not supported by BeamSearchDecoder: %s" - % t.name) - if t.shape.ndims is None: - raise ValueError( - "Expected tensor (%s) to have known rank, but ndims == None." % t) + _check_maybe(t) if t.shape.ndims >= 1: return self._split_batch_beams(t, s) else: @@ -380,13 +392,7 @@ class BeamSearchDecoder(decoder.Decoder): TypeError: If t is an instance of TensorArray. ValueError: If the rank of t is not statically known. """ - if isinstance(t, tensor_array_ops.TensorArray): - raise TypeError( - "TensorArray state is not supported by BeamSearchDecoder: %s" - % t.name) - if t.shape.ndims is None: - raise ValueError( - "Expected tensor (%s) to have known rank, but ndims == None." % t) + _check_maybe(t) if t.shape.ndims >= 2: return self._merge_batch_beams(t, s) else: @@ -417,7 +423,6 @@ class BeamSearchDecoder(decoder.Decoder): self._maybe_merge_batch_beams, cell_state, self._cell.state_size) cell_outputs, next_cell_state = self._cell(inputs, cell_state) - cell_outputs = nest.map_structure( lambda out: self._split_batch_beams(out, out.shape[1:]), cell_outputs) next_cell_state = nest.map_structure( @@ -430,11 +435,13 @@ class BeamSearchDecoder(decoder.Decoder): beam_search_output, beam_search_state = _beam_search_step( time=time, logits=cell_outputs, + next_cell_state=next_cell_state, beam_state=state, batch_size=batch_size, beam_width=beam_width, end_token=end_token, length_penalty_weight=length_penalty_weight) + finished = beam_search_state.finished sample_ids = beam_search_output.predicted_ids next_inputs = control_flow_ops.cond( @@ -444,8 +451,8 @@ class BeamSearchDecoder(decoder.Decoder): return (beam_search_output, beam_search_state, next_inputs, finished) -def _beam_search_step(time, logits, beam_state, batch_size, beam_width, - end_token, length_penalty_weight): +def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size, + beam_width, end_token, length_penalty_weight): """Performs a single step of Beam Search Decoding. Args: @@ -454,6 +461,8 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width, continuations. logits: Logits at the current time step. A tensor of shape `[batch_size, beam_width, vocab_size]` + next_cell_state: The next state from the cell, e.g. an instance of + AttentionWrapperState if the cell is attentional. beam_state: Current state of the beam search. An instance of `BeamSearchDecoderState`. batch_size: The batch size for this input. @@ -520,10 +529,9 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width, next_beam_probs = _tensor_gather_helper( gather_indices=word_indices, gather_from=total_probs, - range_input=batch_size, + batch_size=batch_size, range_size=beam_width * vocab_size, - final_shape=[static_batch_size, beam_width]) - + gather_shape=[-1]) next_word_ids = math_ops.to_int32(word_indices % vocab_size) next_beam_ids = math_ops.to_int32(word_indices / vocab_size) @@ -531,9 +539,9 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width, previously_finished = _tensor_gather_helper( gather_indices=next_beam_ids, gather_from=previously_finished, - range_input=batch_size, + batch_size=batch_size, range_size=beam_width, - final_shape=[static_batch_size, beam_width]) + gather_shape=[-1]) next_finished = math_ops.logical_or(previously_finished, math_ops.equal(next_word_ids, end_token)) @@ -547,13 +555,28 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width, next_prediction_len = _tensor_gather_helper( gather_indices=next_beam_ids, gather_from=beam_state.lengths, - range_input=batch_size, + batch_size=batch_size, range_size=beam_width, - final_shape=[static_batch_size, beam_width]) + gather_shape=[-1]) next_prediction_len += lengths_to_add + # Pick out the cell_states according to the next_beam_ids. We use a + # different gather_shape here because the cell_state tensors, i.e. + # the tensors that would be gathered from, all have dimension + # greater than two and we need to preserve those dimensions. + # pylint: disable=g-long-lambda + next_cell_state = nest.map_structure( + lambda gather_from: _maybe_tensor_gather_helper( + gather_indices=next_beam_ids, + gather_from=gather_from, + batch_size=batch_size, + range_size=beam_width, + gather_shape=[batch_size * beam_width, -1]), + next_cell_state) + # pylint: enable=g-long-lambda + next_state = BeamSearchDecoderState( - cell_state=beam_state.cell_state, + cell_state=next_cell_state, log_probs=next_beam_probs, lengths=next_prediction_len, finished=next_finished) @@ -639,12 +662,74 @@ def _mask_probs(probs, eos_token, finished): return finished_examples + non_finished_examples -def _tensor_gather_helper(gather_indices, gather_from, range_input, range_size, - final_shape): - range_ = array_ops.expand_dims(math_ops.range(range_input) * range_size, 1) +def _maybe_tensor_gather_helper(gather_indices, gather_from, batch_size, + range_size, gather_shape): + """Maybe applies _tensor_gather_helper. + + This applies _tensor_gather_helper when the gather_from dims is at least as + big as the length of gather_shape. This is used in conjunction with nest so + that we don't apply _tensor_gather_helper to inapplicable values like scalars. + + Args: + gather_indices: The tensor indices that we use to gather. + gather_from: The tensor that we are gathering from. + batch_size: The batch size. + range_size: The number of values in each range. Likely equal to beam_width. + gather_shape: What we should reshape gather_from to in order to preserve the + correct values. An example is when gather_from is the attention from an + AttentionWrapperState with shape [batch_size, beam_width, attention_size]. + There, we want to preserve the attention_size elements, so gather_shape is + [batch_size * beam_width, -1]. Then, upon reshape, we still have the + attention_size as desired. + + Returns: + output: Gathered tensor of shape tf.shape(gather_from)[:1+len(gather_shape)] + or the original tensor if its dimensions are too small. + """ + _check_maybe(gather_from) + if gather_from.shape.ndims >= len(gather_shape): + return _tensor_gather_helper( + gather_indices=gather_indices, + gather_from=gather_from, + batch_size=batch_size, + range_size=range_size, + gather_shape=gather_shape) + else: + return gather_from + + +def _tensor_gather_helper(gather_indices, gather_from, batch_size, + range_size, gather_shape): + """Helper for gathering the right indices from the tensor. + + This works by reshaping gather_from to gather_shape (e.g. [-1]) and then + gathering from that according to the gather_indices, which are offset by + the right amounts in order to preserve the batch order. + + Args: + gather_indices: The tensor indices that we use to gather. + gather_from: The tensor that we are gathering from. + batch_size: The input batch size. + range_size: The number of values in each range. Likely equal to beam_width. + gather_shape: What we should reshape gather_from to in order to preserve the + correct values. An example is when gather_from is the attention from an + AttentionWrapperState with shape [batch_size, beam_width, attention_size]. + There, we want to preserve the attention_size elements, so gather_shape is + [batch_size * beam_width, -1]. Then, upon reshape, we still have the + attention_size as desired. + + Returns: + output: Gathered tensor of shape tf.shape(gather_from)[:1+len(gather_shape)] + """ + range_ = array_ops.expand_dims(math_ops.range(batch_size) * range_size, 1) gather_indices = array_ops.reshape(gather_indices + range_, [-1]) output = array_ops.gather( - array_ops.reshape(gather_from, [-1]), gather_indices) + array_ops.reshape(gather_from, gather_shape), gather_indices) + final_shape = array_ops.shape(gather_from)[:1 + len(gather_shape)] + static_batch_size = tensor_util.constant_value(batch_size) + final_static_shape = (tensor_shape.TensorShape([static_batch_size]) + .concatenate( + gather_from.shape[1:1 + len(gather_shape)])) output = array_ops.reshape(output, final_shape) - output.set_shape(final_shape) + output.set_shape(final_static_shape) return output diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py index bdd7d7ca73..bee7547935 100644 --- a/tensorflow/contrib/seq2seq/python/ops/helper.py +++ b/tensorflow/contrib/seq2seq/python/ops/helper.py @@ -41,6 +41,7 @@ __all__ = [ "Helper", "TrainingHelper", "GreedyEmbeddingHelper", + "SampleEmbeddingHelper", "CustomHelper", "ScheduledEmbeddingTrainingHelper", "ScheduledOutputTrainingHelper", @@ -512,3 +513,42 @@ class GreedyEmbeddingHelper(Helper): lambda: self._start_inputs, lambda: self._embedding_fn(sample_ids)) return (finished, next_inputs, state) + + +class SampleEmbeddingHelper(GreedyEmbeddingHelper): + """A helper for use during inference. + + Uses sampling (from a distribution) instead of argmax and passes the + result through an embedding layer to get the next input. + """ + + def __init__(self, embedding, start_tokens, end_token, seed=None): + """Initializer. + + Args: + embedding: A callable that takes a vector tensor of `ids` (argmax ids), + or the `params` argument for `embedding_lookup`. The returned tensor + will be passed to the decoder input. + start_tokens: `int32` vector shaped `[batch_size]`, the start tokens. + end_token: `int32` scalar, the token that marks end of decoding. + seed: The sampling seed. + + Raises: + ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not a + scalar. + """ + super(SampleEmbeddingHelper, self).__init__( + embedding, start_tokens, end_token) + self._seed = seed + + def sample(self, time, outputs, state, name=None): + """sample for SampleEmbeddingHelper.""" + del time, state # unused by sample_fn + # Outputs are logits, we sample instead of argmax (greedy). + if not isinstance(outputs, ops.Tensor): + raise TypeError("Expected outputs to be a single Tensor, got: %s" % + type(outputs)) + sample_id_sampler = categorical.Categorical(logits=outputs) + sample_ids = sample_id_sampler.sample(seed=self._seed) + + return sample_ids diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 343d6faf5a..284195f8fc 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1556,8 +1556,6 @@ tf_cuda_library( "graph/graph_constructor.cc", "graph/graph_def_builder.cc", "graph/graph_partition.cc", - "graph/mkl_layout_pass.cc", - "graph/mkl_tfconversion_pass.cc", "graph/node_builder.cc", "graph/optimizer_cse.cc", "graph/subgraph.cc", @@ -1619,6 +1617,8 @@ tf_cuda_library( "common_runtime/threadpool_device.cc", "common_runtime/threadpool_device_factory.cc", "graph/gradients.cc", + "graph/mkl_layout_pass.cc", + "graph/mkl_tfconversion_pass.cc", "graph/quantize_training.cc", "public/session.h", "public/session_options.h", diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index e2ad18f33b..d5dba27f45 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -24,6 +24,8 @@ limitations under the License. #include <stdlib.h> #include <string.h> #include <algorithm> +#include <map> +#include <tuple> #include <vector> #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" @@ -174,6 +176,63 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface { TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice); }; +// This factory helps to ensure that different GPU device objects that refer to +// the same physical device and stream group id use the same stream group +// object (and therefore the same CUDA streams). This is necessary since there +// is a single memory allocator per device (see ProcessState::GetGPUAllocator) +// and allocators must not be shared across streams. +class BaseGPUDevice::StreamGroupFactory { + public: + // Returns the unique stream group for use with the stream defined by + // {gpu_id, stream_group_within_gpu}, creating it if it does not yet exist. + // This function is thread safe. + BaseGPUDevice::StreamGroup* GetOrCreate(int gpu_id, + int stream_group_within_gpu, + gpu::StreamExecutor* executor) { + mutex_lock guard(lock_); + StreamGroup* group = &streams_[key_type(gpu_id, stream_group_within_gpu)]; + if (!group->compute) { + group->compute = new gpu::Stream(executor); + group->compute->Init(); + VLOG(2) << "Created stream[" << stream_group_within_gpu + << "] = " << group->compute; + + group->host_to_device = new gpu::Stream(executor); + group->host_to_device->Init(); + VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu + << "] = " << group->host_to_device; + + group->device_to_host = new gpu::Stream(executor); + group->device_to_host->Init(); + VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu + << "] = " << group->device_to_host; + + group->device_to_device = new gpu::Stream(executor); + group->device_to_device->Init(); + VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu + << "] = " << group->device_to_host; + } + return group; + } + + // Returns a reference to the StreamGroupFactory singleton. Note that this is + // never destroyed, so the objects it owns are never deleted. + static StreamGroupFactory& Global() { + static StreamGroupFactory* instance = new StreamGroupFactory(); + return *instance; + } + + private: + mutex lock_; + using key_type = std::tuple<int, int>; + std::map<key_type, StreamGroup> streams_; + + // StreamGroupFactory cannot be created directly; Call + // StreamGroupFactory::Global() to get the global instance. + StreamGroupFactory() = default; + TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory); +}; + BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name, Bytes memory_limit, const DeviceLocality& locality, int gpu_id, const string& physical_device_desc, @@ -193,12 +252,6 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name, BaseGPUDevice::~BaseGPUDevice() { delete gpu_device_info_; for (auto ctx : device_contexts_) ctx->Unref(); - for (auto& stream_group : streams_) { - delete stream_group.compute; - delete stream_group.host_to_device; - delete stream_group.device_to_host; - delete stream_group.device_to_device; - } } Status BaseGPUDevice::Init(const SessionOptions& options) { @@ -217,27 +270,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) { // Create the specified number of GPU streams for (int i = 0; i < max_streams_; i++) { - auto stream = new gpu::Stream(executor_); - stream->Init(); - VLOG(2) << "Created stream[" << i << "] = " << stream; - - auto host_to_device_stream = new gpu::Stream(executor_); - host_to_device_stream->Init(); - VLOG(2) << "Created host_to_device_stream[" << i - << "] = " << host_to_device_stream; - - auto device_to_host_stream = new gpu::Stream(executor_); - device_to_host_stream->Init(); - VLOG(2) << "Created device_to_host_stream[" << i - << "] = " << device_to_host_stream; - - auto device_to_device_stream = new gpu::Stream(executor_); - device_to_device_stream->Init(); - VLOG(2) << "Created device_to_device_stream[" << i - << "] = " << device_to_device_stream; - - streams_.push_back({stream, host_to_device_stream, device_to_host_stream, - device_to_device_stream}); + streams_.push_back( + StreamGroupFactory::Global().GetOrCreate(gpu_id_, i, executor_)); size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int); void* scratch_buffer = gpu_allocator_->AllocateRaw( @@ -259,12 +293,12 @@ Status BaseGPUDevice::Init(const SessionOptions& options) { "Failed to memcopy into scratch buffer for device ", gpu_id_); } - device_contexts_.push_back( - new GPUDeviceContext(i, stream, host_to_device_stream, - device_to_host_stream, device_to_device_stream)); + device_contexts_.push_back(new GPUDeviceContext( + i, streams_.back()->compute, streams_.back()->host_to_device, + streams_.back()->device_to_host, streams_.back()->device_to_device)); } gpu_device_info_ = new GpuDeviceInfo; - gpu_device_info_->stream = streams_[0].compute; + gpu_device_info_->stream = streams_[0]->compute; gpu_device_info_->default_context = device_contexts_[0]; gpu_device_info_->event_mgr = em_.get(); gpu_device_info_->gpu_id = gpu_id_; @@ -511,7 +545,7 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context, static_cast<ConcretePerOpGpuDevice*>(device); DCHECK(concrete_device); const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>( - streams_[stream_id].compute->implementation()->CudaStreamMemberHack()); + streams_[stream_id]->compute->implementation()->CudaStreamMemberHack()); concrete_device->Reinitialize(context, cuda_stream, gpu_id_, allocator, scratch_[stream_id]); } diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index 370b3cc4f6..08c58867ee 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -20,7 +20,11 @@ limitations under the License. #ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_ #define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_ +#include <memory> +#include <string> +#include <unordered_map> #include <vector> + #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" @@ -96,12 +100,14 @@ class BaseGPUDevice : public LocalDevice { private: struct StreamGroup { - gpu::Stream* compute; - gpu::Stream* host_to_device; - gpu::Stream* device_to_host; - gpu::Stream* device_to_device; + gpu::Stream* compute = nullptr; + gpu::Stream* host_to_device = nullptr; + gpu::Stream* device_to_host = nullptr; + gpu::Stream* device_to_device = nullptr; }; - gtl::InlinedVector<StreamGroup, 4> streams_; + class StreamGroupFactory; + + gtl::InlinedVector<StreamGroup*, 4> streams_; gtl::InlinedVector<char*, 4> scratch_; std::vector<GPUDeviceContext*> device_contexts_; GpuDeviceInfo* gpu_device_info_ = nullptr; diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index b2eaaa3492..70e66280fd 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -5216,6 +5216,7 @@ tf_kernel_library( srcs = ["iterator_ops.cc"], deps = [ ":dataset", + ":ops_util", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:dataset_ops_op_lib", "//tensorflow/core:framework", diff --git a/tensorflow/core/kernels/iterator_ops.cc b/tensorflow/core/kernels/iterator_ops.cc index 7f1560e9f7..a6d9ddd086 100644 --- a/tensorflow/core/kernels/iterator_ops.cc +++ b/tensorflow/core/kernels/iterator_ops.cc @@ -18,7 +18,10 @@ limitations under the License. #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/resource_op_kernel.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/strings/strcat.h" namespace tensorflow { @@ -282,38 +285,54 @@ class OneShotIteratorOp : public OpKernel { IteratorResource* iterator_resource_ = nullptr; }; -class IteratorGetNextOp : public OpKernel { +class IteratorGetNextOp : public AsyncOpKernel { public: - explicit IteratorGetNextOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - - // TODO(mrry): Convert this to an async op, because - // `iterator->GetNext()` could trigger long-running operations - // (e.g. a QueueDequeue or a remote read). - void Compute(OpKernelContext* ctx) override { + explicit IteratorGetNextOp(OpKernelConstruction* ctx) + : AsyncOpKernel(ctx), + thread_pool_(new thread::ThreadPool( + ctx->env(), ThreadOptions(), + strings::StrCat("iterator_get_next_thread_", + SanitizeThreadSuffix(def().name())), + 1 /* num_threads */, false /* low_latency_hint */)) {} + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { IteratorResource* iterator; OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator)); - core::ScopedUnref unref_iterator(iterator); - - std::vector<Tensor> components; - bool end_of_sequence; - - IteratorContext::Params params; - params.env = ctx->env(); - params.step_id = ctx->step_id(); - params.resource_manager = ctx->resource_manager(); - params.runner = *(ctx->runner()); - IteratorContext iter_ctx(std::move(params)); - OP_REQUIRES_OK(ctx, - iterator->GetNext(&iter_ctx, &components, &end_of_sequence)); - OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence")); + // The call to `iterator->GetNext()` may block and depend on an + // inter-op thread pool thread, so we issue the call from the + // owned thread pool. + thread_pool_->Schedule([this, ctx, iterator, done]() { + core::ScopedUnref unref_iterator(iterator); + + std::vector<Tensor> components; + bool end_of_sequence; + + IteratorContext::Params params; + params.env = ctx->env(); + params.step_id = ctx->step_id(); + params.resource_manager = ctx->resource_manager(); + params.runner = *(ctx->runner()); + IteratorContext iter_ctx(std::move(params)); + + OP_REQUIRES_OK_ASYNC( + ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence), + done); + OP_REQUIRES_ASYNC(ctx, !end_of_sequence, + errors::OutOfRange("End of sequence"), done); + + for (int i = 0; i < components.size(); ++i) { + // TODO(mrry): Check that the shapes match the shape attrs. + ctx->set_output(i, components[i]); + } - for (int i = 0; i < components.size(); ++i) { - // TODO(mrry): Check that the shapes match the shape attrs. - ctx->set_output(i, components[i]); - } + done(); + }); } + + private: + std::unique_ptr<thread::ThreadPool> thread_pool_; }; class IteratorDisposeOp : public OpKernel { diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc index bd7556658a..2e1a62d8ba 100644 --- a/tensorflow/core/kernels/tensor_array_ops.cc +++ b/tensorflow/core/kernels/tensor_array_ops.cc @@ -36,6 +36,7 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" @@ -101,7 +102,7 @@ Status SetupFlowControlInputs(OpKernelContext* ctx, bool set_output) { class TensorArrayCreationOp : public OpKernel { public: explicit TensorArrayCreationOp(OpKernelConstruction* context) - : OpKernel(context) {} + : OpKernel(context), device_type_(context->device_type()) {} void Compute(OpKernelContext* ctx) override { Tensor tensor_array_output_handle; @@ -133,6 +134,12 @@ class TensorArrayCreationOp : public OpKernel { // Create the flow output. Tensor* flow; OP_REQUIRES_OK(ctx, ctx->allocate_output(1, TensorShape({}), &flow)); + if (device_type_ == DEVICE_CPU) { + // Value doesn't matter, but this makes msan not complaint about + // copying an uninitialized value. To do this on GPU would require + // a kernel launch or a host->device memcpy, so we avoid that. + flow->flat<float>()(0) = 0; + } } } @@ -140,6 +147,9 @@ class TensorArrayCreationOp : public OpKernel { virtual Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm, Tensor* tensor_array_output_handle, TensorArray** output_tensor_array) = 0; + + private: + const DeviceType device_type_; }; // A per-run local tensor array. The tensor array uses a "per-step" resource diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc index 5c2bda4770..334444a4a2 100644 --- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc +++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc @@ -238,6 +238,33 @@ bool IsFullSlice(const TensorSlice& slice_spec, } } +Status CorruptFileError(const Status& in_status, const string& filename, + const string& detail) { + if (in_status.ok()) { + return errors::Internal("Unable to read file (", filename, + "). Perhaps the file is corrupt or was produced by " + "a newer version of TensorFlow with format changes " + "(", + detail, ")"); + } + return Status( + in_status.code(), + strings::StrCat("Unable to read file (", filename, + "). Perhaps the file is corrupt or was produced by a " + "newer version of TensorFlow with format changes (", + detail, "): ", in_status.error_message())); +} + +table::Options TableBuilderOptions() { + table::Options o; + // Compressed tables cannot be read by TensorFlow releases prior to 1.1. + // To smoothen the transition, compressed writes are disabled for now + // (version 1.2) with the intention that they will be enabled again at + // some point (perhaps the 1.3 release?). + o.compression = table::kNoCompression; + return o; +} + } // namespace BundleWriter::BundleWriter(Env* env, StringPiece prefix) @@ -425,7 +452,7 @@ static Status MergeOneBundle(Env* env, StringPiece prefix, table::Table* table = nullptr; TF_RETURN_IF_ERROR( - table::Table::Open(table::Options(), file.get(), file_size, &table)); + table::Table::Open(TableBuilderOptions(), file.get(), file_size, &table)); std::unique_ptr<table::Table> table_deleter(table); std::unique_ptr<table::Iterator> iter(table->NewIterator()); @@ -433,11 +460,13 @@ static Status MergeOneBundle(Env* env, StringPiece prefix, // Process header. { iter->Seek(kHeaderEntryKey); - CHECK(iter->Valid()) << "File: " << filename - << ", iterator status: " << iter->status(); + if (!iter->Valid()) { + return CorruptFileError(iter->status(), filename, + "failed to seek to header entry"); + } BundleHeaderProto header; - TF_CHECK_OK(ParseEntryProto(iter->key(), iter->value(), &header)); - CHECK_GE(header.num_shards(), 0); + Status s = ParseEntryProto(iter->key(), iter->value(), &header); + if (!s.ok()) return CorruptFileError(s, filename, "unable to parse header"); merge_state->num_shards += header.num_shards(); if (!merge_state->seen_first_bundle) { @@ -536,7 +565,7 @@ Status MergeBundles(Env* env, gtl::ArraySlice<string> prefixes, TF_RETURN_IF_ERROR( env->NewWritableFile(MetaFilename(merged_prefix), &merged_metadata)); { - table::TableBuilder builder(table::Options(), merged_metadata.get()); + table::TableBuilder builder(TableBuilderOptions(), merged_metadata.get()); // Header entry. BundleHeaderProto header; header.set_num_shards(merge.num_shards); @@ -584,10 +613,17 @@ BundleReader::BundleReader(Env* env, StringPiece prefix) // Reads "num_shards_" from the first entry. iter_->Seek(kHeaderEntryKey); - CHECK(iter_->Valid()) << "File: " << filename - << ", iterator status: " << iter_->status(); + if (!iter_->Valid()) { + status_ = CorruptFileError(iter_->status(), filename, + "failed to seek to header entry"); + return; + } BundleHeaderProto header; - TF_CHECK_OK(ParseEntryProto(iter_->key(), iter_->value(), &header)); + status_ = ParseEntryProto(iter_->key(), iter_->value(), &header); + if (!status_.ok()) { + status_ = CorruptFileError(status_, filename, "unable to parse header"); + return; + } num_shards_ = header.num_shards(); if ((header.endianness() == BundleHeaderProto::BIG && port::kLittleEndian) || (header.endianness() == BundleHeaderProto::LITTLE && diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py index 0438d95bc4..dbbc2de811 100644 --- a/tensorflow/python/kernel_tests/basic_gpu_test.py +++ b/tensorflow/python/kernel_tests/basic_gpu_test.py @@ -18,15 +18,21 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import math +import itertools +import threading import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.ops import gen_math_ops +from tensorflow.python.framework import random_seed +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops +from tensorflow.python.ops import variables from tensorflow.python.ops.gen_array_ops import _broadcast_gradient_args from tensorflow.python.platform import test @@ -219,5 +225,50 @@ class BroadcastSimpleTest(test.TestCase): self._compareGpu(x, y + 0.1, np.floor_divide, math_ops.floordiv) -if __name__ == "__main__": +class GpuMultiSessionMemoryTest(test_util.TensorFlowTestCase): + """Tests concurrent sessions executing on the same GPU.""" + + def _run_session(self, session, results): + n_iterations = 500 + with session as s: + data = variables.Variable(1.0) + with ops.device('/gpu:0'): + random_seed.set_random_seed(1) + matrix1 = variables.Variable( + random_ops.truncated_normal([1024, 1]), name='matrix1') + matrix2 = variables.Variable( + random_ops.truncated_normal([1, 1024]), name='matrix2') + x1 = math_ops.multiply(data, matrix1, name='x1') + x3 = math_ops.matmul(x1, math_ops.matmul(matrix2, matrix1)) + x4 = math_ops.matmul(array_ops.transpose(x3), x3, name='x4') + s.run(variables.global_variables_initializer()) + + for _ in xrange(n_iterations): + value = s.run(x4) + results.add(value.flat[0]) + if len(results) != 1: + break + + def testConcurrentSessions(self): + n_threads = 4 + threads = [] + results = [] + for _ in xrange(n_threads): + session = self.test_session(graph=ops.Graph(), use_gpu=True) + results.append(set()) + args = (session, results[-1]) + threads.append(threading.Thread(target=self._run_session, args=args)) + + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + flat_results = set([x for x in itertools.chain(*results)]) + self.assertEqual(1, + len(flat_results), + 'Expected single value, got %r' % flat_results) + + +if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py index 5b0f318efe..9941c97c30 100644 --- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py +++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py @@ -53,6 +53,16 @@ def _make_converter(tf_dtype): class TensorArrayTest(test.TestCase): + @classmethod + def setUpClass(cls): + super(TensorArrayTest, cls).setUpClass() + cls._workers, _ = test.create_local_cluster(num_workers=3, num_ps=0) + + @classmethod + def tearDownClass(cls): + super(TensorArrayTest, cls).tearDownClass() + session_lib.Session.reset(cls._workers[0].target) + def testTensorArrayWriteRead(self): with self.test_session(use_gpu=True) as session: ta = tensor_array_ops.TensorArray( @@ -1225,8 +1235,7 @@ class TensorArrayTest(test.TestCase): ta = ta.split([1.0, 2.0], [1, 1]) flows.append(ta.flow) - workers, _ = test.create_local_cluster(num_workers=3, num_ps=0) - session = session_lib.Session(workers[0].target) + session = session_lib.Session(self._workers[0].target) run_options = config_pb2.RunOptions( trace_level=config_pb2.RunOptions.FULL_TRACE) @@ -1250,13 +1259,12 @@ class TensorArrayTest(test.TestCase): def _body(i, ta_i): with ops.device("/job:worker/task:1/cpu:0"): - return i + 1, ta_i.write(i, 0.0) + return i + 1, ta_i.write(i, constant_op.constant(0.0)) _, ta_out = control_flow_ops.while_loop( lambda i, ta: i < 2, _body, loop_vars=[0, ta]) - workers, _ = test.create_local_cluster(num_workers=3, num_ps=0) - session = session_lib.Session(workers[0].target) + session = session_lib.Session(self._workers[0].target) run_options = config_pb2.RunOptions( trace_level=config_pb2.RunOptions.FULL_TRACE) @@ -1274,6 +1282,36 @@ class TensorArrayTest(test.TestCase): self.assertFalse( [s for s in dev_stats[d] if "/TensorArray" in s.node_name]) + def testTensorArrayDisabledColocateWithFirstWriteCall(self): + with ops.device("/job:worker/task:0/cpu:0"): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, size=2, colocate_with_first_write_call=False) + + def _body(i, ta_i): + with ops.device("/job:worker/task:1/cpu:0"): + return i + 1, ta_i.write(i, constant_op.constant(0.0)) + + _, ta_out = control_flow_ops.while_loop( + lambda i, ta: i < 2, _body, loop_vars=[0, ta]) + + session = session_lib.Session(self._workers[0].target) + + run_options = config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE) + run_metadata = config_pb2.RunMetadata() + + session.run(ta_out.flow, options=run_options, run_metadata=run_metadata) + self.assertTrue(run_metadata.HasField("step_stats")) + dev_stats = {d.device: list(d.node_stats) + for d in run_metadata.step_stats.dev_stats} + for d in dev_stats: + if "/task:0/" in d and "cpu" in d: # Skip any GPU node stats + self.assertTrue( + [s for s in dev_stats[d] if "/TensorArray" in s.node_name]) + else: + self.assertFalse( + [s for s in dev_stats[d] if "/TensorArray" in s.node_name]) + def testTensorArrayIdentity(self): with self.test_session(use_gpu=True) as session: ta0 = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2, diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py index 8410f12f3e..a37308f702 100644 --- a/tensorflow/python/layers/base.py +++ b/tensorflow/python/layers/base.py @@ -23,9 +23,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import copy import functools import re + from six.moves import xrange # pylint: disable=redefined-builtin import numpy as np import six @@ -650,10 +652,10 @@ def _to_list(x): return [x] -def _add_elements_to_collection(elements, collections): +def _add_elements_to_collection(elements, collection_list): elements = _to_list(elements) - collections = _to_list(collections) - for name in collections: + collection_list = _to_list(collection_list) + for name in collection_list: collection = ops.get_collection_ref(name) collection_set = set(collection) for element in elements: @@ -666,6 +668,13 @@ def _object_list_uid(object_list): return ', '.join([str(abs(id(x))) for x in object_list]) +# A global dictionary mapping graph objects to an index of counters used +# for various layer names in each graph. +# Allows to give unique autogenerated names to layers, in a graph-specific way. +PER_GRAPH_LAYER_NAME_UIDS = collections.defaultdict( + lambda: collections.defaultdict(int)) + + def _unique_layer_name(name): """Makes a layer name (or arbitrary string) unique within a TensorFlow graph. @@ -684,14 +693,7 @@ def _unique_layer_name(name): dense_2 ``` """ - layer_name_uids_collection = ops.get_collection('LAYER_NAME_UIDS') - if not layer_name_uids_collection: - layer_name_uids = {} - ops.add_to_collection('LAYER_NAME_UIDS', layer_name_uids) - else: - layer_name_uids = layer_name_uids_collection[0] - if name not in layer_name_uids: - layer_name_uids[name] = 1 - else: - layer_name_uids[name] += 1 + graph = ops.get_default_graph() + layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS[graph] + layer_name_uids[name] += 1 return name + '_' + str(layer_name_uids[name]) diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py index b61168695a..49dcd2370c 100644 --- a/tensorflow/python/layers/convolutional.py +++ b/tensorflow/python/layers/convolutional.py @@ -149,39 +149,13 @@ class _Conv(base.Layer): self.built = True def call(self, inputs): - if (self.data_format == 'channels_first' and - not framework.test_util.gpu_device_name()): - # `nn.convolution` is not implemented on CPU for `channels_first` format. - # In cases where we are most likely running on CPU using `channels_first`, - # we reshape the inputs to use `channels_last` (and reshape them back - # afterwards). This is a temporary fix; a better solution would be a fix - # at the op level. - # TODO(chollet): remove this when `nn.convolution` is feature-complete. - data_format = 'channels_last' - if self.rank == 1: - inputs = array_ops.transpose(inputs, (0, 2, 1)) - elif self.rank == 2: - inputs = array_ops.transpose(inputs, (0, 2, 3, 1)) - elif self.rank == 3: - inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1)) - else: - data_format = self.data_format outputs = nn.convolution( input=inputs, filter=self.kernel, dilation_rate=self.dilation_rate, strides=self.strides, padding=self.padding.upper(), - data_format=utils.convert_data_format(data_format, - self.rank + 2)) - if (self.data_format == 'channels_first' and - not framework.test_util.gpu_device_name()): - if self.rank == 1: - outputs = array_ops.transpose(outputs, (0, 2, 1)) - elif self.rank == 2: - outputs = array_ops.transpose(outputs, (0, 3, 1, 2)) - elif self.rank == 3: - outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3)) + data_format=utils.convert_data_format(self.data_format, self.rank + 2)) if self.bias is not None: if self.data_format == 'channels_first': @@ -202,18 +176,10 @@ class _Conv(base.Layer): [outputs_shape[0], outputs_shape[1], outputs_shape[2] * outputs_shape[3], outputs_shape[4]]) - outputs_4d = nn.bias_add( - outputs_4d, - self.bias, - data_format=utils.convert_data_format(self.data_format, 4)) + outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW') outputs = array_ops.reshape(outputs_4d, outputs_shape) else: - outputs = nn.bias_add( - outputs, - self.bias, - data_format=utils.convert_data_format(self.data_format, 4)) - # Note that we passed rank=4 because bias_add will only accept - # NHWC and NCWH even if the rank of the inputs is 3 or 5. + outputs = nn.bias_add(outputs, self.bias, data_format='NHWC') if self.activation is not None: return self.activation(outputs) diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py index 6cd644b642..e903afa0a8 100644 --- a/tensorflow/python/layers/pooling.py +++ b/tensorflow/python/layers/pooling.py @@ -262,16 +262,7 @@ class _Pooling2D(base.Layer): self.input_spec = base.InputSpec(ndim=4) def call(self, inputs): - if (self.data_format == 'channels_first' and - not framework.test_util.gpu_device_name()): - # `nn.convolution` is not implemented on CPU for `channels_first` format. - # TODO(chollet): remove this when `nn.convolution` is feature-complete. - data_format = 'channels_last' - inputs = array_ops.transpose(inputs, (0, 2, 3, 1)) - else: - data_format = self.data_format - - if data_format == 'channels_last': + if self.data_format == 'channels_last': pool_shape = (1,) + self.pool_size + (1,) strides = (1,) + self.strides + (1,) else: @@ -282,11 +273,7 @@ class _Pooling2D(base.Layer): ksize=pool_shape, strides=strides, padding=self.padding.upper(), - data_format=utils.convert_data_format(data_format, 4)) - - if (self.data_format == 'channels_first' and - not framework.test_util.gpu_device_name()): - outputs = array_ops.transpose(outputs, (0, 3, 1, 2)) + data_format=utils.convert_data_format(self.data_format, 4)) return outputs def _compute_output_shape(self, input_shape): diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 96ace6e79b..c98e74fe6b 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -437,10 +437,14 @@ def _convert_tensorarray_to_flow(tensor_or_tensor_array): def _make_tensor_array(ta, t_or_flow): + # pylint: disable=protected-access new_ta = tensor_array_ops.TensorArray( dtype=ta.dtype, handle=ta.handle, flow=t_or_flow, - infer_shape=ta._infer_shape) - new_ta._element_shape = ta._element_shape # pylint: disable=protected-access + infer_shape=ta._infer_shape, + colocate_with_first_write_call=ta._colocate_with_first_write_call) + new_ta._colocate_with = ta._colocate_with + new_ta._element_shape = ta._element_shape + # pylint: enable=protected-access return new_ta diff --git a/tensorflow/python/ops/tensor_array_grad.py b/tensorflow/python/ops/tensor_array_grad.py index 0e7d1880ce..1f70d69548 100644 --- a/tensorflow/python/ops/tensor_array_grad.py +++ b/tensorflow/python/ops/tensor_array_grad.py @@ -99,9 +99,9 @@ def _TensorArrayReadGrad(op, grad): flow = op.inputs[2] dtype = op.get_attr("dtype") grad_source = _GetGradSource(grad) - g = tensor_array_ops.TensorArray( - dtype=dtype, handle=handle, flow=flow).grad( - source=grad_source, flow=flow) + g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow, + colocate_with_first_write_call=False) + .grad(source=grad_source, flow=flow)) w_g = g.write(index, grad) return [None, None, w_g.flow] @@ -125,9 +125,9 @@ def _TensorArrayWriteGrad(op, flow): index = op.inputs[1] dtype = op.get_attr("T") grad_source = _GetGradSource(flow) - g = tensor_array_ops.TensorArray( - dtype=dtype, handle=handle, flow=flow).grad( - source=grad_source, flow=flow) + g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow, + colocate_with_first_write_call=False) + .grad(source=grad_source, flow=flow)) grad = g.read(index) return [None, None, grad, flow] @@ -156,9 +156,9 @@ def _TensorArrayGatherGrad(op, grad): flow = op.inputs[2] dtype = op.get_attr("dtype") grad_source = _GetGradSource(grad) - g = tensor_array_ops.TensorArray( - dtype=dtype, handle=handle, flow=flow).grad( - source=grad_source, flow=flow) + g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow, + colocate_with_first_write_call=False) + .grad(source=grad_source, flow=flow)) u_g = g.scatter(indices, grad) return [None, None, u_g.flow] @@ -180,9 +180,9 @@ def _TensorArrayScatterGrad(op, flow): indices = op.inputs[1] dtype = op.get_attr("T") grad_source = _GetGradSource(flow) - g = tensor_array_ops.TensorArray( - dtype=dtype, handle=handle, flow=flow).grad( - source=grad_source, flow=flow) + g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow, + colocate_with_first_write_call=False) + .grad(source=grad_source, flow=flow)) grad = g.gather(indices) return [None, None, grad, flow] @@ -211,9 +211,9 @@ def _TensorArrayConcatGrad(op, grad, unused_lengths_grad): lengths = op.outputs[1] dtype = op.get_attr("dtype") grad_source = _GetGradSource(grad) - g = tensor_array_ops.TensorArray( - dtype=dtype, handle=handle, flow=flow).grad( - source=grad_source, flow=flow) + g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow, + colocate_with_first_write_call=False) + .grad(source=grad_source, flow=flow)) u_g = g.split(grad, lengths=lengths) # handle, flow_in return [None, u_g.flow] @@ -235,9 +235,9 @@ def _TensorArraySplitGrad(op, flow): handle = op.inputs[0] dtype = op.get_attr("T") grad_source = _GetGradSource(flow) - g = tensor_array_ops.TensorArray( - dtype=dtype, handle=handle, flow=flow).grad( - source=grad_source, flow=flow) + g = (tensor_array_ops.TensorArray(dtype=dtype, handle=handle, flow=flow, + colocate_with_first_write_call=False) + .grad(source=grad_source, flow=flow)) grad = g.concat() # handle, value, lengths, flow_in return [None, grad, None, flow] diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py index 8b119f5842..7a6abc8e61 100644 --- a/tensorflow/python/ops/tensor_array_ops.py +++ b/tensorflow/python/ops/tensor_array_ops.py @@ -54,6 +54,7 @@ class TensorArray(object): flow=None, infer_shape=True, element_shape=None, + colocate_with_first_write_call=True, name=None): """Construct a new TensorArray or wrap an existing TensorArray handle. @@ -85,6 +86,11 @@ class TensorArray(object): element_shape: (optional, default: None) A `TensorShape` object specifying the shape constraints of each of the elements of the TensorArray. Need not be fully defined. + colocate_with_first_write_call: If `True`, the TensorArray will be + colocated on the same device as the the Tensor used on its first write + (write operations include `write`, `unstack`, and `split`). If `False`, + the TensorArray will be placed on the device determined by the + device context available during its initialization. name: A name for the operation (optional). Raises: @@ -120,7 +126,11 @@ class TensorArray(object): # Used to keep track of what tensors the TensorArray should be # colocated with. We choose to colocate the TensorArray with the # first tensor written to it. - self._colocate_with = [] + self._colocate_with_first_write_call = colocate_with_first_write_call + if colocate_with_first_write_call: + self._colocate_with = [] + else: + self._colocate_with = None # Record the current static shape for the array elements. The element # shape is defined either by `element_shape` or the shape of the tensor @@ -142,8 +152,8 @@ class TensorArray(object): # Construct the TensorArray with an empty device. The first # write into the TensorArray from a Tensor with a set device # will retroactively set the device value of this op. - with ops.device(None), ops.colocate_with(None, ignore_existing=True): - self._handle, self._flow = gen_data_flow_ops._tensor_array_v3( + def create(): + return gen_data_flow_ops._tensor_array_v3( dtype=dtype, size=size, element_shape=element_shape, @@ -151,6 +161,11 @@ class TensorArray(object): clear_after_read=clear_after_read, tensor_array_name=tensor_array_name, name=scope) + if colocate_with_first_write_call: + with ops.device(None), ops.colocate_with(None, ignore_existing=True): + self._handle, self._flow = create() + else: + self._handle, self._flow = create() @property def flow(self): @@ -200,10 +215,13 @@ class TensorArray(object): If no internal colocation group is set, colocate with `value` and set the internal colocation group to be value. """ - if not self._colocate_with: - self._colocate_with.append(value) - with ops.colocate_with(self._colocate_with[0]): + if not self._colocate_with_first_write_call: yield + else: + if not self._colocate_with: + self._colocate_with.append(value) + with ops.colocate_with(self._colocate_with[0]): + yield def identity(self): """Returns a TensorArray with the same content and properties. @@ -214,8 +232,10 @@ class TensorArray(object): Use this object all for subsequent operations. """ flow = array_ops.identity(self._flow) - ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow, - infer_shape=self._infer_shape) + ta = TensorArray( + dtype=self._dtype, handle=self._handle, flow=flow, + infer_shape=self._infer_shape, + colocate_with_first_write_call=self._colocate_with_first_write_call) ta._element_shape = self._element_shape ta._colocate_with = self._colocate_with return ta @@ -237,7 +257,8 @@ class TensorArray(object): dtype=self._dtype, handle=g_handle, flow=flow, - infer_shape=self._infer_shape) + infer_shape=self._infer_shape, + colocate_with_first_write_call=False) g._element_shape = self._element_shape return g @@ -286,7 +307,9 @@ class TensorArray(object): value=value, flow_in=self._flow, name=name) - ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out) + ta = TensorArray( + dtype=self._dtype, handle=self._handle, flow=flow_out, + colocate_with_first_write_call=self._colocate_with_first_write_call) ta._infer_shape = self._infer_shape ta._element_shape = self._element_shape ta._colocate_with = self._colocate_with @@ -416,7 +439,9 @@ class TensorArray(object): value=value, flow_in=self._flow, name=name) - ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out) + ta = TensorArray( + dtype=self._dtype, handle=self._handle, flow=flow_out, + colocate_with_first_write_call=self._colocate_with_first_write_call) ta._infer_shape = self._infer_shape ta._element_shape = self._element_shape ta._colocate_with = self._colocate_with @@ -456,7 +481,9 @@ class TensorArray(object): lengths=lengths_64, flow_in=self._flow, name=name) - ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out) + ta = TensorArray( + dtype=self._dtype, handle=self._handle, flow=flow_out, + colocate_with_first_write_call=self._colocate_with_first_write_call) ta._infer_shape = self._infer_shape ta._element_shape = self._element_shape ta._colocate_with = self._colocate_with diff --git a/tensorflow/python/training/queue_runner_impl.py b/tensorflow/python/training/queue_runner_impl.py index d713e222ae..4e58602a6f 100644 --- a/tensorflow/python/training/queue_runner_impl.py +++ b/tensorflow/python/training/queue_runner_impl.py @@ -22,6 +22,7 @@ import threading import weakref from tensorflow.core.protobuf import queue_runner_pb2 +from tensorflow.python.client import session from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging as logging @@ -401,6 +402,10 @@ def start_queue_runners(sess=None, coord=None, daemon=True, start=True, collection: A `GraphKey` specifying the graph collection to get the queue runners from. Defaults to `GraphKeys.QUEUE_RUNNERS`. + Raises: + ValueError: if `sess` is None and there isn't any default session. + TypeError: if `sess` is not a `tf.Session` object. + Returns: A list of threads. """ @@ -410,6 +415,15 @@ def start_queue_runners(sess=None, coord=None, daemon=True, start=True, raise ValueError("Cannot start queue runners: No default session is " "registered. Use `with sess.as_default()` or pass an " "explicit session to tf.start_queue_runners(sess=sess)") + + if not isinstance(sess, session.SessionInterface): + # Following check is due to backward compatibility. (b/62061352) + if sess.__class__.__name__ in [ + "MonitoredSession", "SingularMonitoredSession"]: + return [] + raise TypeError("sess must be a `tf.Session` object. " + "Given class: {}".format(sess.__class__)) + with sess.graph.as_default(): threads = [] for qr in ops.get_collection(collection): diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py index 5b00ac9fc3..51c0eecf46 100644 --- a/tensorflow/python/training/queue_runner_test.py +++ b/tensorflow/python/training/queue_runner_test.py @@ -30,6 +30,7 @@ from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import variables from tensorflow.python.platform import test from tensorflow.python.training import coordinator +from tensorflow.python.training import monitored_session from tensorflow.python.training import queue_runner_impl @@ -247,6 +248,33 @@ class QueueRunnerTest(test.TestCase): # The variable should be 3. self.assertEqual(3, var.eval()) + def testStartQueueRunnersRaisesIfNotASession(self): + zero64 = constant_op.constant(0, dtype=dtypes.int64) + var = variables.Variable(zero64) + count_up_to = var.count_up_to(3) + queue = data_flow_ops.FIFOQueue(10, dtypes.float32) + init_op = variables.global_variables_initializer() + qr = queue_runner_impl.QueueRunner(queue, [count_up_to]) + queue_runner_impl.add_queue_runner(qr) + with self.test_session(): + init_op.run() + with self.assertRaisesRegexp(TypeError, "tf.Session"): + queue_runner_impl.start_queue_runners("NotASession") + + def testStartQueueRunnersIgnoresMonitoredSession(self): + zero64 = constant_op.constant(0, dtype=dtypes.int64) + var = variables.Variable(zero64) + count_up_to = var.count_up_to(3) + queue = data_flow_ops.FIFOQueue(10, dtypes.float32) + init_op = variables.global_variables_initializer() + qr = queue_runner_impl.QueueRunner(queue, [count_up_to]) + queue_runner_impl.add_queue_runner(qr) + with self.test_session(): + init_op.run() + threads = queue_runner_impl.start_queue_runners( + monitored_session.MonitoredSession()) + self.assertFalse(threads) + def testStartQueueRunnersNonDefaultGraph(self): # CountUpTo will raise OUT_OF_RANGE when it reaches the count. graph = ops.Graph() diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py index 88df3351e6..05c99856d2 100644 --- a/tensorflow/python/util/tf_should_use.py +++ b/tensorflow/python/util/tf_should_use.py @@ -17,14 +17,52 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import functools +import itertools import traceback import types +import six # pylint: disable=unused-import + +from backports import weakref # pylint: disable=g-bad-import-order + from tensorflow.python.platform import tf_logging from tensorflow.python.util import tf_decorator +class _RefInfoField( + collections.namedtuple( + '_RefInfoField', ('type_', 'repr_', 'creation_stack', 'object_used'))): + pass + + +# Thread-safe up to int32max/2 thanks to python's GIL; and may be safe even for +# higher values in Python 3.4+. We don't expect to ever count higher than this. +# https://mail.python.org/pipermail/python-list/2005-April/342279.html +_REF_ITER = itertools.count() + +# Dictionary mapping id(obj) => _RefInfoField. +_REF_INFO = {} + + +def _deleted(obj_id, fatal_error): + obj = _REF_INFO[obj_id] + del _REF_INFO[obj_id] + if not obj.object_used: + if fatal_error: + logger = tf_logging.fatal + else: + logger = tf_logging.error + logger( + '==================================\n' + 'Object was never used (type %s):\n%s\nIf you want to mark it as ' + 'used call its "mark_used()" method.\nIt was originally created ' + 'here:\n%s\n' + '==================================' % + (obj.type_, obj.repr_, obj.creation_stack)) + + def _add_should_use_warning(x, fatal_error=False): """Wraps object x so that if it is never used, a warning is logged. @@ -39,14 +77,14 @@ def _add_should_use_warning(x, fatal_error=False): """ if x is None: # special corner case where x is None return x - has_been_used = getattr(x, '_tf_object_has_been_used', None) - if has_been_used is not None: - x._tf_object_has_been_used = has_been_used # pylint: disable=protected-access + if hasattr(x, '_tf_ref_id'): # this is already a TFShouldUseWarningWrapper return x def override_method(method): def fn(self, *args, **kwargs): - self._tf_object_has_been_used = True # pylint: disable=protected-access + # pylint: disable=protected-access + _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace( + object_used=True) return method(self, *args, **kwargs) return fn @@ -55,38 +93,36 @@ def _add_should_use_warning(x, fatal_error=False): def __init__(self, true_self): self.__dict__ = true_self.__dict__ - stack = [x.strip() for x in traceback.format_stack()] + stack = [s.strip() for s in traceback.format_stack()] # Remove top three stack entries from adding the wrapper - self._tf_object_creation_stack = '\n'.join(stack[:-3]) - self._tf_object_has_been_used = False + self.creation_stack = '\n'.join(stack[:-3]) + self._tf_ref_id = next(_REF_ITER) + _REF_INFO[self._tf_ref_id] = _RefInfoField( + type_=type(x), + repr_=repr(x), + creation_stack=stack, + object_used=False) + + # Create a finalizer for self, which will be called when self is + # garbage collected. Can't add self as the args because the + # loop will break garbage collection. We keep track of + # ourselves via python ids. + weakref.finalize(self, _deleted, self._tf_ref_id, fatal_error) # Not sure why this pylint warning is being used; this is not an # old class form. # pylint: disable=super-on-old-class def __getattribute__(self, name): - if name != '_tf_object_has_been_used': - self._tf_object_has_been_used = True + if name == '_tf_ref_id': + return super(TFShouldUseWarningWrapper, self).__getattribute__(name) + if self._tf_ref_id in _REF_INFO: + _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace( + object_used=True) return super(TFShouldUseWarningWrapper, self).__getattribute__(name) - def __del__(self): - if not self._tf_object_has_been_used: - if fatal_error: - logger = tf_logging.fatal - else: - logger = tf_logging.error - logger( - '==================================\n' - 'Object was never used (type %s):\n%s\nIf you want to mark it as ' - 'used call its "mark_used()" method.\nIt was originally created ' - 'here:\n%s\n' - '==================================' % - (type(x), x, self._tf_object_creation_stack)) - - if hasattr(super(TFShouldUseWarningWrapper, self), '__del__'): - return super(TFShouldUseWarningWrapper, self).__del__() - def mark_used(self, *args, **kwargs): - self._tf_object_has_been_used = True + _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace( + object_used=True) if hasattr(super(TFShouldUseWarningWrapper, self), 'mark_used'): return super(TFShouldUseWarningWrapper, self).mark_used(*args, **kwargs) # pylint: enable=super-on-old-class @@ -102,7 +138,8 @@ def _add_should_use_warning(x, fatal_error=False): wrapped = TFShouldUseWarningWrapper(x) wrapped.__doc__ = x.__doc__ # functools.wraps fails on some objects. - wrapped._tf_object_has_been_used = False # pylint: disable=protected-access + ref_id = wrapped._tf_ref_id # pylint: disable=protected-access + _REF_INFO[ref_id] = _REF_INFO[ref_id]._replace(object_used=False) return wrapped diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py index 71d48e3dde..c826874400 100644 --- a/tensorflow/python/util/tf_should_use_test.py +++ b/tensorflow/python/util/tf_should_use_test.py @@ -20,6 +20,7 @@ from __future__ import division from __future__ import print_function import contextlib +import gc import sys from tensorflow.python.framework import constant_op @@ -45,7 +46,7 @@ def reroute_error(captured): class TfShouldUseTest(test.TestCase): def testAddShouldUseWarningWhenNotUsed(self): - c = constant_op.constant(0, name='blah') + c = constant_op.constant(0, name='blah0') captured = [] with reroute_error(captured): def in_this_function(): @@ -53,44 +54,52 @@ class TfShouldUseTest(test.TestCase): del h in_this_function() self.assertIn('Object was never used', '\n'.join(captured)) - self.assertIn('blah:0', '\n'.join(captured)) + self.assertIn('blah0:0', '\n'.join(captured)) self.assertIn('in_this_function', '\n'.join(captured)) + gc.collect() + self.assertFalse(gc.garbage) - def _testAddShouldUseWarningWhenUsed(self, fn): - c = constant_op.constant(0, name='blah') + def _testAddShouldUseWarningWhenUsed(self, fn, name): + c = constant_op.constant(0, name=name) captured = [] with reroute_error(captured): h = tf_should_use._add_should_use_warning(c) fn(h) del h self.assertNotIn('Object was never used', '\n'.join(captured)) - self.assertNotIn('blah:0', '\n'.join(captured)) + self.assertNotIn('%s:0' % name, '\n'.join(captured)) def testAddShouldUseWarningWhenUsedWithAdd(self): def add(h): _ = h + 1 - self._testAddShouldUseWarningWhenUsed(add) + self._testAddShouldUseWarningWhenUsed(add, name='blah_add') + gc.collect() + self.assertFalse(gc.garbage) def testAddShouldUseWarningWhenUsedWithGetName(self): def get_name(h): _ = h.name - self._testAddShouldUseWarningWhenUsed(get_name) + self._testAddShouldUseWarningWhenUsed(get_name, name='blah_get_name') + gc.collect() + self.assertFalse(gc.garbage) def testShouldUseResult(self): @tf_should_use.should_use_result def return_const(value): - return constant_op.constant(value, name='blah') + return constant_op.constant(value, name='blah2') captured = [] with reroute_error(captured): return_const(0.0) self.assertIn('Object was never used', '\n'.join(captured)) - self.assertIn('blah:0', '\n'.join(captured)) + self.assertIn('blah2:0', '\n'.join(captured)) self.assertIn('return_const', '\n'.join(captured)) + gc.collect() + self.assertFalse(gc.garbage) def testShouldUseResultWhenNotReallyUsed(self): @tf_should_use.should_use_result def return_const(value): - return constant_op.constant(value, name='blah') + return constant_op.constant(value, name='blah3') captured = [] with reroute_error(captured): with self.test_session(): @@ -100,8 +109,10 @@ class TfShouldUseTest(test.TestCase): v = constant_op.constant(1.0, name='meh') v.eval() self.assertIn('Object was never used', '\n'.join(captured)) - self.assertIn('blah:0', '\n'.join(captured)) + self.assertIn('blah3:0', '\n'.join(captured)) self.assertIn('return_const', '\n'.join(captured)) + gc.collect() + self.assertFalse(gc.garbage) if __name__ == '__main__': diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt index a0fad4df52..ed088c41ed 100644 --- a/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt @@ -16,7 +16,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'dtype\', \'size\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'handle\', \'flow\', \'infer_shape\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'dtype\', \'size\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'handle\', \'flow\', \'infer_shape\', \'element_shape\', \'colocate_with_first_write_call\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'True\', \'None\'], " } member_method { name: "close" diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index b8f9fc8453..8768852dc7 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -85,3 +85,6 @@ pip2 install mock pip2 install portpicker pip3 install portpicker + +pip2 install backports.weakref==1.0rc1 +pip3 install backports.weakref==1.0rc1 diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index e7e2d256cd..edfc4e3a98 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -89,3 +89,6 @@ pip3.5 install wheel==0.29.0 pip3.5 install portpicker pip3.5 install werkzeug + +pip3.5 install backports.weakref==1.0rc1 + diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat index f124012edc..b4f9cc8476 100644 --- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat +++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat @@ -22,12 +22,14 @@ CALL "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" :: Turn echo back on, above script turns it off. ECHO ON -:: Some common variables to be shared between runs. -SET CMAKE_EXE="C:\Program Files\cmake\bin\cmake.exe" -SET SWIG_EXE="C:\swigwin-3.0.10\swig.exe" -SET PY_EXE="C:\Program Files\Anaconda3\python.exe" -SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib" -SET CUDNN_HOME="c:\tools\cuda" +:: Set environment variables to be shared between runs. Do not override if they +:: are set already. + +IF DEFINED CMAKE_EXE (ECHO CMAKE_EXE is set to %CMAKE_EXE%) ELSE (SET CMAKE_EXE="C:\Program Files\cmake\bin\cmake.exe") +IF DEFINED SWIG_EXE (ECHO SWIG_EXE is set to %SWIG_EXE%) ELSE (SET SWIG_EXE="C:\swigwin-3.0.10\swig.exe") +IF DEFINED PY_EXE (ECHO PY_EXE is set to %PY_EXE%) ELSE (SET PY_EXE="C:\Program Files\Anaconda3\python.exe") +IF DEFINED PY_LIB (ECHO PY_LIB is set to %PY_LIB%) ELSE (SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib") +IF DEFINED CUDNN_HOME (ECHO CUDNN_HOME is set to %CUDNN_HOME%) ELSE (SET CUDNN_HOME="c:\tools\cuda") SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe" diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat index 9307ebb66b..ba2d939b5f 100644 --- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat +++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat @@ -22,7 +22,7 @@ CD %BUILD_DIR% SET BUILD_CC_TESTS=OFF SET BUILD_PYTHON_TESTS=ON -SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe" +IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe") :: Run the CMAKE build to build the pip package. CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index a85a220270..a1676203c7 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -39,6 +39,7 @@ REQUIRED_PACKAGES = [ 'html5lib == 0.9999999', # identical to 1.0b8 'markdown == 2.2.0', 'bleach == 1.5.0', + 'backports.weakref == 1.0rc1', ] project_name = 'tensorflow' |