path: root/tensorflow/contrib/legacy_seq2seq
author    Jianwei Xie <xiejw@google.com>    2016-12-13 21:44:38 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>    2016-12-13 22:05:09 -0800
commitd438a07a2f50401c6a1d5a644512cd46fba4da4f (patch)
tree111ec8790aa29c3408959a43cb359ee46b8ada62 /tensorflow/contrib/legacy_seq2seq
parent2a6bd09d05881d3160885a386b3ac1fb7cf6a6e1 (diff)
Move implementation code of tf.nn.seq2seq to tf.contrib.
Change: 141978567
Diffstat (limited to 'tensorflow/contrib/legacy_seq2seq')
-rw-r--r--  tensorflow/contrib/legacy_seq2seq/BUILD | 8
-rw-r--r--  tensorflow/contrib/legacy_seq2seq/__init__.py | 26
-rw-r--r--  tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py | 1194
3 files changed, 1214 insertions, 14 deletions
diff --git a/tensorflow/contrib/legacy_seq2seq/BUILD b/tensorflow/contrib/legacy_seq2seq/BUILD
index d4e8582bcc..3fade19838 100644
--- a/tensorflow/contrib/legacy_seq2seq/BUILD
+++ b/tensorflow/contrib/legacy_seq2seq/BUILD
@@ -12,7 +12,13 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
py_library(
name = "seq2seq_py",
- srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
+ srcs = [
+ "__init__.py",
+ "python/__init__.py",
+ ] + glob(
+ ["python/ops/*.py"],
+ exclude = ["python/ops/**/*_test.py"],
+ ),
srcs_version = "PY2AND3",
visibility = ["//visibility:public"],
)
diff --git a/tensorflow/contrib/legacy_seq2seq/__init__.py b/tensorflow/contrib/legacy_seq2seq/__init__.py
index 1b9043645c..75069fe950 100644
--- a/tensorflow/contrib/legacy_seq2seq/__init__.py
+++ b/tensorflow/contrib/legacy_seq2seq/__init__.py
@@ -33,19 +33,19 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-from tensorflow.python.ops.seq2seq import attention_decoder
-from tensorflow.python.ops.seq2seq import basic_rnn_seq2seq
-from tensorflow.python.ops.seq2seq import embedding_attention_decoder
-from tensorflow.python.ops.seq2seq import embedding_attention_seq2seq
-from tensorflow.python.ops.seq2seq import embedding_rnn_decoder
-from tensorflow.python.ops.seq2seq import embedding_rnn_seq2seq
-from tensorflow.python.ops.seq2seq import embedding_tied_rnn_seq2seq
-from tensorflow.python.ops.seq2seq import model_with_buckets
-from tensorflow.python.ops.seq2seq import one2many_rnn_seq2seq
-from tensorflow.python.ops.seq2seq import rnn_decoder
-from tensorflow.python.ops.seq2seq import sequence_loss
-from tensorflow.python.ops.seq2seq import sequence_loss_by_example
-from tensorflow.python.ops.seq2seq import tied_rnn_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import attention_decoder
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import basic_rnn_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import embedding_attention_decoder
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import embedding_attention_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import embedding_rnn_decoder
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import embedding_rnn_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import embedding_tied_rnn_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import model_with_buckets
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import one2many_rnn_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import rnn_decoder
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import sequence_loss
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import sequence_loss_by_example
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import tied_rnn_seq2seq
from tensorflow.python.util.all_util import remove_undocumented
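The practical effect of this hunk for downstream code is that the public symbols now resolve through the contrib package rather than tensorflow.python.ops.seq2seq. A minimal sketch of the new import style (based only on the paths introduced by this commit; nothing is assumed about how tf.contrib wires the package in):

  # Old location (being emptied out by this change):
  #   from tensorflow.python.ops import seq2seq
  # New location created by this commit:
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  model_fn = seq2seq.basic_rnn_seq2seq  # same public symbols as before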
diff --git a/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py b/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py
new file mode 100644
index 0000000000..0582028b88
--- /dev/null
+++ b/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py
@@ -0,0 +1,1194 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Library for creating sequence-to-sequence models in TensorFlow.
+
+Sequence-to-sequence recurrent neural networks can learn complex functions
+that map input sequences to output sequences. These models yield very good
+results on a number of tasks, such as speech recognition, parsing, machine
+translation, or even constructing automated replies to emails.
+
+Before using this module, it is recommended to read the TensorFlow tutorial
+on sequence-to-sequence models. It explains the basic concepts of this module
+and shows an end-to-end example of how to build a translation model.
+ https://www.tensorflow.org/versions/master/tutorials/seq2seq/index.html
+
+Here is an overview of functions available in this module. They all use
+a very similar interface, so after reading the above tutorial and using
+one of them, others should be easy to substitute.
+
+* Full sequence-to-sequence models.
+ - basic_rnn_seq2seq: The most basic RNN-RNN model.
+ - tied_rnn_seq2seq: The basic model with tied encoder and decoder weights.
+ - embedding_rnn_seq2seq: The basic model with input embedding.
+ - embedding_tied_rnn_seq2seq: The tied model with input embedding.
+ - embedding_attention_seq2seq: Advanced model with input embedding and
+ the neural attention mechanism; recommended for complex tasks.
+
+* Multi-task sequence-to-sequence models.
+ - one2many_rnn_seq2seq: The embedding model with multiple decoders.
+
+* Decoders (when you write your own encoder, you can use these to decode;
+ e.g., if you want to write a model that generates captions for images).
+ - rnn_decoder: The basic decoder based on a pure RNN.
+ - attention_decoder: A decoder that uses the attention mechanism.
+
+* Losses.
+ - sequence_loss: Loss for a sequence model returning average log-perplexity.
+ - sequence_loss_by_example: As above, but not averaging over all examples.
+
+* model_with_buckets: A convenience function to create models with bucketing
+ (see the tutorial above for an explanation of why and how to use it).
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# We disable pylint because we need python3 compatibility.
+from six.moves import xrange # pylint: disable=redefined-builtin
+from six.moves import zip # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+# TODO(ebrevdo): Remove once _linear is fully deprecated.
+linear = rnn_cell_impl._linear # pylint: disable=protected-access
+
+
+def _extract_argmax_and_embed(embedding,
+ output_projection=None,
+ update_embedding=True):
+ """Get a loop_function that extracts the previous symbol and embeds it.
+
+ Args:
+ embedding: embedding tensor for symbols.
+ output_projection: None or a pair (W, B). If provided, each fed previous
+ output will first be multiplied by W and B will be added.
+ update_embedding: Boolean; if False, the gradients will not propagate
+ through the embeddings.
+
+ Returns:
+ A loop function.
+ """
+
+ def loop_function(prev, _):
+ if output_projection is not None:
+ prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
+ prev_symbol = math_ops.argmax(prev, 1)
+ # Note that gradients will not propagate through the second parameter of
+ # embedding_lookup.
+ emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
+ if not update_embedding:
+ emb_prev = array_ops.stop_gradient(emb_prev)
+ return emb_prev
+
+ return loop_function
+
+
+def rnn_decoder(decoder_inputs,
+ initial_state,
+ cell,
+ loop_function=None,
+ scope=None):
+ """RNN decoder for the sequence-to-sequence model.
+
+ Args:
+ decoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ initial_state: 2D Tensor with shape [batch_size x cell.state_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ loop_function: If not None, this function will be applied to the i-th output
+ in order to generate the i+1-st input, and decoder_inputs will be ignored,
+ except for the first element ("GO" symbol). This can be used for decoding,
+ but also for training to emulate http://arxiv.org/abs/1506.03099.
+ Signature -- loop_function(prev, i) = next
+ * prev is a 2D Tensor of shape [batch_size x output_size],
+ * i is an integer, the step number (when advanced control is needed),
+ * next is a 2D Tensor of shape [batch_size x input_size].
+ scope: VariableScope for the created subgraph; defaults to "rnn_decoder".
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x output_size] containing generated outputs.
+ state: The state of each cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+ (Note that in some cases, like basic RNN cell or GRU cell, outputs and
+ states can be the same. They are different for LSTM cells though.)
+ """
+ with variable_scope.variable_scope(scope or "rnn_decoder"):
+ state = initial_state
+ outputs = []
+ prev = None
+ for i, inp in enumerate(decoder_inputs):
+ if loop_function is not None and prev is not None:
+ with variable_scope.variable_scope("loop_function", reuse=True):
+ inp = loop_function(prev, i)
+ if i > 0:
+ variable_scope.get_variable_scope().reuse_variables()
+ output, state = cell(inp, state)
+ outputs.append(output)
+ if loop_function is not None:
+ prev = output
+ return outputs, state
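When the initial state comes from a custom encoder (e.g. an image captioner), rnn_decoder can be driven directly. A graph-construction sketch with purely illustrative shapes (the seq2seq import refers to the module added by this commit):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(24)
  dec_inp = [tf.placeholder(tf.float32, [None, 24]) for _ in range(5)]
  init_state = tf.placeholder(tf.float32, [None, cell.state_size])  # e.g. image features
  outputs, state = seq2seq.rnn_decoder(dec_inp, init_state, cell)
  # outputs: 5 tensors of shape [batch_size x 24]; state: [batch_size x cell.state_size]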
+
+
+def basic_rnn_seq2seq(encoder_inputs,
+ decoder_inputs,
+ cell,
+ dtype=dtypes.float32,
+ scope=None):
+ """Basic RNN sequence-to-sequence model.
+
+ This model first runs an RNN to encode encoder_inputs into a state vector,
+ then runs decoder, initialized with the last encoder state, on decoder_inputs.
+ Encoder and decoder use the same RNN cell type, but don't share parameters.
+
+ Args:
+ encoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ decoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ dtype: The dtype of the initial state of the RNN cell (default: tf.float32).
+ scope: VariableScope for the created subgraph; default: "basic_rnn_seq2seq".
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x output_size] containing the generated outputs.
+ state: The state of each decoder cell in the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+ """
+ with variable_scope.variable_scope(scope or "basic_rnn_seq2seq"):
+ _, enc_state = rnn.rnn(cell, encoder_inputs, dtype=dtype)
+ return rnn_decoder(decoder_inputs, enc_state, cell)
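A sketch of wiring the basic model (hypothetical sizes; without an output projection the per-step outputs have the cell's output size):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(32)
  enc_inp = [tf.placeholder(tf.float32, [None, 8]) for _ in range(6)]
  dec_inp = [tf.placeholder(tf.float32, [None, 8]) for _ in range(6)]
  outputs, state = seq2seq.basic_rnn_seq2seq(enc_inp, dec_inp, cell)  # 6 outputs of [batch x 32]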
+
+
+def tied_rnn_seq2seq(encoder_inputs,
+ decoder_inputs,
+ cell,
+ loop_function=None,
+ dtype=dtypes.float32,
+ scope=None):
+ """RNN sequence-to-sequence model with tied encoder and decoder parameters.
+
+ This model first runs an RNN to encode encoder_inputs into a state vector, and
+ then runs decoder, initialized with the last encoder state, on decoder_inputs.
+ Encoder and decoder use the same RNN cell and share parameters.
+
+ Args:
+ encoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ decoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ loop_function: If not None, this function will be applied to i-th output
+ in order to generate i+1-th input, and decoder_inputs will be ignored,
+ except for the first element ("GO" symbol), see rnn_decoder for details.
+ dtype: The dtype of the initial state of the rnn cell (default: tf.float32).
+ scope: VariableScope for the created subgraph; default: "tied_rnn_seq2seq".
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x output_size] containing the generated outputs.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+ """
+ with variable_scope.variable_scope("combined_tied_rnn_seq2seq"):
+ scope = scope or "tied_rnn_seq2seq"
+ _, enc_state = rnn.rnn(cell, encoder_inputs, dtype=dtype, scope=scope)
+ variable_scope.get_variable_scope().reuse_variables()
+ return rnn_decoder(
+ decoder_inputs,
+ enc_state,
+ cell,
+ loop_function=loop_function,
+ scope=scope)
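Because the tied variant reuses the very same cell variables for encoding and decoding, encoder and decoder inputs must share the same per-step input size. A sketch with illustrative sizes:

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(32)
  enc_inp = [tf.placeholder(tf.float32, [None, 16]) for _ in range(6)]
  dec_inp = [tf.placeholder(tf.float32, [None, 16]) for _ in range(6)]  # same input_size as encoder
  outputs, state = seq2seq.tied_rnn_seq2seq(enc_inp, dec_inp, cell)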
+
+
+def embedding_rnn_decoder(decoder_inputs,
+ initial_state,
+ cell,
+ num_symbols,
+ embedding_size,
+ output_projection=None,
+ feed_previous=False,
+ update_embedding_for_previous=True,
+ scope=None):
+ """RNN decoder with embedding and a pure-decoding option.
+
+ Args:
+ decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs).
+ initial_state: 2D Tensor [batch_size x cell.state_size].
+ cell: rnn_cell.RNNCell defining the cell function.
+ num_symbols: Integer, how many symbols come into the embedding.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ output_projection: None or a pair (W, B) of output projection weights and
+ biases; W has shape [output_size x num_symbols] and B has
+ shape [num_symbols]; if provided and feed_previous=True, each fed
+ previous output will first be multiplied by W and B will be added.
+ feed_previous: Boolean; if True, only the first of decoder_inputs will be
+ used (the "GO" symbol), and all other decoder inputs will be generated by:
+ next = embedding_lookup(embedding, argmax(previous_output)),
+ In effect, this implements a greedy decoder. It can also be used
+ during training to emulate http://arxiv.org/abs/1506.03099.
+ If False, decoder_inputs are used as given (the standard decoder case).
+ update_embedding_for_previous: Boolean; if False and feed_previous=True,
+ only the embedding for the first symbol of decoder_inputs (the "GO"
+ symbol) will be updated by back propagation. Embeddings for the symbols
+ generated from the decoder itself remain unchanged. This parameter has
+ no effect if feed_previous=False.
+ scope: VariableScope for the created subgraph; defaults to
+ "embedding_rnn_decoder".
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors. The
+ output is of shape [batch_size x cell.output_size] when
+ output_projection is not None (and represents the dense representation
+ of predicted tokens). It is of shape [batch_size x num_decoder_symbols]
+ when output_projection is None.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+
+ Raises:
+ ValueError: When output_projection has the wrong shape.
+ """
+ with variable_scope.variable_scope(scope or "embedding_rnn_decoder") as scope:
+ if output_projection is not None:
+ dtype = scope.dtype
+ proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype)
+ proj_weights.get_shape().assert_is_compatible_with([None, num_symbols])
+ proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
+ proj_biases.get_shape().assert_is_compatible_with([num_symbols])
+
+ embedding = variable_scope.get_variable("embedding",
+ [num_symbols, embedding_size])
+ loop_function = _extract_argmax_and_embed(
+ embedding, output_projection,
+ update_embedding_for_previous) if feed_previous else None
+ emb_inp = (embedding_ops.embedding_lookup(embedding, i)
+ for i in decoder_inputs)
+ return rnn_decoder(
+ emb_inp, initial_state, cell, loop_function=loop_function)
+
+
+def embedding_rnn_seq2seq(encoder_inputs,
+ decoder_inputs,
+ cell,
+ num_encoder_symbols,
+ num_decoder_symbols,
+ embedding_size,
+ output_projection=None,
+ feed_previous=False,
+ dtype=None,
+ scope=None):
+ """Embedding RNN sequence-to-sequence model.
+
+ This model first embeds encoder_inputs by a newly created embedding (of shape
+ [num_encoder_symbols x input_size]). Then it runs an RNN to encode
+ embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs
+ by another newly created embedding (of shape [num_decoder_symbols x
+ input_size]). Then it runs RNN decoder, initialized with the last
+ encoder state, on embedded decoder_inputs.
+
+ Args:
+ encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ num_encoder_symbols: Integer; number of symbols on the encoder side.
+ num_decoder_symbols: Integer; number of symbols on the decoder side.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ output_projection: None or a pair (W, B) of output projection weights and
+ biases; W has shape [output_size x num_decoder_symbols] and B has
+ shape [num_decoder_symbols]; if provided and feed_previous=True, each
+ fed previous output will first be multiplied by W and B will be added.
+ feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
+ of decoder_inputs will be used (the "GO" symbol), and all other decoder
+ inputs will be taken from previous outputs (as in embedding_rnn_decoder).
+ If False, decoder_inputs are used as given (the standard decoder case).
+ dtype: The dtype of the initial state for both the encoder and decoder
+ rnn cells (default: tf.float32).
+ scope: VariableScope for the created subgraph; defaults to
+ "embedding_rnn_seq2seq"
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors. The
+ output is of shape [batch_size x cell.output_size] when
+ output_projection is not None (and represents the dense representation
+ of predicted tokens). It is of shape [batch_size x num_decoder_symbols]
+ when output_projection is None.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+ """
+ with variable_scope.variable_scope(scope or "embedding_rnn_seq2seq") as scope:
+ if dtype is not None:
+ scope.set_dtype(dtype)
+ else:
+ dtype = scope.dtype
+
+ # Encoder.
+ encoder_cell = rnn_cell.EmbeddingWrapper(
+ cell,
+ embedding_classes=num_encoder_symbols,
+ embedding_size=embedding_size)
+ _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype)
+
+ # Decoder.
+ if output_projection is None:
+ cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
+
+ if isinstance(feed_previous, bool):
+ return embedding_rnn_decoder(
+ decoder_inputs,
+ encoder_state,
+ cell,
+ num_decoder_symbols,
+ embedding_size,
+ output_projection=output_projection,
+ feed_previous=feed_previous)
+
+ # If feed_previous is a Tensor, we construct 2 graphs and use cond.
+ def decoder(feed_previous_bool):
+ reuse = None if feed_previous_bool else True
+ with variable_scope.variable_scope(
+ variable_scope.get_variable_scope(), reuse=reuse) as scope:
+ outputs, state = embedding_rnn_decoder(
+ decoder_inputs,
+ encoder_state,
+ cell,
+ num_decoder_symbols,
+ embedding_size,
+ output_projection=output_projection,
+ feed_previous=feed_previous_bool,
+ update_embedding_for_previous=False)
+ state_list = [state]
+ if nest.is_sequence(state):
+ state_list = nest.flatten(state)
+ return outputs + state_list
+
+ outputs_and_state = control_flow_ops.cond(feed_previous,
+ lambda: decoder(True),
+ lambda: decoder(False))
+ outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs.
+ state_list = outputs_and_state[outputs_len:]
+ state = state_list[0]
+ if nest.is_sequence(encoder_state):
+ state = nest.pack_sequence_as(
+ structure=encoder_state, flat_sequence=state_list)
+ return outputs_and_state[:outputs_len], state
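A sketch of the embedding variant, which consumes integer symbol ids rather than dense vectors (vocabulary sizes hypothetical); setting feed_previous=True would turn the decoder into a greedy decoder for inference:

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(64)
  enc_inp = [tf.placeholder(tf.int32, [None]) for _ in range(10)]
  dec_inp = [tf.placeholder(tf.int32, [None]) for _ in range(10)]
  outputs, state = seq2seq.embedding_rnn_seq2seq(
      enc_inp, dec_inp, cell,
      num_encoder_symbols=1000, num_decoder_symbols=1200,
      embedding_size=64, feed_previous=False)
  # Each output is [batch_size x 1200] because output_projection is None.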
+
+
+def embedding_tied_rnn_seq2seq(encoder_inputs,
+ decoder_inputs,
+ cell,
+ num_symbols,
+ embedding_size,
+ num_decoder_symbols=None,
+ output_projection=None,
+ feed_previous=False,
+ dtype=None,
+ scope=None):
+ """Embedding RNN sequence-to-sequence model with tied (shared) parameters.
+
+ This model first embeds encoder_inputs by a newly created embedding (of shape
+ [num_symbols x input_size]). Then it runs an RNN to encode embedded
+ encoder_inputs into a state vector. Next, it embeds decoder_inputs using
+ the same embedding. Then it runs RNN decoder, initialized with the last
+ encoder state, on embedded decoder_inputs. The decoder output is over symbols
+ from 0 to num_decoder_symbols - 1 if num_decoder_symbols is provided; otherwise
+ it is over symbols from 0 to num_symbols - 1.
+
+ Args:
+ encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ num_symbols: Integer; number of symbols for both encoder and decoder.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ num_decoder_symbols: Integer; number of output symbols for decoder. If
+ provided, the decoder output is over symbols 0 to num_decoder_symbols - 1.
+ Otherwise, decoder output is over symbols 0 to num_symbols - 1. Note that
+ this assumes that the vocabulary is set up such that the first
+ num_decoder_symbols of num_symbols are part of decoding.
+ output_projection: None or a pair (W, B) of output projection weights and
+ biases; W has shape [output_size x num_symbols] and B has
+ shape [num_symbols]; if provided and feed_previous=True, each
+ fed previous output will first be multiplied by W and B will be added.
+ feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
+ of decoder_inputs will be used (the "GO" symbol), and all other decoder
+ inputs will be taken from previous outputs (as in embedding_rnn_decoder).
+ If False, decoder_inputs are used as given (the standard decoder case).
+ dtype: The dtype to use for the initial RNN states (default: tf.float32).
+ scope: VariableScope for the created subgraph; defaults to
+ "embedding_tied_rnn_seq2seq".
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x output_symbols] containing the generated
+ outputs where output_symbols = num_decoder_symbols if
+ num_decoder_symbols is not None otherwise output_symbols = num_symbols.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+
+ Raises:
+ ValueError: When output_projection has the wrong shape.
+ """
+ with variable_scope.variable_scope(
+ scope or "embedding_tied_rnn_seq2seq", dtype=dtype) as scope:
+ dtype = scope.dtype
+
+ if output_projection is not None:
+ proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype)
+ proj_weights.get_shape().assert_is_compatible_with([None, num_symbols])
+ proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
+ proj_biases.get_shape().assert_is_compatible_with([num_symbols])
+
+ embedding = variable_scope.get_variable(
+ "embedding", [num_symbols, embedding_size], dtype=dtype)
+
+ emb_encoder_inputs = [
+ embedding_ops.embedding_lookup(embedding, x) for x in encoder_inputs
+ ]
+ emb_decoder_inputs = [
+ embedding_ops.embedding_lookup(embedding, x) for x in decoder_inputs
+ ]
+
+ output_symbols = num_symbols
+ if num_decoder_symbols is not None:
+ output_symbols = num_decoder_symbols
+ if output_projection is None:
+ cell = rnn_cell.OutputProjectionWrapper(cell, output_symbols)
+
+ if isinstance(feed_previous, bool):
+ loop_function = _extract_argmax_and_embed(embedding, output_projection,
+ True) if feed_previous else None
+ return tied_rnn_seq2seq(
+ emb_encoder_inputs,
+ emb_decoder_inputs,
+ cell,
+ loop_function=loop_function,
+ dtype=dtype)
+
+ # If feed_previous is a Tensor, we construct 2 graphs and use cond.
+ def decoder(feed_previous_bool):
+ loop_function = _extract_argmax_and_embed(
+ embedding, output_projection, False) if feed_previous_bool else None
+ reuse = None if feed_previous_bool else True
+ with variable_scope.variable_scope(
+ variable_scope.get_variable_scope(), reuse=reuse):
+ outputs, state = tied_rnn_seq2seq(
+ emb_encoder_inputs,
+ emb_decoder_inputs,
+ cell,
+ loop_function=loop_function,
+ dtype=dtype)
+ state_list = [state]
+ if nest.is_sequence(state):
+ state_list = nest.flatten(state)
+ return outputs + state_list
+
+ outputs_and_state = control_flow_ops.cond(feed_previous,
+ lambda: decoder(True),
+ lambda: decoder(False))
+ outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs.
+ state_list = outputs_and_state[outputs_len:]
+ state = state_list[0]
+ # Calculate zero-state to know its structure.
+ static_batch_size = encoder_inputs[0].get_shape()[0]
+ for inp in encoder_inputs[1:]:
+ static_batch_size.merge_with(inp.get_shape()[0])
+ batch_size = static_batch_size.value
+ if batch_size is None:
+ batch_size = array_ops.shape(encoder_inputs[0])[0]
+ zero_state = cell.zero_state(batch_size, dtype)
+ if nest.is_sequence(zero_state):
+ state = nest.pack_sequence_as(
+ structure=zero_state, flat_sequence=state_list)
+ return outputs_and_state[:outputs_len], state
+
+
+def attention_decoder(decoder_inputs,
+ initial_state,
+ attention_states,
+ cell,
+ output_size=None,
+ num_heads=1,
+ loop_function=None,
+ dtype=None,
+ scope=None,
+ initial_state_attention=False):
+ """RNN decoder with attention for the sequence-to-sequence model.
+
+ In this context "attention" means that, during decoding, the RNN can look up
+ information in the additional tensor attention_states, and it does this by
+ focusing on a few entries from the tensor. This model has proven to yield
+ especially good results in a number of sequence-to-sequence tasks. This
+ implementation is based on http://arxiv.org/abs/1412.7449 (see below for
+ details). It is recommended for complex sequence-to-sequence tasks.
+
+ Args:
+ decoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ initial_state: 2D Tensor [batch_size x cell.state_size].
+ attention_states: 3D Tensor [batch_size x attn_length x attn_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ output_size: Size of the output vectors; if None, we use cell.output_size.
+ num_heads: Number of attention heads that read from attention_states.
+ loop_function: If not None, this function will be applied to i-th output
+ in order to generate i+1-th input, and decoder_inputs will be ignored,
+ except for the first element ("GO" symbol). This can be used for decoding,
+ but also for training to emulate http://arxiv.org/abs/1506.03099.
+ Signature -- loop_function(prev, i) = next
+ * prev is a 2D Tensor of shape [batch_size x output_size],
+ * i is an integer, the step number (when advanced control is needed),
+ * next is a 2D Tensor of shape [batch_size x input_size].
+ dtype: The dtype to use for the RNN initial state (default: tf.float32).
+ scope: VariableScope for the created subgraph; default: "attention_decoder".
+ initial_state_attention: If False (default), initial attentions are zero.
+ If True, initialize the attentions from the initial state and attention
+ states -- useful when we wish to resume decoding from a previously
+ stored decoder state and attention states.
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors of
+ shape [batch_size x output_size]. These represent the generated outputs.
+ Output i is computed from input i (which is either the i-th element
+ of decoder_inputs or loop_function(output_{i-1}, i)) as follows.
+ First, we run the cell on a combination of the input and previous
+ attention masks:
+ cell_output, new_state = cell(linear(input, prev_attn), prev_state).
+ Then, we calculate new attention masks:
+ new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
+ and then we calculate the output:
+ output = linear(cell_output, new_attn).
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+
+ Raises:
+ ValueError: when num_heads is not positive, there are no inputs, shapes
+ of attention_states are not set, or input size cannot be inferred
+ from the input.
+ """
+ if not decoder_inputs:
+ raise ValueError("Must provide at least 1 input to attention decoder.")
+ if num_heads < 1:
+ raise ValueError("With less than 1 heads, use a non-attention decoder.")
+ if attention_states.get_shape()[2].value is None:
+ raise ValueError("Shape[2] of attention_states must be known: %s" %
+ attention_states.get_shape())
+ if output_size is None:
+ output_size = cell.output_size
+
+ with variable_scope.variable_scope(
+ scope or "attention_decoder", dtype=dtype) as scope:
+ dtype = scope.dtype
+
+ batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping.
+ attn_length = attention_states.get_shape()[1].value
+ if attn_length is None:
+ attn_length = array_ops.shape(attention_states)[1]
+ attn_size = attention_states.get_shape()[2].value
+
+ # To calculate W1 * h_t we use a 1-by-1 convolution; we need to reshape first.
+ hidden = array_ops.reshape(attention_states,
+ [-1, attn_length, 1, attn_size])
+ hidden_features = []
+ v = []
+ attention_vec_size = attn_size # Size of query vectors for attention.
+ for a in xrange(num_heads):
+ k = variable_scope.get_variable("AttnW_%d" % a,
+ [1, 1, attn_size, attention_vec_size])
+ hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
+ v.append(
+ variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))
+
+ state = initial_state
+
+ def attention(query):
+ """Put attention masks on hidden using hidden_features and query."""
+ ds = [] # Results of attention reads will be stored here.
+ if nest.is_sequence(query): # If the query is a tuple, flatten it.
+ query_list = nest.flatten(query)
+ for q in query_list: # Check that ndims == 2 if specified.
+ ndims = q.get_shape().ndims
+ if ndims:
+ assert ndims == 2
+ query = array_ops.concat_v2(query_list, 1)
+ for a in xrange(num_heads):
+ with variable_scope.variable_scope("Attention_%d" % a):
+ y = linear(query, attention_vec_size, True)
+ y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
+ # Attention mask is a softmax of v^T * tanh(...).
+ s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y),
+ [2, 3])
+ a = nn_ops.softmax(s)
+ # Now calculate the attention-weighted vector d.
+ d = math_ops.reduce_sum(
+ array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
+ ds.append(array_ops.reshape(d, [-1, attn_size]))
+ return ds
+
+ outputs = []
+ prev = None
+ batch_attn_size = array_ops.pack([batch_size, attn_size])
+ attns = [
+ array_ops.zeros(
+ batch_attn_size, dtype=dtype) for _ in xrange(num_heads)
+ ]
+ for a in attns: # Ensure the second shape of attention vectors is set.
+ a.set_shape([None, attn_size])
+ if initial_state_attention:
+ attns = attention(initial_state)
+ for i, inp in enumerate(decoder_inputs):
+ if i > 0:
+ variable_scope.get_variable_scope().reuse_variables()
+ # If loop_function is set, we use it instead of decoder_inputs.
+ if loop_function is not None and prev is not None:
+ with variable_scope.variable_scope("loop_function", reuse=True):
+ inp = loop_function(prev, i)
+ # Merge input and previous attentions into one vector of the right size.
+ input_size = inp.get_shape().with_rank(2)[1]
+ if input_size.value is None:
+ raise ValueError("Could not infer input size from input: %s" % inp.name)
+ x = linear([inp] + attns, input_size, True)
+ # Run the RNN.
+ cell_output, state = cell(x, state)
+ # Run the attention mechanism.
+ if i == 0 and initial_state_attention:
+ with variable_scope.variable_scope(
+ variable_scope.get_variable_scope(), reuse=True):
+ attns = attention(state)
+ else:
+ attns = attention(state)
+
+ with variable_scope.variable_scope("AttnOutputProjection"):
+ output = linear([cell_output] + attns, output_size, True)
+ if loop_function is not None:
+ prev = output
+ outputs.append(output)
+
+ return outputs, state
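A direct-call sketch for pairing the attention decoder with a custom encoder whose per-step outputs are stacked into attention_states (shapes are illustrative; the last dimension of attention_states must be statically known):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(48)
  dec_inp = [tf.placeholder(tf.float32, [None, 48]) for _ in range(7)]
  init_state = tf.placeholder(tf.float32, [None, cell.state_size])
  attn_states = tf.placeholder(tf.float32, [None, 11, 48])  # [batch x attn_length x attn_size]
  outputs, state = seq2seq.attention_decoder(dec_inp, init_state, attn_states, cell)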
+
+
+def embedding_attention_decoder(decoder_inputs,
+ initial_state,
+ attention_states,
+ cell,
+ num_symbols,
+ embedding_size,
+ num_heads=1,
+ output_size=None,
+ output_projection=None,
+ feed_previous=False,
+ update_embedding_for_previous=True,
+ dtype=None,
+ scope=None,
+ initial_state_attention=False):
+ """RNN decoder with embedding and attention and a pure-decoding option.
+
+ Args:
+ decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs).
+ initial_state: 2D Tensor [batch_size x cell.state_size].
+ attention_states: 3D Tensor [batch_size x attn_length x attn_size].
+ cell: rnn_cell.RNNCell defining the cell function.
+ num_symbols: Integer, how many symbols come into the embedding.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ num_heads: Number of attention heads that read from attention_states.
+ output_size: Size of the output vectors; if None, use cell.output_size.
+ output_projection: None or a pair (W, B) of output projection weights and
+ biases; W has shape [output_size x num_symbols] and B has shape
+ [num_symbols]; if provided and feed_previous=True, each fed previous
+ output will first be multiplied by W and B will be added.
+ feed_previous: Boolean; if True, only the first of decoder_inputs will be
+ used (the "GO" symbol), and all other decoder inputs will be generated by:
+ next = embedding_lookup(embedding, argmax(previous_output)),
+ In effect, this implements a greedy decoder. It can also be used
+ during training to emulate http://arxiv.org/abs/1506.03099.
+ If False, decoder_inputs are used as given (the standard decoder case).
+ update_embedding_for_previous: Boolean; if False and feed_previous=True,
+ only the embedding for the first symbol of decoder_inputs (the "GO"
+ symbol) will be updated by back propagation. Embeddings for the symbols
+ generated from the decoder itself remain unchanged. This parameter has
+ no effect if feed_previous=False.
+ dtype: The dtype to use for the RNN initial states (default: tf.float32).
+ scope: VariableScope for the created subgraph; defaults to
+ "embedding_attention_decoder".
+ initial_state_attention: If False (default), initial attentions are zero.
+ If True, initialize the attentions from the initial state and attention
+ states -- useful when we wish to resume decoding from a previously
+ stored decoder state and attention states.
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x output_size] containing the generated outputs.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+
+ Raises:
+ ValueError: When output_projection has the wrong shape.
+ """
+ if output_size is None:
+ output_size = cell.output_size
+ if output_projection is not None:
+ proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
+ proj_biases.get_shape().assert_is_compatible_with([num_symbols])
+
+ with variable_scope.variable_scope(
+ scope or "embedding_attention_decoder", dtype=dtype) as scope:
+
+ embedding = variable_scope.get_variable("embedding",
+ [num_symbols, embedding_size])
+ loop_function = _extract_argmax_and_embed(
+ embedding, output_projection,
+ update_embedding_for_previous) if feed_previous else None
+ emb_inp = [
+ embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs
+ ]
+ return attention_decoder(
+ emb_inp,
+ initial_state,
+ attention_states,
+ cell,
+ output_size=output_size,
+ num_heads=num_heads,
+ loop_function=loop_function,
+ initial_state_attention=initial_state_attention)
+
+
+def embedding_attention_seq2seq(encoder_inputs,
+ decoder_inputs,
+ cell,
+ num_encoder_symbols,
+ num_decoder_symbols,
+ embedding_size,
+ num_heads=1,
+ output_projection=None,
+ feed_previous=False,
+ dtype=None,
+ scope=None,
+ initial_state_attention=False):
+ """Embedding sequence-to-sequence model with attention.
+
+ This model first embeds encoder_inputs by a newly created embedding (of shape
+ [num_encoder_symbols x input_size]). Then it runs an RNN to encode
+ embedded encoder_inputs into a state vector. It keeps the outputs of this
+ RNN at every step to use for attention later. Next, it embeds decoder_inputs
+ by another newly created embedding (of shape [num_decoder_symbols x
+ input_size]). Then it runs attention decoder, initialized with the last
+ encoder state, on embedded decoder_inputs and attending to encoder outputs.
+
+ Warning: when output_projection is None, the size of the attention vectors
+ and variables will be made proportional to num_decoder_symbols, which can be large.
+
+ Args:
+ encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ num_encoder_symbols: Integer; number of symbols on the encoder side.
+ num_decoder_symbols: Integer; number of symbols on the decoder side.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ num_heads: Number of attention heads that read from attention_states.
+ output_projection: None or a pair (W, B) of output projection weights and
+ biases; W has shape [output_size x num_decoder_symbols] and B has
+ shape [num_decoder_symbols]; if provided and feed_previous=True, each
+ fed previous output will first be multiplied by W and B will be added.
+ feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
+ of decoder_inputs will be used (the "GO" symbol), and all other decoder
+ inputs will be taken from previous outputs (as in embedding_rnn_decoder).
+ If False, decoder_inputs are used as given (the standard decoder case).
+ dtype: The dtype of the initial RNN state (default: tf.float32).
+ scope: VariableScope for the created subgraph; defaults to
+ "embedding_attention_seq2seq".
+ initial_state_attention: If False (default), initial attentions are zero.
+ If True, initialize the attentions from the initial state and attention
+ states.
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x num_decoder_symbols] containing the generated
+ outputs.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+ """
+ with variable_scope.variable_scope(
+ scope or "embedding_attention_seq2seq", dtype=dtype) as scope:
+ dtype = scope.dtype
+ # Encoder.
+ encoder_cell = rnn_cell.EmbeddingWrapper(
+ cell,
+ embedding_classes=num_encoder_symbols,
+ embedding_size=embedding_size)
+ encoder_outputs, encoder_state = rnn.rnn(encoder_cell,
+ encoder_inputs,
+ dtype=dtype)
+
+ # First calculate a concatenation of encoder outputs to put attention on.
+ top_states = [
+ array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs
+ ]
+ attention_states = array_ops.concat_v2(top_states, 1)
+
+ # Decoder.
+ output_size = None
+ if output_projection is None:
+ cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
+ output_size = num_decoder_symbols
+
+ if isinstance(feed_previous, bool):
+ return embedding_attention_decoder(
+ decoder_inputs,
+ encoder_state,
+ attention_states,
+ cell,
+ num_decoder_symbols,
+ embedding_size,
+ num_heads=num_heads,
+ output_size=output_size,
+ output_projection=output_projection,
+ feed_previous=feed_previous,
+ initial_state_attention=initial_state_attention)
+
+ # If feed_previous is a Tensor, we construct 2 graphs and use cond.
+ def decoder(feed_previous_bool):
+ reuse = None if feed_previous_bool else True
+ with variable_scope.variable_scope(
+ variable_scope.get_variable_scope(), reuse=reuse) as scope:
+ outputs, state = embedding_attention_decoder(
+ decoder_inputs,
+ encoder_state,
+ attention_states,
+ cell,
+ num_decoder_symbols,
+ embedding_size,
+ num_heads=num_heads,
+ output_size=output_size,
+ output_projection=output_projection,
+ feed_previous=feed_previous_bool,
+ update_embedding_for_previous=False,
+ initial_state_attention=initial_state_attention)
+ state_list = [state]
+ if nest.is_sequence(state):
+ state_list = nest.flatten(state)
+ return outputs + state_list
+
+ outputs_and_state = control_flow_ops.cond(feed_previous,
+ lambda: decoder(True),
+ lambda: decoder(False))
+ outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs.
+ state_list = outputs_and_state[outputs_len:]
+ state = state_list[0]
+ if nest.is_sequence(encoder_state):
+ state = nest.pack_sequence_as(
+ structure=encoder_state, flat_sequence=state_list)
+ return outputs_and_state[:outputs_len], state
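A sketch of the recommended attention model; because feed_previous may be a scalar boolean tensor, a single graph can switch between teacher forcing and greedy decoding (sizes hypothetical):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(128)
  enc_inp = [tf.placeholder(tf.int32, [None]) for _ in range(20)]
  dec_inp = [tf.placeholder(tf.int32, [None]) for _ in range(20)]
  feed_prev = tf.placeholder(tf.bool, [])  # flip at run time: False = training, True = decoding
  outputs, state = seq2seq.embedding_attention_seq2seq(
      enc_inp, dec_inp, cell,
      num_encoder_symbols=40000, num_decoder_symbols=40000,
      embedding_size=128, feed_previous=feed_prev)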
+
+
+def one2many_rnn_seq2seq(encoder_inputs,
+ decoder_inputs_dict,
+ cell,
+ num_encoder_symbols,
+ num_decoder_symbols_dict,
+ embedding_size,
+ feed_previous=False,
+ dtype=None,
+ scope=None):
+ """One-to-many RNN sequence-to-sequence model (multi-task).
+
+ This is a multi-task sequence-to-sequence model with one encoder and multiple
+ decoders. Reference to multi-task sequence-to-sequence learning can be found
+ here: http://arxiv.org/abs/1511.06114
+
+ Args:
+ encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ decoder_inputs_dict: A dictionary mapping decoder name (string) to
+ the corresponding decoder_inputs; each decoder_inputs is a list of 1D
+ Tensors of shape [batch_size]; num_decoders is defined as
+ len(decoder_inputs_dict).
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ num_encoder_symbols: Integer; number of symbols on the encoder side.
+ num_decoder_symbols_dict: A dictionary mapping decoder name (string) to an
+ integer specifying number of symbols for the corresponding decoder;
+ len(num_decoder_symbols_dict) must be equal to num_decoders.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of
+ decoder_inputs will be used (the "GO" symbol), and all other decoder
+ inputs will be taken from previous outputs (as in embedding_rnn_decoder).
+ If False, decoder_inputs are used as given (the standard decoder case).
+ dtype: The dtype of the initial state for both the encoder and decoder
+ rnn cells (default: tf.float32).
+ scope: VariableScope for the created subgraph; defaults to
+ "one2many_rnn_seq2seq"
+
+ Returns:
+ A tuple of the form (outputs_dict, state_dict), where:
+ outputs_dict: A mapping from decoder name (string) to a list of the same
+ length as decoder_inputs_dict[name]; each element in the list is a 2D
+ Tensor with shape [batch_size x num_decoder_symbols_dict[name]]
+ containing the generated outputs.
+ state_dict: A mapping from decoder name (string) to the final state of the
+ corresponding decoder RNN; it is a 2D Tensor of shape
+ [batch_size x cell.state_size].
+ """
+ outputs_dict = {}
+ state_dict = {}
+
+ with variable_scope.variable_scope(
+ scope or "one2many_rnn_seq2seq", dtype=dtype) as scope:
+ dtype = scope.dtype
+
+ # Encoder.
+ encoder_cell = rnn_cell.EmbeddingWrapper(
+ cell,
+ embedding_classes=num_encoder_symbols,
+ embedding_size=embedding_size)
+ _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype)
+
+ # Decoder.
+ for name, decoder_inputs in decoder_inputs_dict.items():
+ num_decoder_symbols = num_decoder_symbols_dict[name]
+
+ with variable_scope.variable_scope("one2many_decoder_" + str(
+ name)) as scope:
+ decoder_cell = rnn_cell.OutputProjectionWrapper(cell,
+ num_decoder_symbols)
+ if isinstance(feed_previous, bool):
+ outputs, state = embedding_rnn_decoder(
+ decoder_inputs,
+ encoder_state,
+ decoder_cell,
+ num_decoder_symbols,
+ embedding_size,
+ feed_previous=feed_previous)
+ else:
+ # If feed_previous is a Tensor, we construct 2 graphs and use cond.
+ def filled_embedding_rnn_decoder(feed_previous):
+ """The current decoder with a fixed feed_previous parameter."""
+ # pylint: disable=cell-var-from-loop
+ reuse = None if feed_previous else True
+ vs = variable_scope.get_variable_scope()
+ with variable_scope.variable_scope(vs, reuse=reuse):
+ outputs, state = embedding_rnn_decoder(
+ decoder_inputs,
+ encoder_state,
+ decoder_cell,
+ num_decoder_symbols,
+ embedding_size,
+ feed_previous=feed_previous)
+ # pylint: enable=cell-var-from-loop
+ state_list = [state]
+ if nest.is_sequence(state):
+ state_list = nest.flatten(state)
+ return outputs + state_list
+
+ outputs_and_state = control_flow_ops.cond(
+ feed_previous, lambda: filled_embedding_rnn_decoder(True),
+ lambda: filled_embedding_rnn_decoder(False))
+ # Outputs length is the same as for decoder inputs.
+ outputs_len = len(decoder_inputs)
+ outputs = outputs_and_state[:outputs_len]
+ state_list = outputs_and_state[outputs_len:]
+ state = state_list[0]
+ if nest.is_sequence(encoder_state):
+ state = nest.pack_sequence_as(
+ structure=encoder_state, flat_sequence=state_list)
+ outputs_dict[name] = outputs
+ state_dict[name] = state
+
+ return outputs_dict, state_dict
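A multi-task sketch with one shared encoder and two hypothetical decoders ("pos" and "chunk"; all sizes illustrative):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(64)
  enc_inp = [tf.placeholder(tf.int32, [None]) for _ in range(8)]
  dec_dict = {"pos": [tf.placeholder(tf.int32, [None]) for _ in range(8)],
              "chunk": [tf.placeholder(tf.int32, [None]) for _ in range(8)]}
  outputs_dict, state_dict = seq2seq.one2many_rnn_seq2seq(
      enc_inp, dec_dict, cell,
      num_encoder_symbols=5000,
      num_decoder_symbols_dict={"pos": 45, "chunk": 23},
      embedding_size=64)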
+
+
+def sequence_loss_by_example(logits,
+ targets,
+ weights,
+ average_across_timesteps=True,
+ softmax_loss_function=None,
+ name=None):
+ """Weighted cross-entropy loss for a sequence of logits (per example).
+
+ Args:
+ logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].
+ targets: List of 1D batch-sized int32 Tensors of the same length as logits.
+ weights: List of 1D batch-sized float-Tensors of the same length as logits.
+ average_across_timesteps: If set, divide the returned cost by the total
+ label weight.
+ softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch
+ to be used instead of the standard softmax (the default if this is None).
+ name: Optional name for this operation, default: "sequence_loss_by_example".
+
+ Returns:
+ 1D batch-sized float Tensor: The log-perplexity for each sequence.
+
+ Raises:
+ ValueError: If len(logits) is different from len(targets) or len(weights).
+ """
+ if len(targets) != len(logits) or len(weights) != len(logits):
+ raise ValueError("Lengths of logits, weights, and targets must be the same "
+ "%d, %d, %d." % (len(logits), len(weights), len(targets)))
+ with ops.name_scope(name, "sequence_loss_by_example",
+ logits + targets + weights):
+ log_perp_list = []
+ for logit, target, weight in zip(logits, targets, weights):
+ if softmax_loss_function is None:
+ # TODO(irving,ebrevdo): This reshape is needed because
+ # sequence_loss_by_example is called with scalars sometimes, which
+ # violates our general scalar strictness policy.
+ target = array_ops.reshape(target, [-1])
+ crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
+ logits=logit, labels=target)
+ else:
+ crossent = softmax_loss_function(target, logit)
+ log_perp_list.append(crossent * weight)
+ log_perps = math_ops.add_n(log_perp_list)
+ if average_across_timesteps:
+ total_size = math_ops.add_n(weights)
+ total_size += 1e-12 # Just to avoid division by 0 for all-0 weights.
+ log_perps /= total_size
+ return log_perps
+
+
+def sequence_loss(logits,
+ targets,
+ weights,
+ average_across_timesteps=True,
+ average_across_batch=True,
+ softmax_loss_function=None,
+ name=None):
+ """Weighted cross-entropy loss for a sequence of logits, batch-collapsed.
+
+ Args:
+ logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].
+ targets: List of 1D batch-sized int32 Tensors of the same length as logits.
+ weights: List of 1D batch-sized float-Tensors of the same length as logits.
+ average_across_timesteps: If set, divide the returned cost by the total
+ label weight.
+ average_across_batch: If set, divide the returned cost by the batch size.
+ softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch
+ to be used instead of the standard softmax (the default if this is None).
+ name: Optional name for this operation, defaults to "sequence_loss".
+
+ Returns:
+ A scalar float Tensor: The average log-perplexity per symbol (weighted).
+
+ Raises:
+ ValueError: If len(logits) is different from len(targets) or len(weights).
+ """
+ with ops.name_scope(name, "sequence_loss", logits + targets + weights):
+ cost = math_ops.reduce_sum(
+ sequence_loss_by_example(
+ logits,
+ targets,
+ weights,
+ average_across_timesteps=average_across_timesteps,
+ softmax_loss_function=softmax_loss_function))
+ if average_across_batch:
+ batch_size = array_ops.shape(targets[0])[0]
+ return cost / math_ops.cast(batch_size, cost.dtype)
+ else:
+ return cost
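A sketch of how either loss pairs with a decoder's logits; targets are typically the decoder inputs shifted by one step, and weights are 0.0 on padding positions (shapes hypothetical):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  logits = [tf.placeholder(tf.float32, [None, 1200]) for _ in range(10)]  # decoder outputs
  targets = [tf.placeholder(tf.int32, [None]) for _ in range(10)]
  weights = [tf.placeholder(tf.float32, [None]) for _ in range(10)]       # 0.0 on padding steps
  loss = seq2seq.sequence_loss(logits, targets, weights)                  # scalar
  per_seq = seq2seq.sequence_loss_by_example(logits, targets, weights)    # shape [batch_size]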
+
+
+def model_with_buckets(encoder_inputs,
+ decoder_inputs,
+ targets,
+ weights,
+ buckets,
+ seq2seq,
+ softmax_loss_function=None,
+ per_example_loss=False,
+ name=None):
+ """Create a sequence-to-sequence model with support for bucketing.
+
+ The seq2seq argument is a function that defines a sequence-to-sequence model,
+ e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24))
+
+ Args:
+ encoder_inputs: A list of Tensors to feed the encoder; first seq2seq input.
+ decoder_inputs: A list of Tensors to feed the decoder; second seq2seq input.
+ targets: A list of 1D batch-sized int32 Tensors (desired output sequence).
+ weights: List of 1D batch-sized float-Tensors to weight the targets.
+ buckets: A list of pairs of (input size, output size) for each bucket.
+ seq2seq: A sequence-to-sequence model function; it takes 2 input that
+ agree with encoder_inputs and decoder_inputs, and returns a pair
+ consisting of outputs and states (as, e.g., basic_rnn_seq2seq).
+ softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch
+ to be used instead of the standard softmax (the default if this is None).
+ per_example_loss: Boolean. If set, the returned loss will be a batch-sized
+ tensor of losses for each sequence in the batch. If unset, it will be
+ a scalar with the averaged loss from all examples.
+ name: Optional name for this operation, defaults to "model_with_buckets".
+
+ Returns:
+ A tuple of the form (outputs, losses), where:
+ outputs: The outputs for each bucket. Its j'th element consists of a list
+ of 2D Tensors. The shape of output tensors can be either
+ [batch_size x output_size] or [batch_size x num_decoder_symbols]
+ depending on the seq2seq model used.
+ losses: List of scalar Tensors, representing losses for each bucket, or,
+ if per_example_loss is set, a list of 1D batch-sized float Tensors.
+
+ Raises:
+ ValueError: If length of encoder_inputs, targets, or weights is smaller
+ than the largest (last) bucket.
+ """
+ if len(encoder_inputs) < buckets[-1][0]:
+ raise ValueError("Length of encoder_inputs (%d) must be at least that of la"
+ "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0]))
+ if len(targets) < buckets[-1][1]:
+ raise ValueError("Length of targets (%d) must be at least that of last"
+ "bucket (%d)." % (len(targets), buckets[-1][1]))
+ if len(weights) < buckets[-1][1]:
+ raise ValueError("Length of weights (%d) must be at least that of last"
+ "bucket (%d)." % (len(weights), buckets[-1][1]))
+
+ all_inputs = encoder_inputs + decoder_inputs + targets + weights
+ losses = []
+ outputs = []
+ with ops.name_scope(name, "model_with_buckets", all_inputs):
+ for j, bucket in enumerate(buckets):
+ with variable_scope.variable_scope(
+ variable_scope.get_variable_scope(), reuse=True if j > 0 else None):
+ bucket_outputs, _ = seq2seq(encoder_inputs[:bucket[0]],
+ decoder_inputs[:bucket[1]])
+ outputs.append(bucket_outputs)
+ if per_example_loss:
+ losses.append(
+ sequence_loss_by_example(
+ outputs[-1],
+ targets[:bucket[1]],
+ weights[:bucket[1]],
+ softmax_loss_function=softmax_loss_function))
+ else:
+ losses.append(
+ sequence_loss(
+ outputs[-1],
+ targets[:bucket[1]],
+ weights[:bucket[1]],
+ softmax_loss_function=softmax_loss_function))
+
+ return outputs, losses
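A bucketing sketch; the placeholder lists only need to be as long as the largest bucket, and the hypothetical seq2seq_f closure is what model_with_buckets re-invokes (with variable reuse) for each bucket:

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  buckets = [(5, 10), (10, 15)]
  cell = tf.nn.rnn_cell.GRUCell(64)
  enc_inp = [tf.placeholder(tf.int32, [None]) for _ in range(10)]   # >= largest encoder bucket
  dec_inp = [tf.placeholder(tf.int32, [None]) for _ in range(15)]   # >= largest decoder bucket
  targets = [tf.placeholder(tf.int32, [None]) for _ in range(15)]
  weights = [tf.placeholder(tf.float32, [None]) for _ in range(15)]

  def seq2seq_f(x, y):
    return seq2seq.embedding_rnn_seq2seq(
        x, y, cell, num_encoder_symbols=1000, num_decoder_symbols=1000,
        embedding_size=64)

  outputs, losses = seq2seq.model_with_buckets(
      enc_inp, dec_inp, targets, weights, buckets, seq2seq_f)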