path: root/tensorflow/contrib/legacy_seq2seq
author    Jianwei Xie <xiejw@google.com>    2016-12-13 21:44:38 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>    2016-12-13 22:05:09 -0800
commitd438a07a2f50401c6a1d5a644512cd46fba4da4f (patch)
tree111ec8790aa29c3408959a43cb359ee46b8ada62 /tensorflow/contrib/legacy_seq2seq
parent2a6bd09d05881d3160885a386b3ac1fb7cf6a6e1 (diff)
Move implementation code of tf.nn.seq2seq to tf.contrib.
Change: 141978567
Diffstat (limited to 'tensorflow/contrib/legacy_seq2seq')
-rw-r--r--  tensorflow/contrib/legacy_seq2seq/BUILD | 8
-rw-r--r--  tensorflow/contrib/legacy_seq2seq/__init__.py | 26
-rw-r--r--  tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py | 1194
3 files changed, 1214 insertions, 14 deletions
diff --git a/tensorflow/contrib/legacy_seq2seq/BUILD b/tensorflow/contrib/legacy_seq2seq/BUILD
index d4e8582bcc..3fade19838 100644
--- a/tensorflow/contrib/legacy_seq2seq/BUILD
+++ b/tensorflow/contrib/legacy_seq2seq/BUILD
@@ -12,7 +12,13 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
py_library(
name = "seq2seq_py",
- srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
+ srcs = [
+ "__init__.py",
+ "python/__init__.py",
+ ] + glob(
+ ["python/ops/*.py"],
+ exclude = ["python/ops/**/*_test.py"],
+ ),
srcs_version = "PY2AND3",
visibility = ["//visibility:public"],
)
diff --git a/tensorflow/contrib/legacy_seq2seq/__init__.py b/tensorflow/contrib/legacy_seq2seq/__init__.py
index 1b9043645c..75069fe950 100644
--- a/tensorflow/contrib/legacy_seq2seq/__init__.py
+++ b/tensorflow/contrib/legacy_seq2seq/__init__.py
@@ -33,19 +33,19 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-from tensorflow.python.ops.seq2seq import attention_decoder
-from tensorflow.python.ops.seq2seq import basic_rnn_seq2seq
-from tensorflow.python.ops.seq2seq import embedding_attention_decoder
-from tensorflow.python.ops.seq2seq import embedding_attention_seq2seq
-from tensorflow.python.ops.seq2seq import embedding_rnn_decoder
-from tensorflow.python.ops.seq2seq import embedding_rnn_seq2seq
-from tensorflow.python.ops.seq2seq import embedding_tied_rnn_seq2seq
-from tensorflow.python.ops.seq2seq import model_with_buckets
-from tensorflow.python.ops.seq2seq import one2many_rnn_seq2seq
-from tensorflow.python.ops.seq2seq import rnn_decoder
-from tensorflow.python.ops.seq2seq import sequence_loss
-from tensorflow.python.ops.seq2seq import sequence_loss_by_example
-from tensorflow.python.ops.seq2seq import tied_rnn_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import attention_decoder
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import basic_rnn_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import embedding_attention_decoder
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import embedding_attention_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import embedding_rnn_decoder
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import embedding_rnn_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import embedding_tied_rnn_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import model_with_buckets
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import one2many_rnn_seq2seq
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import rnn_decoder
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import sequence_loss
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import sequence_loss_by_example
+from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import tied_rnn_seq2seq
from tensorflow.python.util.all_util import remove_undocumented
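The practical effect of this hunk for downstream code is that the public symbols now resolve through the contrib package rather than tensorflow.python.ops.seq2seq. A minimal sketch of the new import style (based only on the paths introduced by this commit; nothing is assumed about how tf.contrib wires the package in):

  # Old location (being emptied out by this change):
  #   from tensorflow.python.ops import seq2seq
  # New location created by this commit:
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  model_fn = seq2seq.basic_rnn_seq2seq  # same public symbols as before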
diff --git a/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py b/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py
new file mode 100644
index 0000000000..0582028b88
--- /dev/null
+++ b/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py
@@ -0,0 +1,1194 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Library for creating sequence-to-sequence models in TensorFlow.
+
+Sequence-to-sequence recurrent neural networks can learn complex functions
+that map input sequences to output sequences. These models yield very good
+results on a number of tasks, such as speech recognition, parsing, machine
+translation, or even constructing automated replies to emails.
+
+Before using this module, it is recommended to read the TensorFlow tutorial
+on sequence-to-sequence models. It explains the basic concepts of this module
+and shows an end-to-end example of how to build a translation model.
+ https://www.tensorflow.org/versions/master/tutorials/seq2seq/index.html
+
+Here is an overview of functions available in this module. They all use
+a very similar interface, so after reading the above tutorial and using
+one of them, others should be easy to substitute.
+
+* Full sequence-to-sequence models.
+ - basic_rnn_seq2seq: The most basic RNN-RNN model.
+ - tied_rnn_seq2seq: The basic model with tied encoder and decoder weights.
+ - embedding_rnn_seq2seq: The basic model with input embedding.
+ - embedding_tied_rnn_seq2seq: The tied model with input embedding.
+ - embedding_attention_seq2seq: Advanced model with input embedding and
+ the neural attention mechanism; recommended for complex tasks.
+
+* Multi-task sequence-to-sequence models.
+ - one2many_rnn_seq2seq: The embedding model with multiple decoders.
+
+* Decoders (when you write your own encoder, you can use these to decode;
+ e.g., if you want to write a model that generates captions for images).
+ - rnn_decoder: The basic decoder based on a pure RNN.
+ - attention_decoder: A decoder that uses the attention mechanism.
+
+* Losses.
+ - sequence_loss: Loss for a sequence model returning average log-perplexity.
+ - sequence_loss_by_example: As above, but not averaging over all examples.
+
+* model_with_buckets: A convenience function to create models with bucketing
+ (see the tutorial above for an explanation of why and how to use it).
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# We disable pylint because we need python3 compatibility.
+from six.moves import xrange # pylint: disable=redefined-builtin
+from six.moves import zip # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+# TODO(ebrevdo): Remove once _linear is fully deprecated.
+linear = rnn_cell_impl._linear # pylint: disable=protected-access
+
+
+def _extract_argmax_and_embed(embedding,
+ output_projection=None,
+ update_embedding=True):
+ """Get a loop_function that extracts the previous symbol and embeds it.
+
+ Args:
+ embedding: embedding tensor for symbols.
+ output_projection: None or a pair (W, B). If provided, each fed previous
+ output will first be multiplied by W and B will be added.
+ update_embedding: Boolean; if False, the gradients will not propagate
+ through the embeddings.
+
+ Returns:
+ A loop function.
+ """
+
+ def loop_function(prev, _):
+ if output_projection is not None:
+ prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
+ prev_symbol = math_ops.argmax(prev, 1)
+ # Note that gradients will not propagate through the second parameter of
+ # embedding_lookup.
+ emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
+ if not update_embedding:
+ emb_prev = array_ops.stop_gradient(emb_prev)
+ return emb_prev
+
+ return loop_function
+
+
+def rnn_decoder(decoder_inputs,
+ initial_state,
+ cell,
+ loop_function=None,
+ scope=None):
+ """RNN decoder for the sequence-to-sequence model.
+
+ Args:
+ decoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ initial_state: 2D Tensor with shape [batch_size x cell.state_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ loop_function: If not None, this function will be applied to the i-th output
+ in order to generate the i+1-st input, and decoder_inputs will be ignored,
+ except for the first element ("GO" symbol). This can be used for decoding,
+ but also for training to emulate http://arxiv.org/abs/1506.03099.
+ Signature -- loop_function(prev, i) = next
+ * prev is a 2D Tensor of shape [batch_size x output_size],
+ * i is an integer, the step number (when advanced control is needed),
+ * next is a 2D Tensor of shape [batch_size x input_size].
+ scope: VariableScope for the created subgraph; defaults to "rnn_decoder".
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x output_size] containing generated outputs.
+ state: The state of each cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+ (Note that in some cases, like basic RNN cell or GRU cell, outputs and
+ states can be the same. They are different for LSTM cells though.)
+ """
+ with variable_scope.variable_scope(scope or "rnn_decoder"):
+ state = initial_state
+ outputs = []
+ prev = None
+ for i, inp in enumerate(decoder_inputs):
+ if loop_function is not None and prev is not None:
+ with variable_scope.variable_scope("loop_function", reuse=True):
+ inp = loop_function(prev, i)
+ if i > 0:
+ variable_scope.get_variable_scope().reuse_variables()
+ output, state = cell(inp, state)
+ outputs.append(output)
+ if loop_function is not None:
+ prev = output
+ return outputs, state
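When the initial state comes from a custom encoder (e.g. an image captioner), rnn_decoder can be driven directly. A graph-construction sketch with purely illustrative shapes (the seq2seq import refers to the module added by this commit):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(24)
  dec_inp = [tf.placeholder(tf.float32, [None, 24]) for _ in range(5)]
  init_state = tf.placeholder(tf.float32, [None, cell.state_size])  # e.g. image features
  outputs, state = seq2seq.rnn_decoder(dec_inp, init_state, cell)
  # outputs: 5 tensors of shape [batch_size x 24]; state: [batch_size x cell.state_size]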
+
+
+def basic_rnn_seq2seq(encoder_inputs,
+ decoder_inputs,
+ cell,
+ dtype=dtypes.float32,
+ scope=None):
+ """Basic RNN sequence-to-sequence model.
+
+ This model first runs an RNN to encode encoder_inputs into a state vector,
+ then runs decoder, initialized with the last encoder state, on decoder_inputs.
+ Encoder and decoder use the same RNN cell type, but don't share parameters.
+
+ Args:
+ encoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ decoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ dtype: The dtype of the initial state of the RNN cell (default: tf.float32).
+ scope: VariableScope for the created subgraph; default: "basic_rnn_seq2seq".
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x output_size] containing the generated outputs.
+ state: The state of each decoder cell in the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+ """
+ with variable_scope.variable_scope(scope or "basic_rnn_seq2seq"):
+ _, enc_state = rnn.rnn(cell, encoder_inputs, dtype=dtype)
+ return rnn_decoder(decoder_inputs, enc_state, cell)
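A sketch of wiring the basic model (hypothetical sizes; without an output projection the per-step outputs have the cell's output size):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(32)
  enc_inp = [tf.placeholder(tf.float32, [None, 8]) for _ in range(6)]
  dec_inp = [tf.placeholder(tf.float32, [None, 8]) for _ in range(6)]
  outputs, state = seq2seq.basic_rnn_seq2seq(enc_inp, dec_inp, cell)  # 6 outputs of [batch x 32]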
+
+
+def tied_rnn_seq2seq(encoder_inputs,
+ decoder_inputs,
+ cell,
+ loop_function=None,
+ dtype=dtypes.float32,
+ scope=None):
+ """RNN sequence-to-sequence model with tied encoder and decoder parameters.
+
+ This model first runs an RNN to encode encoder_inputs into a state vector, and
+ then runs decoder, initialized with the last encoder state, on decoder_inputs.
+ Encoder and decoder use the same RNN cell and share parameters.
+
+ Args:
+ encoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ decoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ loop_function: If not None, this function will be applied to i-th output
+ in order to generate i+1-th input, and decoder_inputs will be ignored,
+ except for the first element ("GO" symbol), see rnn_decoder for details.
+ dtype: The dtype of the initial state of the rnn cell (default: tf.float32).
+ scope: VariableScope for the created subgraph; default: "tied_rnn_seq2seq".
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x output_size] containing the generated outputs.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+ """
+ with variable_scope.variable_scope("combined_tied_rnn_seq2seq"):
+ scope = scope or "tied_rnn_seq2seq"
+ _, enc_state = rnn.rnn(cell, encoder_inputs, dtype=dtype, scope=scope)
+ variable_scope.get_variable_scope().reuse_variables()
+ return rnn_decoder(
+ decoder_inputs,
+ enc_state,
+ cell,
+ loop_function=loop_function,
+ scope=scope)
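Because the tied variant reuses the very same cell variables for encoding and decoding, encoder and decoder inputs must share the same per-step input size. A sketch with illustrative sizes:

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(32)
  enc_inp = [tf.placeholder(tf.float32, [None, 16]) for _ in range(6)]
  dec_inp = [tf.placeholder(tf.float32, [None, 16]) for _ in range(6)]  # same input_size as encoder
  outputs, state = seq2seq.tied_rnn_seq2seq(enc_inp, dec_inp, cell)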
+
+
+def embedding_rnn_decoder(decoder_inputs,
+ initial_state,
+ cell,
+ num_symbols,
+ embedding_size,
+ output_projection=None,
+ feed_previous=False,
+ update_embedding_for_previous=True,
+ scope=None):
+ """RNN decoder with embedding and a pure-decoding option.
+
+ Args:
+ decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs).
+ initial_state: 2D Tensor [batch_size x cell.state_size].
+ cell: rnn_cell.RNNCell defining the cell function.
+ num_symbols: Integer, how many symbols come into the embedding.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ output_projection: None or a pair (W, B) of output projection weights and
+ biases; W has shape [output_size x num_symbols] and B has
+ shape [num_symbols]; if provided and feed_previous=True, each fed
+ previous output will first be multiplied by W and B will be added.
+ feed_previous: Boolean; if True, only the first of decoder_inputs will be
+ used (the "GO" symbol), and all other decoder inputs will be generated by:
+ next = embedding_lookup(embedding, argmax(previous_output)),
+ In effect, this implements a greedy decoder. It can also be used
+ during training to emulate http://arxiv.org/abs/1506.03099.
+ If False, decoder_inputs are used as given (the standard decoder case).
+ update_embedding_for_previous: Boolean; if False and feed_previous=True,
+ only the embedding for the first symbol of decoder_inputs (the "GO"
+ symbol) will be updated by back propagation. Embeddings for the symbols
+ generated from the decoder itself remain unchanged. This parameter has
+ no effect if feed_previous=False.
+ scope: VariableScope for the created subgraph; defaults to
+ "embedding_rnn_decoder".
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors. The
+ output is of shape [batch_size x cell.output_size] when
+ output_projection is not None (and represents the dense representation
+ of predicted tokens). It is of shape [batch_size x num_decoder_symbols]
+ when output_projection is None.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+
+ Raises:
+ ValueError: When output_projection has the wrong shape.
+ """
+ with variable_scope.variable_scope(scope or "embedding_rnn_decoder") as scope:
+ if output_projection is not None:
+ dtype = scope.dtype
+ proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype)
+ proj_weights.get_shape().assert_is_compatible_with([None, num_symbols])
+ proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
+ proj_biases.get_shape().assert_is_compatible_with([num_symbols])
+
+ embedding = variable_scope.get_variable("embedding",
+ [num_symbols, embedding_size])
+ loop_function = _extract_argmax_and_embed(
+ embedding, output_projection,
+ update_embedding_for_previous) if feed_previous else None
+ emb_inp = (embedding_ops.embedding_lookup(embedding, i)
+ for i in decoder_inputs)
+ return rnn_decoder(
+ emb_inp, initial_state, cell, loop_function=loop_function)
+
+
+def embedding_rnn_seq2seq(encoder_inputs,
+ decoder_inputs,
+ cell,
+ num_encoder_symbols,
+ num_decoder_symbols,
+ embedding_size,
+ output_projection=None,
+ feed_previous=False,
+ dtype=None,
+ scope=None):
+ """Embedding RNN sequence-to-sequence model.
+
+ This model first embeds encoder_inputs by a newly created embedding (of shape
+ [num_encoder_symbols x input_size]). Then it runs an RNN to encode
+ embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs
+ by another newly created embedding (of shape [num_decoder_symbols x
+ input_size]). Then it runs RNN decoder, initialized with the last
+ encoder state, on embedded decoder_inputs.
+
+ Args:
+ encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ num_encoder_symbols: Integer; number of symbols on the encoder side.
+ num_decoder_symbols: Integer; number of symbols on the decoder side.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ output_projection: None or a pair (W, B) of output projection weights and
+ biases; W has shape [output_size x num_decoder_symbols] and B has
+ shape [num_decoder_symbols]; if provided and feed_previous=True, each
+ fed previous output will first be multiplied by W and B will be added.
+ feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
+ of decoder_inputs will be used (the "GO" symbol), and all other decoder
+ inputs will be taken from previous outputs (as in embedding_rnn_decoder).
+ If False, decoder_inputs are used as given (the standard decoder case).
+ dtype: The dtype of the initial state for both the encoder and decoder
+ rnn cells (default: tf.float32).
+ scope: VariableScope for the created subgraph; defaults to
+ "embedding_rnn_seq2seq"
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors. The
+ output is of shape [batch_size x cell.output_size] when
+ output_projection is not None (and represents the dense representation
+ of predicted tokens). It is of shape [batch_size x num_decoder_symbols]
+ when output_projection is None.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+ """
+ with variable_scope.variable_scope(scope or "embedding_rnn_seq2seq") as scope:
+ if dtype is not None:
+ scope.set_dtype(dtype)
+ else:
+ dtype = scope.dtype
+
+ # Encoder.
+ encoder_cell = rnn_cell.EmbeddingWrapper(
+ cell,
+ embedding_classes=num_encoder_symbols,
+ embedding_size=embedding_size)
+ _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype)
+
+ # Decoder.
+ if output_projection is None:
+ cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
+
+ if isinstance(feed_previous, bool):
+ return embedding_rnn_decoder(
+ decoder_inputs,
+ encoder_state,
+ cell,
+ num_decoder_symbols,
+ embedding_size,
+ output_projection=output_projection,
+ feed_previous=feed_previous)
+
+ # If feed_previous is a Tensor, we construct 2 graphs and use cond.
+ def decoder(feed_previous_bool):
+ reuse = None if feed_previous_bool else True
+ with variable_scope.variable_scope(
+ variable_scope.get_variable_scope(), reuse=reuse) as scope:
+ outputs, state = embedding_rnn_decoder(
+ decoder_inputs,
+ encoder_state,
+ cell,
+ num_decoder_symbols,
+ embedding_size,
+ output_projection=output_projection,
+ feed_previous=feed_previous_bool,
+ update_embedding_for_previous=False)
+ state_list = [state]
+ if nest.is_sequence(state):
+ state_list = nest.flatten(state)
+ return outputs + state_list
+
+ outputs_and_state = control_flow_ops.cond(feed_previous,
+ lambda: decoder(True),
+ lambda: decoder(False))
+ outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs.
+ state_list = outputs_and_state[outputs_len:]
+ state = state_list[0]
+ if nest.is_sequence(encoder_state):
+ state = nest.pack_sequence_as(
+ structure=encoder_state, flat_sequence=state_list)
+ return outputs_and_state[:outputs_len], state
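A sketch of the embedding variant, which consumes integer symbol ids rather than dense vectors (vocabulary sizes hypothetical); setting feed_previous=True would turn the decoder into a greedy decoder for inference:

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(64)
  enc_inp = [tf.placeholder(tf.int32, [None]) for _ in range(10)]
  dec_inp = [tf.placeholder(tf.int32, [None]) for _ in range(10)]
  outputs, state = seq2seq.embedding_rnn_seq2seq(
      enc_inp, dec_inp, cell,
      num_encoder_symbols=1000, num_decoder_symbols=1200,
      embedding_size=64, feed_previous=False)
  # Each output is [batch_size x 1200] because output_projection is None.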
+
+
+def embedding_tied_rnn_seq2seq(encoder_inputs,
+ decoder_inputs,
+ cell,
+ num_symbols,
+ embedding_size,
+ num_decoder_symbols=None,
+ output_projection=None,
+ feed_previous=False,
+ dtype=None,
+ scope=None):
+ """Embedding RNN sequence-to-sequence model with tied (shared) parameters.
+
+ This model first embeds encoder_inputs by a newly created embedding (of shape
+ [num_symbols x input_size]). Then it runs an RNN to encode embedded
+ encoder_inputs into a state vector. Next, it embeds decoder_inputs using
+ the same embedding. Then it runs RNN decoder, initialized with the last
+ encoder state, on embedded decoder_inputs. The decoder output is over symbols
+ from 0 to num_decoder_symbols - 1 if num_decoder_symbols is provided; otherwise
+ it is over symbols from 0 to num_symbols - 1.
+
+ Args:
+ encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ num_symbols: Integer; number of symbols for both encoder and decoder.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ num_decoder_symbols: Integer; number of output symbols for decoder. If
+ provided, the decoder output is over symbols 0 to num_decoder_symbols - 1.
+ Otherwise, decoder output is over symbols 0 to num_symbols - 1. Note that
+ this assumes that the vocabulary is set up such that the first
+ num_decoder_symbols of num_symbols are part of decoding.
+ output_projection: None or a pair (W, B) of output projection weights and
+ biases; W has shape [output_size x num_symbols] and B has
+ shape [num_symbols]; if provided and feed_previous=True, each
+ fed previous output will first be multiplied by W and B will be added.
+ feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
+ of decoder_inputs will be used (the "GO" symbol), and all other decoder
+ inputs will be taken from previous outputs (as in embedding_rnn_decoder).
+ If False, decoder_inputs are used as given (the standard decoder case).
+ dtype: The dtype to use for the initial RNN states (default: tf.float32).
+ scope: VariableScope for the created subgraph; defaults to
+ "embedding_tied_rnn_seq2seq".
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x output_symbols] containing the generated
+ outputs where output_symbols = num_decoder_symbols if
+ num_decoder_symbols is not None otherwise output_symbols = num_symbols.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+
+ Raises:
+ ValueError: When output_projection has the wrong shape.
+ """
+ with variable_scope.variable_scope(
+ scope or "embedding_tied_rnn_seq2seq", dtype=dtype) as scope:
+ dtype = scope.dtype
+
+ if output_projection is not None:
+ proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype)
+ proj_weights.get_shape().assert_is_compatible_with([None, num_symbols])
+ proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
+ proj_biases.get_shape().assert_is_compatible_with([num_symbols])
+
+ embedding = variable_scope.get_variable(
+ "embedding", [num_symbols, embedding_size], dtype=dtype)
+
+ emb_encoder_inputs = [
+ embedding_ops.embedding_lookup(embedding, x) for x in encoder_inputs
+ ]
+ emb_decoder_inputs = [
+ embedding_ops.embedding_lookup(embedding, x) for x in decoder_inputs
+ ]
+
+ output_symbols = num_symbols
+ if num_decoder_symbols is not None:
+ output_symbols = num_decoder_symbols
+ if output_projection is None:
+ cell = rnn_cell.OutputProjectionWrapper(cell, output_symbols)
+
+ if isinstance(feed_previous, bool):
+ loop_function = _extract_argmax_and_embed(embedding, output_projection,
+ True) if feed_previous else None
+ return tied_rnn_seq2seq(
+ emb_encoder_inputs,
+ emb_decoder_inputs,
+ cell,
+ loop_function=loop_function,
+ dtype=dtype)
+
+ # If feed_previous is a Tensor, we construct 2 graphs and use cond.
+ def decoder(feed_previous_bool):
+ loop_function = _extract_argmax_and_embed(
+ embedding, output_projection, False) if feed_previous_bool else None
+ reuse = None if feed_previous_bool else True
+ with variable_scope.variable_scope(
+ variable_scope.get_variable_scope(), reuse=reuse):
+ outputs, state = tied_rnn_seq2seq(
+ emb_encoder_inputs,
+ emb_decoder_inputs,
+ cell,
+ loop_function=loop_function,
+ dtype=dtype)
+ state_list = [state]
+ if nest.is_sequence(state):
+ state_list = nest.flatten(state)
+ return outputs + state_list
+
+ outputs_and_state = control_flow_ops.cond(feed_previous,
+ lambda: decoder(True),
+ lambda: decoder(False))
+ outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs.
+ state_list = outputs_and_state[outputs_len:]
+ state = state_list[0]
+ # Calculate zero-state to know its structure.
+ static_batch_size = encoder_inputs[0].get_shape()[0]
+ for inp in encoder_inputs[1:]:
+ static_batch_size.merge_with(inp.get_shape()[0])
+ batch_size = static_batch_size.value
+ if batch_size is None:
+ batch_size = array_ops.shape(encoder_inputs[0])[0]
+ zero_state = cell.zero_state(batch_size, dtype)
+ if nest.is_sequence(zero_state):
+ state = nest.pack_sequence_as(
+ structure=zero_state, flat_sequence=state_list)
+ return outputs_and_state[:outputs_len], state
+
+
+def attention_decoder(decoder_inputs,
+ initial_state,
+ attention_states,
+ cell,
+ output_size=None,
+ num_heads=1,
+ loop_function=None,
+ dtype=None,
+ scope=None,
+ initial_state_attention=False):
+ """RNN decoder with attention for the sequence-to-sequence model.
+
+ In this context "attention" means that, during decoding, the RNN can look up
+ information in the additional tensor attention_states, and it does this by
+ focusing on a few entries from the tensor. This model has proven to yield
+ especially good results in a number of sequence-to-sequence tasks. This
+ implementation is based on http://arxiv.org/abs/1412.7449 (see below for
+ details). It is recommended for complex sequence-to-sequence tasks.
+
+ Args:
+ decoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ initial_state: 2D Tensor [batch_size x cell.state_size].
+ attention_states: 3D Tensor [batch_size x attn_length x attn_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ output_size: Size of the output vectors; if None, we use cell.output_size.
+ num_heads: Number of attention heads that read from attention_states.
+ loop_function: If not None, this function will be applied to i-th output
+ in order to generate i+1-th input, and decoder_inputs will be ignored,
+ except for the first element ("GO" symbol). This can be used for decoding,
+ but also for training to emulate http://arxiv.org/abs/1506.03099.
+ Signature -- loop_function(prev, i) = next
+ * prev is a 2D Tensor of shape [batch_size x output_size],
+ * i is an integer, the step number (when advanced control is needed),
+ * next is a 2D Tensor of shape [batch_size x input_size].
+ dtype: The dtype to use for the RNN initial state (default: tf.float32).
+ scope: VariableScope for the created subgraph; default: "attention_decoder".
+ initial_state_attention: If False (default), initial attentions are zero.
+ If True, initialize the attentions from the initial state and attention
+ states -- useful when we wish to resume decoding from a previously
+ stored decoder state and attention states.
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors of
+ shape [batch_size x output_size]. These represent the generated outputs.
+ Output i is computed from input i (which is either the i-th element
+ of decoder_inputs or loop_function(output_{i-1}, i)) as follows.
+ First, we run the cell on a combination of the input and previous
+ attention masks:
+ cell_output, new_state = cell(linear(input, prev_attn), prev_state).
+ Then, we calculate new attention masks:
+ new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
+ and then we calculate the output:
+ output = linear(cell_output, new_attn).
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+
+ Raises:
+ ValueError: when num_heads is not positive, there are no inputs, shapes
+ of attention_states are not set, or input size cannot be inferred
+ from the input.
+ """
+ if not decoder_inputs:
+ raise ValueError("Must provide at least 1 input to attention decoder.")
+ if num_heads < 1:
+ raise ValueError("With less than 1 heads, use a non-attention decoder.")
+ if attention_states.get_shape()[2].value is None:
+ raise ValueError("Shape[2] of attention_states must be known: %s" %
+ attention_states.get_shape())
+ if output_size is None:
+ output_size = cell.output_size
+
+ with variable_scope.variable_scope(
+ scope or "attention_decoder", dtype=dtype) as scope:
+ dtype = scope.dtype
+
+ batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping.
+ attn_length = attention_states.get_shape()[1].value
+ if attn_length is None:
+ attn_length = array_ops.shape(attention_states)[1]
+ attn_size = attention_states.get_shape()[2].value
+
+ # To calculate W1 * h_t we use a 1-by-1 convolution; we need to reshape first.
+ hidden = array_ops.reshape(attention_states,
+ [-1, attn_length, 1, attn_size])
+ hidden_features = []
+ v = []
+ attention_vec_size = attn_size # Size of query vectors for attention.
+ for a in xrange(num_heads):
+ k = variable_scope.get_variable("AttnW_%d" % a,
+ [1, 1, attn_size, attention_vec_size])
+ hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
+ v.append(
+ variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))
+
+ state = initial_state
+
+ def attention(query):
+ """Put attention masks on hidden using hidden_features and query."""
+ ds = [] # Results of attention reads will be stored here.
+ if nest.is_sequence(query): # If the query is a tuple, flatten it.
+ query_list = nest.flatten(query)
+ for q in query_list: # Check that ndims == 2 if specified.
+ ndims = q.get_shape().ndims
+ if ndims:
+ assert ndims == 2
+ query = array_ops.concat_v2(query_list, 1)
+ for a in xrange(num_heads):
+ with variable_scope.variable_scope("Attention_%d" % a):
+ y = linear(query, attention_vec_size, True)
+ y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
+ # Attention mask is a softmax of v^T * tanh(...).
+ s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y),
+ [2, 3])
+ a = nn_ops.softmax(s)
+ # Now calculate the attention-weighted vector d.
+ d = math_ops.reduce_sum(
+ array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
+ ds.append(array_ops.reshape(d, [-1, attn_size]))
+ return ds
+
+ outputs = []
+ prev = None
+ batch_attn_size = array_ops.pack([batch_size, attn_size])
+ attns = [
+ array_ops.zeros(
+ batch_attn_size, dtype=dtype) for _ in xrange(num_heads)
+ ]
+ for a in attns: # Ensure the second shape of attention vectors is set.
+ a.set_shape([None, attn_size])
+ if initial_state_attention:
+ attns = attention(initial_state)
+ for i, inp in enumerate(decoder_inputs):
+ if i > 0:
+ variable_scope.get_variable_scope().reuse_variables()
+ # If loop_function is set, we use it instead of decoder_inputs.
+ if loop_function is not None and prev is not None:
+ with variable_scope.variable_scope("loop_function", reuse=True):
+ inp = loop_function(prev, i)
+ # Merge input and previous attentions into one vector of the right size.
+ input_size = inp.get_shape().with_rank(2)[1]
+ if input_size.value is None:
+ raise ValueError("Could not infer input size from input: %s" % inp.name)
+ x = linear([inp] + attns, input_size, True)
+ # Run the RNN.
+ cell_output, state = cell(x, state)
+ # Run the attention mechanism.
+ if i == 0 and initial_state_attention:
+ with variable_scope.variable_scope(
+ variable_scope.get_variable_scope(), reuse=True):
+ attns = attention(state)
+ else:
+ attns = attention(state)
+
+ with variable_scope.variable_scope("AttnOutputProjection"):
+ output = linear([cell_output] + attns, output_size, True)
+ if loop_function is not None:
+ prev = output
+ outputs.append(output)
+
+ return outputs, state
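A direct-call sketch for pairing the attention decoder with a custom encoder whose per-step outputs are stacked into attention_states (shapes are illustrative; the last dimension of attention_states must be statically known):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(48)
  dec_inp = [tf.placeholder(tf.float32, [None, 48]) for _ in range(7)]
  init_state = tf.placeholder(tf.float32, [None, cell.state_size])
  attn_states = tf.placeholder(tf.float32, [None, 11, 48])  # [batch x attn_length x attn_size]
  outputs, state = seq2seq.attention_decoder(dec_inp, init_state, attn_states, cell)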
+
+
+def embedding_attention_decoder(decoder_inputs,
+ initial_state,
+ attention_states,
+ cell,
+ num_symbols,
+ embedding_size,
+ num_heads=1,
+ output_size=None,
+ output_projection=None,
+ feed_previous=False,
+ update_embedding_for_previous=True,
+ dtype=None,
+ scope=None,
+ initial_state_attention=False):
+ """RNN decoder with embedding and attention and a pure-decoding option.
+
+ Args:
+ decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs).
+ initial_state: 2D Tensor [batch_size x cell.state_size].
+ attention_states: 3D Tensor [batch_size x attn_length x attn_size].
+ cell: rnn_cell.RNNCell defining the cell function.
+ num_symbols: Integer, how many symbols come into the embedding.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ num_heads: Number of attention heads that read from attention_states.
+ output_size: Size of the output vectors; if None, use cell.output_size.
+ output_projection: None or a pair (W, B) of output projection weights and
+ biases; W has shape [output_size x num_symbols] and B has shape
+ [num_symbols]; if provided and feed_previous=True, each fed previous
+ output will first be multiplied by W and B will be added.
+ feed_previous: Boolean; if True, only the first of decoder_inputs will be
+ used (the "GO" symbol), and all other decoder inputs will be generated by:
+ next = embedding_lookup(embedding, argmax(previous_output)),
+ In effect, this implements a greedy decoder. It can also be used
+ during training to emulate http://arxiv.org/abs/1506.03099.
+ If False, decoder_inputs are used as given (the standard decoder case).
+ update_embedding_for_previous: Boolean; if False and feed_previous=True,
+ only the embedding for the first symbol of decoder_inputs (the "GO"
+ symbol) will be updated by back propagation. Embeddings for the symbols
+ generated from the decoder itself remain unchanged. This parameter has
+ no effect if feed_previous=False.
+ dtype: The dtype to use for the RNN initial states (default: tf.float32).
+ scope: VariableScope for the created subgraph; defaults to
+ "embedding_attention_decoder".
+ initial_state_attention: If False (default), initial attentions are zero.
+ If True, initialize the attentions from the initial state and attention
+ states -- useful when we wish to resume decoding from a previously
+ stored decoder state and attention states.
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x output_size] containing the generated outputs.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+
+ Raises:
+ ValueError: When output_projection has the wrong shape.
+ """
+ if output_size is None:
+ output_size = cell.output_size
+ if output_projection is not None:
+ proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
+ proj_biases.get_shape().assert_is_compatible_with([num_symbols])
+
+ with variable_scope.variable_scope(
+ scope or "embedding_attention_decoder", dtype=dtype) as scope:
+
+ embedding = variable_scope.get_variable("embedding",
+ [num_symbols, embedding_size])
+ loop_function = _extract_argmax_and_embed(
+ embedding, output_projection,
+ update_embedding_for_previous) if feed_previous else None
+ emb_inp = [
+ embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs
+ ]
+ return attention_decoder(
+ emb_inp,
+ initial_state,
+ attention_states,
+ cell,
+ output_size=output_size,
+ num_heads=num_heads,
+ loop_function=loop_function,
+ initial_state_attention=initial_state_attention)
+
+
+def embedding_attention_seq2seq(encoder_inputs,
+ decoder_inputs,
+ cell,
+ num_encoder_symbols,
+ num_decoder_symbols,
+ embedding_size,
+ num_heads=1,
+ output_projection=None,
+ feed_previous=False,
+ dtype=None,
+ scope=None,
+ initial_state_attention=False):
+ """Embedding sequence-to-sequence model with attention.
+
+ This model first embeds encoder_inputs by a newly created embedding (of shape
+ [num_encoder_symbols x input_size]). Then it runs an RNN to encode
+ embedded encoder_inputs into a state vector. It keeps the outputs of this
+ RNN at every step to use for attention later. Next, it embeds decoder_inputs
+ by another newly created embedding (of shape [num_decoder_symbols x
+ input_size]). Then it runs attention decoder, initialized with the last
+ encoder state, on embedded decoder_inputs and attending to encoder outputs.
+
+ Warning: when output_projection is None, the size of the attention vectors
+ and variables will be made proportional to num_decoder_symbols, which can be large.
+
+ Args:
+ encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ num_encoder_symbols: Integer; number of symbols on the encoder side.
+ num_decoder_symbols: Integer; number of symbols on the decoder side.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ num_heads: Number of attention heads that read from attention_states.
+ output_projection: None or a pair (W, B) of output projection weights and
+ biases; W has shape [output_size x num_decoder_symbols] and B has
+ shape [num_decoder_symbols]; if provided and feed_previous=True, each
+ fed previous output will first be multiplied by W and B will be added.
+ feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
+ of decoder_inputs will be used (the "GO" symbol), and all other decoder
+ inputs will be taken from previous outputs (as in embedding_rnn_decoder).
+ If False, decoder_inputs are used as given (the standard decoder case).
+ dtype: The dtype of the initial RNN state (default: tf.float32).
+ scope: VariableScope for the created subgraph; defaults to
+ "embedding_attention_seq2seq".
+ initial_state_attention: If False (default), initial attentions are zero.
+ If True, initialize the attentions from the initial state and attention
+ states.
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors with
+ shape [batch_size x num_decoder_symbols] containing the generated
+ outputs.
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+ """
+ with variable_scope.variable_scope(
+ scope or "embedding_attention_seq2seq", dtype=dtype) as scope:
+ dtype = scope.dtype
+ # Encoder.
+ encoder_cell = rnn_cell.EmbeddingWrapper(
+ cell,
+ embedding_classes=num_encoder_symbols,
+ embedding_size=embedding_size)
+ encoder_outputs, encoder_state = rnn.rnn(encoder_cell,
+ encoder_inputs,
+ dtype=dtype)
+
+ # First calculate a concatenation of encoder outputs to put attention on.
+ top_states = [
+ array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs
+ ]
+ attention_states = array_ops.concat_v2(top_states, 1)
+
+ # Decoder.
+ output_size = None
+ if output_projection is None:
+ cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
+ output_size = num_decoder_symbols
+
+ if isinstance(feed_previous, bool):
+ return embedding_attention_decoder(
+ decoder_inputs,
+ encoder_state,
+ attention_states,
+ cell,
+ num_decoder_symbols,
+ embedding_size,
+ num_heads=num_heads,
+ output_size=output_size,
+ output_projection=output_projection,
+ feed_previous=feed_previous,
+ initial_state_attention=initial_state_attention)
+
+ # If feed_previous is a Tensor, we construct 2 graphs and use cond.
+ def decoder(feed_previous_bool):
+ reuse = None if feed_previous_bool else True
+ with variable_scope.variable_scope(
+ variable_scope.get_variable_scope(), reuse=reuse) as scope:
+ outputs, state = embedding_attention_decoder(
+ decoder_inputs,
+ encoder_state,
+ attention_states,
+ cell,
+ num_decoder_symbols,
+ embedding_size,
+ num_heads=num_heads,
+ output_size=output_size,
+ output_projection=output_projection,
+ feed_previous=feed_previous_bool,
+ update_embedding_for_previous=False,
+ initial_state_attention=initial_state_attention)
+ state_list = [state]
+ if nest.is_sequence(state):
+ state_list = nest.flatten(state)
+ return outputs + state_list
+
+ outputs_and_state = control_flow_ops.cond(feed_previous,
+ lambda: decoder(True),
+ lambda: decoder(False))
+ outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs.
+ state_list = outputs_and_state[outputs_len:]
+ state = state_list[0]
+ if nest.is_sequence(encoder_state):
+ state = nest.pack_sequence_as(
+ structure=encoder_state, flat_sequence=state_list)
+ return outputs_and_state[:outputs_len], state
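A sketch of the recommended attention model; because feed_previous may be a scalar boolean tensor, a single graph can switch between teacher forcing and greedy decoding (sizes hypothetical):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(128)
  enc_inp = [tf.placeholder(tf.int32, [None]) for _ in range(20)]
  dec_inp = [tf.placeholder(tf.int32, [None]) for _ in range(20)]
  feed_prev = tf.placeholder(tf.bool, [])  # flip at run time: False = training, True = decoding
  outputs, state = seq2seq.embedding_attention_seq2seq(
      enc_inp, dec_inp, cell,
      num_encoder_symbols=40000, num_decoder_symbols=40000,
      embedding_size=128, feed_previous=feed_prev)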
+
+
+def one2many_rnn_seq2seq(encoder_inputs,
+ decoder_inputs_dict,
+ cell,
+ num_encoder_symbols,
+ num_decoder_symbols_dict,
+ embedding_size,
+ feed_previous=False,
+ dtype=None,
+ scope=None):
+ """One-to-many RNN sequence-to-sequence model (multi-task).
+
+ This is a multi-task sequence-to-sequence model with one encoder and multiple
+ decoders. Reference to multi-task sequence-to-sequence learning can be found
+ here: http://arxiv.org/abs/1511.06114
+
+ Args:
+ encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
+ decoder_inputs_dict: A dictionary mapping decoder name (string) to
+ the corresponding decoder_inputs; each decoder_inputs is a list of 1D
+ Tensors of shape [batch_size]; num_decoders is defined as
+ len(decoder_inputs_dict).
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ num_encoder_symbols: Integer; number of symbols on the encoder side.
+ num_decoder_symbols_dict: A dictionary mapping decoder name (string) to an
+ integer specifying number of symbols for the corresponding decoder;
+ len(num_decoder_symbols_dict) must be equal to num_decoders.
+ embedding_size: Integer, the length of the embedding vector for each symbol.
+ feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of
+ decoder_inputs will be used (the "GO" symbol), and all other decoder
+ inputs will be taken from previous outputs (as in embedding_rnn_decoder).
+ If False, decoder_inputs are used as given (the standard decoder case).
+ dtype: The dtype of the initial state for both the encoder and decoder
+ rnn cells (default: tf.float32).
+ scope: VariableScope for the created subgraph; defaults to
+ "one2many_rnn_seq2seq"
+
+ Returns:
+ A tuple of the form (outputs_dict, state_dict), where:
+ outputs_dict: A mapping from decoder name (string) to a list of the same
+ length as decoder_inputs_dict[name]; each element in the list is a 2D
+ Tensor with shape [batch_size x num_decoder_symbols_dict[name]]
+ containing the generated outputs.
+ state_dict: A mapping from decoder name (string) to the final state of the
+ corresponding decoder RNN; it is a 2D Tensor of shape
+ [batch_size x cell.state_size].
+ """
+ outputs_dict = {}
+ state_dict = {}
+
+ with variable_scope.variable_scope(
+ scope or "one2many_rnn_seq2seq", dtype=dtype) as scope:
+ dtype = scope.dtype
+
+ # Encoder.
+ encoder_cell = rnn_cell.EmbeddingWrapper(
+ cell,
+ embedding_classes=num_encoder_symbols,
+ embedding_size=embedding_size)
+ _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype)
+
+ # Decoder.
+ for name, decoder_inputs in decoder_inputs_dict.items():
+ num_decoder_symbols = num_decoder_symbols_dict[name]
+
+ with variable_scope.variable_scope("one2many_decoder_" + str(
+ name)) as scope:
+ decoder_cell = rnn_cell.OutputProjectionWrapper(cell,
+ num_decoder_symbols)
+ if isinstance(feed_previous, bool):
+ outputs, state = embedding_rnn_decoder(
+ decoder_inputs,
+ encoder_state,
+ decoder_cell,
+ num_decoder_symbols,
+ embedding_size,
+ feed_previous=feed_previous)
+ else:
+ # If feed_previous is a Tensor, we construct 2 graphs and use cond.
+ def filled_embedding_rnn_decoder(feed_previous):
+ """The current decoder with a fixed feed_previous parameter."""
+ # pylint: disable=cell-var-from-loop
+ reuse = None if feed_previous else True
+ vs = variable_scope.get_variable_scope()
+ with variable_scope.variable_scope(vs, reuse=reuse):
+ outputs, state = embedding_rnn_decoder(
+ decoder_inputs,
+ encoder_state,
+ decoder_cell,
+ num_decoder_symbols,
+ embedding_size,
+ feed_previous=feed_previous)
+ # pylint: enable=cell-var-from-loop
+ state_list = [state]
+ if nest.is_sequence(state):
+ state_list = nest.flatten(state)
+ return outputs + state_list
+
+ outputs_and_state = control_flow_ops.cond(
+ feed_previous, lambda: filled_embedding_rnn_decoder(True),
+ lambda: filled_embedding_rnn_decoder(False))
+ # Outputs length is the same as for decoder inputs.
+ outputs_len = len(decoder_inputs)
+ outputs = outputs_and_state[:outputs_len]
+ state_list = outputs_and_state[outputs_len:]
+ state = state_list[0]
+ if nest.is_sequence(encoder_state):
+ state = nest.pack_sequence_as(
+ structure=encoder_state, flat_sequence=state_list)
+ outputs_dict[name] = outputs
+ state_dict[name] = state
+
+ return outputs_dict, state_dict
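A multi-task sketch with one shared encoder and two hypothetical decoders ("pos" and "chunk"; all sizes illustrative):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  cell = tf.nn.rnn_cell.GRUCell(64)
  enc_inp = [tf.placeholder(tf.int32, [None]) for _ in range(8)]
  dec_dict = {"pos": [tf.placeholder(tf.int32, [None]) for _ in range(8)],
              "chunk": [tf.placeholder(tf.int32, [None]) for _ in range(8)]}
  outputs_dict, state_dict = seq2seq.one2many_rnn_seq2seq(
      enc_inp, dec_dict, cell,
      num_encoder_symbols=5000,
      num_decoder_symbols_dict={"pos": 45, "chunk": 23},
      embedding_size=64)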
+
+
+def sequence_loss_by_example(logits,
+ targets,
+ weights,
+ average_across_timesteps=True,
+ softmax_loss_function=None,
+ name=None):
+ """Weighted cross-entropy loss for a sequence of logits (per example).
+
+ Args:
+ logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].
+ targets: List of 1D batch-sized int32 Tensors of the same length as logits.
+ weights: List of 1D batch-sized float-Tensors of the same length as logits.
+ average_across_timesteps: If set, divide the returned cost by the total
+ label weight.
+ softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch
+ to be used instead of the standard softmax (the default if this is None).
+ name: Optional name for this operation, default: "sequence_loss_by_example".
+
+ Returns:
+ 1D batch-sized float Tensor: The log-perplexity for each sequence.
+
+ Raises:
+ ValueError: If len(logits) is different from len(targets) or len(weights).
+ """
+ if len(targets) != len(logits) or len(weights) != len(logits):
+ raise ValueError("Lengths of logits, weights, and targets must be the same "
+ "%d, %d, %d." % (len(logits), len(weights), len(targets)))
+ with ops.name_scope(name, "sequence_loss_by_example",
+ logits + targets + weights):
+ log_perp_list = []
+ for logit, target, weight in zip(logits, targets, weights):
+ if softmax_loss_function is None:
+ # TODO(irving,ebrevdo): This reshape is needed because
+ # sequence_loss_by_example is called with scalars sometimes, which
+ # violates our general scalar strictness policy.
+ target = array_ops.reshape(target, [-1])
+ crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
+ logits=logit, labels=target)
+ else:
+ crossent = softmax_loss_function(target, logit)
+ log_perp_list.append(crossent * weight)
+ log_perps = math_ops.add_n(log_perp_list)
+ if average_across_timesteps:
+ total_size = math_ops.add_n(weights)
+ total_size += 1e-12 # Just to avoid division by 0 for all-0 weights.
+ log_perps /= total_size
+ return log_perps
+
+
+def sequence_loss(logits,
+ targets,
+ weights,
+ average_across_timesteps=True,
+ average_across_batch=True,
+ softmax_loss_function=None,
+ name=None):
+ """Weighted cross-entropy loss for a sequence of logits, batch-collapsed.
+
+ Args:
+ logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].
+ targets: List of 1D batch-sized int32 Tensors of the same length as logits.
+ weights: List of 1D batch-sized float-Tensors of the same length as logits.
+ average_across_timesteps: If set, divide the returned cost by the total
+ label weight.
+ average_across_batch: If set, divide the returned cost by the batch size.
+ softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch
+ to be used instead of the standard softmax (the default if this is None).
+ name: Optional name for this operation, defaults to "sequence_loss".
+
+ Returns:
+ A scalar float Tensor: The average log-perplexity per symbol (weighted).
+
+ Raises:
+ ValueError: If len(logits) is different from len(targets) or len(weights).
+ """
+ with ops.name_scope(name, "sequence_loss", logits + targets + weights):
+ cost = math_ops.reduce_sum(
+ sequence_loss_by_example(
+ logits,
+ targets,
+ weights,
+ average_across_timesteps=average_across_timesteps,
+ softmax_loss_function=softmax_loss_function))
+ if average_across_batch:
+ batch_size = array_ops.shape(targets[0])[0]
+ return cost / math_ops.cast(batch_size, cost.dtype)
+ else:
+ return cost
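A sketch of how either loss pairs with a decoder's logits; targets are typically the decoder inputs shifted by one step, and weights are 0.0 on padding positions (shapes hypothetical):

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  logits = [tf.placeholder(tf.float32, [None, 1200]) for _ in range(10)]  # decoder outputs
  targets = [tf.placeholder(tf.int32, [None]) for _ in range(10)]
  weights = [tf.placeholder(tf.float32, [None]) for _ in range(10)]       # 0.0 on padding steps
  loss = seq2seq.sequence_loss(logits, targets, weights)                  # scalar
  per_seq = seq2seq.sequence_loss_by_example(logits, targets, weights)    # shape [batch_size]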
+
+
+def model_with_buckets(encoder_inputs,
+ decoder_inputs,
+ targets,
+ weights,
+ buckets,
+ seq2seq,
+ softmax_loss_function=None,
+ per_example_loss=False,
+ name=None):
+ """Create a sequence-to-sequence model with support for bucketing.
+
+ The seq2seq argument is a function that defines a sequence-to-sequence model,
+ e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24))
+
+ Args:
+ encoder_inputs: A list of Tensors to feed the encoder; first seq2seq input.
+ decoder_inputs: A list of Tensors to feed the decoder; second seq2seq input.
+ targets: A list of 1D batch-sized int32 Tensors (desired output sequence).
+ weights: List of 1D batch-sized float-Tensors to weight the targets.
+ buckets: A list of pairs of (input size, output size) for each bucket.
+ seq2seq: A sequence-to-sequence model function; it takes 2 input that
+ agree with encoder_inputs and decoder_inputs, and returns a pair
+ consisting of outputs and states (as, e.g., basic_rnn_seq2seq).
+ softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch
+ to be used instead of the standard softmax (the default if this is None).
+ per_example_loss: Boolean. If set, the returned loss will be a batch-sized
+ tensor of losses for each sequence in the batch. If unset, it will be
+ a scalar with the averaged loss from all examples.
+ name: Optional name for this operation, defaults to "model_with_buckets".
+
+ Returns:
+ A tuple of the form (outputs, losses), where:
+ outputs: The outputs for each bucket. Its j'th element consists of a list
+ of 2D Tensors. The shape of output tensors can be either
+ [batch_size x output_size] or [batch_size x num_decoder_symbols]
+ depending on the seq2seq model used.
+ losses: List of scalar Tensors, representing losses for each bucket, or,
+ if per_example_loss is set, a list of 1D batch-sized float Tensors.
+
+ Raises:
+ ValueError: If length of encoder_inputs, targets, or weights is smaller
+ than the largest (last) bucket.
+ """
+ if len(encoder_inputs) < buckets[-1][0]:
+ raise ValueError("Length of encoder_inputs (%d) must be at least that of la"
+ "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0]))
+ if len(targets) < buckets[-1][1]:
+ raise ValueError("Length of targets (%d) must be at least that of last"
+ "bucket (%d)." % (len(targets), buckets[-1][1]))
+ if len(weights) < buckets[-1][1]:
+ raise ValueError("Length of weights (%d) must be at least that of last"
+ "bucket (%d)." % (len(weights), buckets[-1][1]))
+
+ all_inputs = encoder_inputs + decoder_inputs + targets + weights
+ losses = []
+ outputs = []
+ with ops.name_scope(name, "model_with_buckets", all_inputs):
+ for j, bucket in enumerate(buckets):
+ with variable_scope.variable_scope(
+ variable_scope.get_variable_scope(), reuse=True if j > 0 else None):
+ bucket_outputs, _ = seq2seq(encoder_inputs[:bucket[0]],
+ decoder_inputs[:bucket[1]])
+ outputs.append(bucket_outputs)
+ if per_example_loss:
+ losses.append(
+ sequence_loss_by_example(
+ outputs[-1],
+ targets[:bucket[1]],
+ weights[:bucket[1]],
+ softmax_loss_function=softmax_loss_function))
+ else:
+ losses.append(
+ sequence_loss(
+ outputs[-1],
+ targets[:bucket[1]],
+ weights[:bucket[1]],
+ softmax_loss_function=softmax_loss_function))
+
+ return outputs, losses
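A bucketing sketch; the placeholder lists only need to be as long as the largest bucket, and the hypothetical seq2seq_f closure is what model_with_buckets re-invokes (with variable reuse) for each bucket:

  import tensorflow as tf
  from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

  buckets = [(5, 10), (10, 15)]
  cell = tf.nn.rnn_cell.GRUCell(64)
  enc_inp = [tf.placeholder(tf.int32, [None]) for _ in range(10)]   # >= largest encoder bucket
  dec_inp = [tf.placeholder(tf.int32, [None]) for _ in range(15)]   # >= largest decoder bucket
  targets = [tf.placeholder(tf.int32, [None]) for _ in range(15)]
  weights = [tf.placeholder(tf.float32, [None]) for _ in range(15)]

  def seq2seq_f(x, y):
    return seq2seq.embedding_rnn_seq2seq(
        x, y, cell, num_encoder_symbols=1000, num_decoder_symbols=1000,
        embedding_size=64)

  outputs, losses = seq2seq.model_with_buckets(
      enc_inp, dec_inp, targets, weights, buckets, seq2seq_f)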