From 9cc5098388295fabde838433ccfc5c418f4c327a Mon Sep 17 00:00:00 2001
From: Eugene Brevdo
Date: Thu, 16 Mar 2017 18:24:44 -0800
Subject: Initial cut of documentation for tf.contrib.seq2seq

Change: 150400474
---
 tensorflow/contrib/seq2seq/README.md               |   9 --
 tensorflow/contrib/seq2seq/__init__.py             |  14 +--
 .../python/ops/dynamic_attention_wrapper.py        |   4 +-
 .../docs_src/api_guides/python/contrib.seq2seq.md  | 108 +++++++++++++++++++++
 4 files changed, 114 insertions(+), 21 deletions(-)
 delete mode 100644 tensorflow/contrib/seq2seq/README.md
 create mode 100644 tensorflow/docs_src/api_guides/python/contrib.seq2seq.md

diff --git a/tensorflow/contrib/seq2seq/README.md b/tensorflow/contrib/seq2seq/README.md
deleted file mode 100644
index 50ac32ec15..0000000000
--- a/tensorflow/contrib/seq2seq/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# TensorFlow contrib seq2seq layers and losses
-
-## Layers
-
-Information to be added.
-
-## Losses
-
-Information to be added.

diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py
index f4fc557085..29bce7bbae 100644
--- a/tensorflow/contrib/seq2seq/__init__.py
+++ b/tensorflow/contrib/seq2seq/__init__.py
@@ -15,15 +15,14 @@
 """Ops for building neural network seq2seq decoders and losses.

-## Decoder base class and functions
+See the @{$python/contrib.seq2seq} guide.
+
 @@Decoder
 @@dynamic_decode

-## Basic Decoder
 @@BasicDecoderOutput
 @@BasicDecoder

-## Decoder Helpers
 @@Helper
 @@CustomHelper
 @@GreedyEmbeddingHelper
@@ -31,16 +30,11 @@
 @@ScheduledOutputTrainingHelper
 @@TrainingHelper

-## Attention
-
-### Scorers
-@@BahdanauScorer
-@@LuongScorer
+@@BahdanauAttention
+@@LuongAttention

-### Helper functions
 @@hardmax

-### RNNCells
 @@DynamicAttentionWrapperState
 @@DynamicAttentionWrapper
 """

diff --git a/tensorflow/contrib/seq2seq/python/ops/dynamic_attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/dynamic_attention_wrapper.py
index 0f6bb72fff..94678629c8 100644
--- a/tensorflow/contrib/seq2seq/python/ops/dynamic_attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/dynamic_attention_wrapper.py
@@ -425,8 +425,8 @@ class DynamicAttentionWrapper(core_rnn_cell.RNNCell):
     cell_input_fn: (optional) A `callable`. The default is:
       `lambda inputs, attention: array_ops.concat([inputs, attention], -1)`.
     probability_fn: (optional) A `callable`. Converts the score to
-      probabilities. The default is `tf.nn.softmax`. Other options include
-      `tf.contrib.seq2seq.hardmax` and `tf.contrib.sparsemax.sparsemax`.
+      probabilities. The default is @{tf.nn.softmax}. Other options include
+      @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
     output_attention: Python bool. If `True` (default), the output at each
       time step is the attention value. This is the behavior of Luong-style
       attention mechanisms. If `False`, the output at each time step is

diff --git a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
new file mode 100644
index 0000000000..223bf4a0a3
--- /dev/null
+++ b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
@@ -0,0 +1,108 @@
# Seq2seq Library (contrib)
[TOC]

Module for constructing seq2seq models and dynamic decoding. Builds on top of
libraries in @{tf.contrib.rnn}.

This library is composed of two primary components:

* New attention wrappers for @{tf.contrib.rnn.RNNCell} objects.
* A new object-oriented dynamic decoding framework.

## Attention

Attention wrappers are `RNNCell` objects that wrap other `RNNCell` objects and
implement attention. The form of attention is determined by a subclass of
@{tf.contrib.seq2seq.AttentionMechanism}. These subclasses describe the form
of attention (e.g. additive vs. multiplicative) to use when creating the
wrapper. An instance of an `AttentionMechanism` is constructed with a
`memory` tensor, from which lookup keys and values tensors are created.

### Attention Mechanisms

The two basic attention mechanisms are:
* @{tf.contrib.seq2seq.BahdanauAttention} (additive attention,
  [ref.](https://arxiv.org/abs/1409.0473))
* @{tf.contrib.seq2seq.LuongAttention} (multiplicative attention,
  [ref.](https://arxiv.org/abs/1508.04025))

The `memory` tensor passed to the attention mechanism's constructor is
expected to be shaped `[batch_size, memory_max_time, memory_depth]`. Many
mechanisms also accept an optional `memory_sequence_length` vector; if it is
provided, the rows of the `memory` tensor are masked with zeros past their
true sequence lengths.

Attention mechanisms also have a concept of depth, usually set by the
construction parameter `num_units`. For some kinds of attention (like
`BahdanauAttention`), both queries and memory are projected to tensors of
depth `num_units`. For other kinds (like `LuongAttention`), `num_units`
should match the depth of the queries, and the `memory` tensor will be
projected to this depth.

### Attention Wrappers

The basic attention wrapper is @{tf.contrib.seq2seq.DynamicAttentionWrapper}.
This wrapper accepts an `RNNCell` instance, an instance of
`AttentionMechanism`, and an attention depth parameter (`attention_size`), as
well as several optional arguments that allow one to customize intermediate
calculations.

At each time step, the basic calculation performed by this wrapper is:

```python
# Mix the inputs and the previous attention (configurable: cell_input_fn).
cell_inputs = concat([inputs, prev_state.attention], -1)
cell_output, next_cell_state = cell(cell_inputs, prev_state.cell_state)
# Score the memory against the cell output, then normalize the score
# (configurable: probability_fn).
score = attention_mechanism(cell_output)
alignments = softmax(score)
# Form the context vector and project it down to attention_size.
context = matmul(alignments, attention_mechanism.values)
attention = tf.layers.Dense(attention_size)(concat([cell_output, context], 1))
next_state = DynamicAttentionWrapperState(
    cell_state=next_cell_state,
    attention=attention)
# Configurable via output_attention: emit attention or cell_output.
output = attention
return output, next_state
```

In practice, a number of these intermediate calculations are configurable.
The initial concatenation of `inputs` and `prev_state.attention` can be
replaced with another mixing function (`cell_input_fn`); the `softmax` used
to calculate `alignments` from the `score` can be replaced with an
alternative such as @{tf.contrib.seq2seq.hardmax} (`probability_fn`); and the
wrapper can be configured to return the value `cell_output` instead of
`attention` (`output_attention`).
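
For instance, the following sketch constructs a wrapper that overrides two of
these options. The `encoder_outputs` and `encoder_sequence_length` tensors,
and the sizes chosen here, are hypothetical stand-ins rather than part of the
library:

```python
import tensorflow as tf

# Hypothetical encoder memory, shaped [batch_size, max_time, memory_depth],
# plus a [batch_size] vector of true memory lengths used for masking.
attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
    num_units=128,
    memory=encoder_outputs,
    memory_sequence_length=encoder_sequence_length)

attn_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(
    tf.contrib.rnn.LSTMCell(256),
    attention_mechanism,
    attention_size=128,
    # Compute alignments with hardmax instead of the default softmax.
    probability_fn=tf.contrib.seq2seq.hardmax,
    # Emit the cell output at each step instead of the attention value.
    output_attention=False)
```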

The benefit of using a `DynamicAttentionWrapper` is that it plays nicely with
other wrappers and the dynamic decoder described below. For example, one can
write:

```python
cell = tf.contrib.rnn.DeviceWrapper(tf.contrib.rnn.LSTMCell(512), "/gpu:0")
attention_mechanism = tf.contrib.seq2seq.LuongAttention(512, encoder_outputs)
attn_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(
    cell, attention_mechanism, attention_size=256)
attn_cell = tf.contrib.rnn.DeviceWrapper(attn_cell, "/gpu:1")
top_cell = tf.contrib.rnn.DeviceWrapper(tf.contrib.rnn.LSTMCell(512), "/gpu:1")
multi_cell = tf.contrib.rnn.MultiRNNCell([attn_cell, top_cell])
```

The `multi_cell` will perform the bottom layer calculations on GPU 0;
attention calculations will be performed on GPU 1 and immediately passed up
to the top layer, which is also calculated on GPU 1. The attention is also
passed forward in time to the next time step and copied to GPU 0 for the
next time step of `cell`. (*Note*: This is just an example of use, not a
suggested device partitioning strategy.)

## Dynamic Decoding

A decoding loop is built from a `Decoder` instance, which is unrolled by
@{tf.contrib.seq2seq.dynamic_decode}; `Helper` objects determine the
decoder's input at each time step. A usage sketch follows the class lists
below.

### Decoder base class and functions
* @{tf.contrib.seq2seq.Decoder}
* @{tf.contrib.seq2seq.dynamic_decode}

### Basic Decoder
* @{tf.contrib.seq2seq.BasicDecoderOutput}
* @{tf.contrib.seq2seq.BasicDecoder}

### Decoder Helpers
* @{tf.contrib.seq2seq.Helper}
* @{tf.contrib.seq2seq.CustomHelper}
* @{tf.contrib.seq2seq.GreedyEmbeddingHelper}
* @{tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper}
* @{tf.contrib.seq2seq.ScheduledOutputTrainingHelper}
* @{tf.contrib.seq2seq.TrainingHelper}
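
Here is the usage sketch referenced above: a minimal training-time decoding
example that reuses `multi_cell` from the attention section. The
`decoder_inputs`, `decoder_lengths`, and `batch_size` names are hypothetical
stand-ins for tensors built elsewhere, and exact return signatures may vary
slightly across versions:

```python
import tensorflow as tf

# Hypothetical inputs: decoder_inputs is [batch_size, max_time, input_depth];
# decoder_lengths is a [batch_size] vector of true decoder sequence lengths.
helper = tf.contrib.seq2seq.TrainingHelper(
    inputs=decoder_inputs, sequence_length=decoder_lengths)

decoder = tf.contrib.seq2seq.BasicDecoder(
    cell=multi_cell,
    helper=helper,
    initial_state=multi_cell.zero_state(batch_size, tf.float32))

# dynamic_decode runs the decoding loop; outputs is a BasicDecoderOutput
# whose rnn_output field holds the per-step cell outputs.
outputs, final_state = tf.contrib.seq2seq.dynamic_decode(decoder)
```

At inference time, one would typically swap `TrainingHelper` for
@{tf.contrib.seq2seq.GreedyEmbeddingHelper}, which feeds the embedding of
each greedily sampled token back in as the next input.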