Diffstat (limited to 'tensorflow/models/rnn/translate/seq2seq_model.py')
-rw-r--r-- | tensorflow/models/rnn/translate/seq2seq_model.py | 268
1 files changed, 268 insertions, 0 deletions
diff --git a/tensorflow/models/rnn/translate/seq2seq_model.py b/tensorflow/models/rnn/translate/seq2seq_model.py
new file mode 100644
index 0000000000..3c9cfb007f
--- /dev/null
+++ b/tensorflow/models/rnn/translate/seq2seq_model.py
@@ -0,0 +1,268 @@
+"""Sequence-to-sequence model with an attention mechanism."""
+
+import random
+
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.models.rnn import rnn_cell
+from tensorflow.models.rnn import seq2seq
+
+from tensorflow.models.rnn.translate import data_utils
+
+
+class Seq2SeqModel(object):
+  """Sequence-to-sequence model with attention and for multiple buckets.
+
+  This class implements a multi-layer recurrent neural network as encoder,
+  and an attention-based decoder. This is the same as the model described in
+  this paper: http://arxiv.org/abs/1412.7449 - please look there for details,
+  or into the seq2seq library for the complete model implementation.
+  This class also allows using GRU cells in addition to LSTM cells, and
+  sampled softmax to handle large output vocabulary size. A single-layer
+  version of this model, but with a bi-directional encoder, was presented in
+    http://arxiv.org/abs/1409.0473
+  and sampled softmax is described in Section 3 of the following paper.
+    http://arxiv.org/pdf/1412.2007v2.pdf
+  """
+
+  def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
+               num_layers, max_gradient_norm, batch_size, learning_rate,
+               learning_rate_decay_factor, use_lstm=False,
+               num_samples=512, forward_only=False):
+    """Create the model.
+
+    Args:
+      source_vocab_size: size of the source vocabulary.
+      target_vocab_size: size of the target vocabulary.
+      buckets: a list of pairs (I, O), where I specifies maximum input length
+        that will be processed in that bucket, and O specifies maximum output
+        length. Training instances that have inputs longer than I or outputs
+        longer than O will be pushed to the next bucket and padded accordingly.
+        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
+      size: number of units in each layer of the model.
+      num_layers: number of layers in the model.
+      max_gradient_norm: gradients will be clipped to maximally this norm.
+      batch_size: the size of the batches used during training;
+        the model construction is independent of batch_size, so it can be
+        changed after initialization if this is convenient, e.g., for decoding.
+      learning_rate: learning rate to start with.
+      learning_rate_decay_factor: decay learning rate by this much when needed.
+      use_lstm: if true, we use LSTM cells instead of GRU cells.
+      num_samples: number of samples for sampled softmax.
+      forward_only: if set, we do not construct the backward pass in the model.
+    """
+    self.source_vocab_size = source_vocab_size
+    self.target_vocab_size = target_vocab_size
+    self.buckets = buckets
+    self.batch_size = batch_size
+    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
+    self.learning_rate_decay_op = self.learning_rate.assign(
+        self.learning_rate * learning_rate_decay_factor)
+    self.global_step = tf.Variable(0, trainable=False)
+
+    # If we use sampled softmax, we need an output projection.
+    output_projection = None
+    softmax_loss_function = None
+    # Sampled softmax only makes sense if we sample less than vocabulary size.
+    if num_samples > 0 and num_samples < self.target_vocab_size:
+      with tf.device("/cpu:0"):
+        w = tf.get_variable("proj_w", [size, self.target_vocab_size])
+        w_t = tf.transpose(w)
+        b = tf.get_variable("proj_b", [self.target_vocab_size])
+      output_projection = (w, b)
+
+      def sampled_loss(inputs, labels):
+        with tf.device("/cpu:0"):
+          labels = tf.reshape(labels, [-1, 1])
+          return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
+                                            self.target_vocab_size)
+      softmax_loss_function = sampled_loss
+
+    # Create the internal multi-layer cell for our RNN.
+    single_cell = rnn_cell.GRUCell(size)
+    if use_lstm:
+      single_cell = rnn_cell.BasicLSTMCell(size)
+    cell = single_cell
+    if num_layers > 1:
+      cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
+
+    # The seq2seq function: we use embedding for the input and attention.
+    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
+      return seq2seq.embedding_attention_seq2seq(
+          encoder_inputs, decoder_inputs, cell, source_vocab_size,
+          target_vocab_size, output_projection=output_projection,
+          feed_previous=do_decode)
+
+    # Feeds for inputs.
+    self.encoder_inputs = []
+    self.decoder_inputs = []
+    self.target_weights = []
+    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
+      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
+                                                name="encoder{0}".format(i)))
+    for i in xrange(buckets[-1][1] + 1):
+      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
+                                                name="decoder{0}".format(i)))
+      self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
+                                                name="weight{0}".format(i)))
+
+    # Our targets are decoder inputs shifted by one.
+    targets = [self.decoder_inputs[i + 1]
+               for i in xrange(len(self.decoder_inputs) - 1)]
+
+    # Training outputs and losses.
+    if forward_only:
+      self.outputs, self.losses = seq2seq.model_with_buckets(
+          self.encoder_inputs, self.decoder_inputs, targets,
+          self.target_weights, buckets, self.target_vocab_size,
+          lambda x, y: seq2seq_f(x, y, True),
+          softmax_loss_function=softmax_loss_function)
+      # If we use output projection, we need to project outputs for decoding.
+      if output_projection is not None:
+        for b in xrange(len(buckets)):
+          self.outputs[b] = [tf.nn.xw_plus_b(output, output_projection[0],
+                                             output_projection[1])
+                             for output in self.outputs[b]]
+    else:
+      self.outputs, self.losses = seq2seq.model_with_buckets(
+          self.encoder_inputs, self.decoder_inputs, targets,
+          self.target_weights, buckets, self.target_vocab_size,
+          lambda x, y: seq2seq_f(x, y, False),
+          softmax_loss_function=softmax_loss_function)
+
+    # Gradients and SGD update operation for training the model.
+    params = tf.trainable_variables()
+    if not forward_only:
+      self.gradient_norms = []
+      self.updates = []
+      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
+      for b in xrange(len(buckets)):
+        gradients = tf.gradients(self.losses[b], params)
+        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
+                                                         max_gradient_norm)
+        self.gradient_norms.append(norm)
+        self.updates.append(opt.apply_gradients(
+            zip(clipped_gradients, params), global_step=self.global_step))
+
+    self.saver = tf.train.Saver(tf.all_variables())
+
+  def step(self, session, encoder_inputs, decoder_inputs, target_weights,
+           bucket_id, forward_only):
+    """Run a step of the model feeding the given inputs.
+
+    Args:
+      session: tensorflow session to use.
+      encoder_inputs: list of numpy int vectors to feed as encoder inputs.
+      decoder_inputs: list of numpy int vectors to feed as decoder inputs.
+      target_weights: list of numpy float vectors to feed as target weights.
+      bucket_id: which bucket of the model to use.
+      forward_only: whether to do the backward step or only forward.
+
+    Returns:
+      A triple consisting of gradient norm (or None if we did not do backward),
+      average perplexity, and the outputs.
+
+    Raises:
+      ValueError: if length of encoder_inputs, decoder_inputs, or
+        target_weights disagrees with bucket size for the specified bucket_id.
+    """
+    # Check if the sizes match.
+    encoder_size, decoder_size = self.buckets[bucket_id]
+    if len(encoder_inputs) != encoder_size:
+      raise ValueError("Encoder length must be equal to the one in bucket,"
+                       " %d != %d." % (len(encoder_inputs), encoder_size))
+    if len(decoder_inputs) != decoder_size:
+      raise ValueError("Decoder length must be equal to the one in bucket,"
+                       " %d != %d." % (len(decoder_inputs), decoder_size))
+    if len(target_weights) != decoder_size:
+      raise ValueError("Weights length must be equal to the one in bucket,"
+                       " %d != %d." % (len(target_weights), decoder_size))
+
+    # Input feed: encoder inputs, decoder inputs, target_weights, as provided.
+    input_feed = {}
+    for l in xrange(encoder_size):
+      input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
+    for l in xrange(decoder_size):
+      input_feed[self.decoder_inputs[l].name] = decoder_inputs[l]
+      input_feed[self.target_weights[l].name] = target_weights[l]
+
+    # Since our targets are decoder inputs shifted by one, we need one more.
+    last_target = self.decoder_inputs[decoder_size].name
+    input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32)
+
+    # Output feed: depends on whether we do a backward step or not.
+    if not forward_only:
+      output_feed = [self.updates[bucket_id],  # Update Op that does SGD.
+                     self.gradient_norms[bucket_id],  # Gradient norm.
+                     self.losses[bucket_id]]  # Loss for this batch.
+    else:
+      output_feed = [self.losses[bucket_id]]  # Loss for this batch.
+      for l in xrange(decoder_size):  # Output logits.
+        output_feed.append(self.outputs[bucket_id][l])
+
+    outputs = session.run(output_feed, input_feed)
+    if not forward_only:
+      return outputs[1], outputs[2], None  # Gradient norm, loss, no outputs.
+    else:
+      return None, outputs[0], outputs[1:]  # No gradient norm, loss, outputs.
+
+  def get_batch(self, data, bucket_id):
+    """Get a random batch of data from the specified bucket, prepare for step.
+
+    To feed data in step(...) it must be a list of batch-major vectors, while
+    data here contains single length-major cases. So the main logic of this
+    function is to re-index data cases to be in the proper format for feeding.
+
+    Args:
+      data: a tuple of size len(self.buckets) in which each element contains
+        lists of pairs of input and output data that we use to create a batch.
+      bucket_id: integer, which bucket to get the batch for.
+
+    Returns:
+      The triple (encoder_inputs, decoder_inputs, target_weights) for
+      the constructed batch that has the proper format to call step(...) later.
+    """
+    encoder_size, decoder_size = self.buckets[bucket_id]
+    encoder_inputs, decoder_inputs = [], []
+
+    # Get a random batch of encoder and decoder inputs from data,
+    # pad them if needed, reverse encoder inputs and add GO to decoder.
+    for _ in xrange(self.batch_size):
+      encoder_input, decoder_input = random.choice(data[bucket_id])
+
+      # Encoder inputs are padded and then reversed.
+      encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input))
+      encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))
+
+      # Decoder inputs get an extra "GO" symbol, and are then padded.
+      decoder_pad_size = decoder_size - len(decoder_input) - 1
+      decoder_inputs.append([data_utils.GO_ID] + decoder_input +
+                            [data_utils.PAD_ID] * decoder_pad_size)
+
+    # Now we create batch-major vectors from the data selected above.
+    batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], []
+
+    # Batch encoder inputs are just re-indexed encoder_inputs.
+    for length_idx in xrange(encoder_size):
+      batch_encoder_inputs.append(
+          np.array([encoder_inputs[batch_idx][length_idx]
+                    for batch_idx in xrange(self.batch_size)], dtype=np.int32))
+
+    # Batch decoder inputs are re-indexed decoder_inputs; we also create weights.
+    for length_idx in xrange(decoder_size):
+      batch_decoder_inputs.append(
+          np.array([decoder_inputs[batch_idx][length_idx]
+                    for batch_idx in xrange(self.batch_size)], dtype=np.int32))
+
+      # Create target_weights to be 0 for targets that are padding.
+      batch_weight = np.ones(self.batch_size, dtype=np.float32)
+      for batch_idx in xrange(self.batch_size):
+        # We set weight to 0 if the corresponding target is a PAD symbol.
+        # The corresponding target is decoder_input shifted by 1 forward.
+        if length_idx < decoder_size - 1:
+          target = decoder_inputs[batch_idx][length_idx + 1]
+        if length_idx == decoder_size - 1 or target == data_utils.PAD_ID:
+          batch_weight[batch_idx] = 0.0
+      batch_weights.append(batch_weight)
+    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
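For orientation, here is a minimal training-loop sketch against the class added above. This is a sketch under stated assumptions, not part of the commit: the bucket list and hyperparameters are hypothetical placeholders, `train_set` stands for data in the format the get_batch docstring describes (one list of (source_ids, target_ids) pairs per bucket), and the TF 0.x-era APIs of this commit (e.g. tf.initialize_all_variables) are assumed.

import tensorflow as tf
from tensorflow.models.rnn.translate import seq2seq_model

buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]  # Hypothetical bucket list.
with tf.Session() as sess:
  model = seq2seq_model.Seq2SeqModel(
      source_vocab_size=40000, target_vocab_size=40000, buckets=buckets,
      size=1024, num_layers=3, max_gradient_norm=5.0, batch_size=64,
      learning_rate=0.5, learning_rate_decay_factor=0.99)
  sess.run(tf.initialize_all_variables())

  bucket_id = 0  # In real training, chosen according to bucket data sizes.
  # get_batch pads, reverses encoder inputs, prepends GO, and re-indexes to
  # the length-major feed format that step() expects.
  encoder_inputs, decoder_inputs, target_weights = model.get_batch(
      train_set, bucket_id)
  # With forward_only=False, step() returns (gradient norm, loss, None).
  grad_norm, loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                  target_weights, bucket_id, False)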
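Decoding reuses the same graph built with forward_only=True. Because the sampled-softmax path creates an output projection, the constructor projects self.outputs back to full-vocabulary logits in that mode, so a greedy decoder can take an argmax per position. Another sketch, with assumptions: `model` and `sess` as above but constructed with forward_only=True, `sentence_ids` a hypothetical list of source-token ids, and data_utils.EOS_ID following the PAD_ID/GO_ID conventions this file already imports.

import numpy as np
from tensorflow.models.rnn.translate import data_utils

model.batch_size = 1  # Batch size is not baked into the graph; decode one case.
# Pick the smallest bucket whose input side fits the sentence.
bucket_id = min(b for b in range(len(buckets))
                if buckets[b][0] > len(sentence_ids))
encoder_inputs, decoder_inputs, target_weights = model.get_batch(
    {bucket_id: [(sentence_ids, [])]}, bucket_id)
# With forward_only=True, step() returns (None, loss, output_logits).
_, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                 target_weights, bucket_id, True)
# Greedy decode: argmax over each (batch, vocab)-shaped logit array.
outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
if data_utils.EOS_ID in outputs:
  outputs = outputs[:outputs.index(data_utils.EOS_ID)]  # Cut at end symbol.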
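The length-major/batch-major re-indexing and the PAD-weight masking in get_batch can be illustrated with plain numpy, independent of TensorFlow. The constants and the two toy cases below are made up; the loop mirrors the weight logic above, including the short-circuit that zeroes the final position.

import numpy as np

PAD_ID, GO_ID = 0, 1  # Stand-ins for the data_utils constants.
decoder_inputs = [[GO_ID, 4, 5, PAD_ID],  # case 0: two real tokens plus padding
                  [GO_ID, 7, 8, 9]]       # case 1: fills the bucket exactly
batch_size, decoder_size = 2, 4

for length_idx in range(decoder_size):
  # One length-major vector per time step, the shape step() wants to feed.
  step_input = np.array([decoder_inputs[b][length_idx]
                         for b in range(batch_size)], dtype=np.int32)
  weight = np.ones(batch_size, dtype=np.float32)
  for b in range(batch_size):
    # Weight is 0 when the shifted-by-one target is PAD (or past the end).
    last = length_idx == decoder_size - 1
    if last or decoder_inputs[b][length_idx + 1] == PAD_ID:
      weight[b] = 0.0
  print(step_input, weight)
# Prints: [1 1] [1. 1.] / [4 7] [1. 1.] / [5 8] [0. 1.] / [0 9] [0. 0.]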