Diffstat (limited to 'tensorflow/models/rnn/translate/seq2seq_model.py')
-rw-r--r-- | tensorflow/models/rnn/translate/seq2seq_model.py | 268
1 files changed, 268 insertions, 0 deletions
diff --git a/tensorflow/models/rnn/translate/seq2seq_model.py b/tensorflow/models/rnn/translate/seq2seq_model.py
new file mode 100644
index 0000000000..3c9cfb007f
--- /dev/null
+++ b/tensorflow/models/rnn/translate/seq2seq_model.py
@@ -0,0 +1,268 @@
+"""Sequence-to-sequence model with an attention mechanism."""
+
+import random
+
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.models.rnn import rnn_cell
+from tensorflow.models.rnn import seq2seq
+
+from tensorflow.models.rnn.translate import data_utils
+
+
+class Seq2SeqModel(object):
+  """Sequence-to-sequence model with attention and for multiple buckets.
+
+  This class implements a multi-layer recurrent neural network as encoder,
+  and an attention-based decoder. This is the same as the model described in
+  this paper: http://arxiv.org/abs/1412.7449 - please look there for details,
+  or into the seq2seq library for the complete model implementation.
+  This class also allows using GRU cells in addition to LSTM cells, and
+  sampled softmax to handle large output vocabulary size. A single-layer
+  version of this model, but with a bi-directional encoder, was presented in
+    http://arxiv.org/abs/1409.0473
+  and sampled softmax is described in Section 3 of the following paper.
+    http://arxiv.org/pdf/1412.2007v2.pdf
+  """
+
+  def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
+               num_layers, max_gradient_norm, batch_size, learning_rate,
+               learning_rate_decay_factor, use_lstm=False,
+               num_samples=512, forward_only=False):
+    """Create the model.
+
+    Args:
+      source_vocab_size: size of the source vocabulary.
+      target_vocab_size: size of the target vocabulary.
+      buckets: a list of pairs (I, O), where I specifies maximum input length
+        that will be processed in that bucket, and O specifies maximum output
+        length. Training instances that have inputs longer than I or outputs
+        longer than O will be pushed to the next bucket and padded accordingly.
+        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
+      size: number of units in each layer of the model.
+      num_layers: number of layers in the model.
+      max_gradient_norm: gradients will be clipped to maximally this norm.
+      batch_size: the size of the batches used during training;
+        the model construction is independent of batch_size, so it can be
+        changed after initialization if this is convenient, e.g., for decoding.
+      learning_rate: learning rate to start with.
+      learning_rate_decay_factor: decay learning rate by this much when needed.
+      use_lstm: if true, we use LSTM cells instead of GRU cells.
+      num_samples: number of samples for sampled softmax.
+      forward_only: if set, we do not construct the backward pass in the model.
+    """
+    self.source_vocab_size = source_vocab_size
+    self.target_vocab_size = target_vocab_size
+    self.buckets = buckets
+    self.batch_size = batch_size
+    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
+    self.learning_rate_decay_op = self.learning_rate.assign(
+        self.learning_rate * learning_rate_decay_factor)
+    self.global_step = tf.Variable(0, trainable=False)
+
+    # If we use sampled softmax, we need an output projection.
+    output_projection = None
+    softmax_loss_function = None
+    # Sampled softmax only makes sense if we sample less than vocabulary size.
+    if num_samples > 0 and num_samples < self.target_vocab_size:
+      with tf.device("/cpu:0"):
+        w = tf.get_variable("proj_w", [size, self.target_vocab_size])
+        w_t = tf.transpose(w)
+        b = tf.get_variable("proj_b", [self.target_vocab_size])
+      output_projection = (w, b)
+
+      def sampled_loss(inputs, labels):
+        with tf.device("/cpu:0"):
+          labels = tf.reshape(labels, [-1, 1])
+          return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
+                                            self.target_vocab_size)
+      softmax_loss_function = sampled_loss
+
+    # Create the internal multi-layer cell for our RNN.
+    single_cell = rnn_cell.GRUCell(size)
+    if use_lstm:
+      single_cell = rnn_cell.BasicLSTMCell(size)
+    cell = single_cell
+    if num_layers > 1:
+      cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
+
+    # The seq2seq function: we use embedding for the input and attention.
+    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
+      return seq2seq.embedding_attention_seq2seq(
+          encoder_inputs, decoder_inputs, cell, source_vocab_size,
+          target_vocab_size, output_projection=output_projection,
+          feed_previous=do_decode)
+
+    # Feeds for inputs.
+    self.encoder_inputs = []
+    self.decoder_inputs = []
+    self.target_weights = []
+    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
+      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
+                                                name="encoder{0}".format(i)))
+    for i in xrange(buckets[-1][1] + 1):
+      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
+                                                name="decoder{0}".format(i)))
+      self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
+                                                name="weight{0}".format(i)))
+
+    # Our targets are decoder inputs shifted by one.
+    targets = [self.decoder_inputs[i + 1]
+               for i in xrange(len(self.decoder_inputs) - 1)]
+
+    # Training outputs and losses.
+    if forward_only:
+      self.outputs, self.losses = seq2seq.model_with_buckets(
+          self.encoder_inputs, self.decoder_inputs, targets,
+          self.target_weights, buckets, self.target_vocab_size,
+          lambda x, y: seq2seq_f(x, y, True),
+          softmax_loss_function=softmax_loss_function)
+      # If we use output projection, we need to project outputs for decoding.
+      if output_projection is not None:
+        for b in xrange(len(buckets)):
+          self.outputs[b] = [tf.nn.xw_plus_b(output, output_projection[0],
+                                             output_projection[1])
+                             for output in self.outputs[b]]
+    else:
+      self.outputs, self.losses = seq2seq.model_with_buckets(
+          self.encoder_inputs, self.decoder_inputs, targets,
+          self.target_weights, buckets, self.target_vocab_size,
+          lambda x, y: seq2seq_f(x, y, False),
+          softmax_loss_function=softmax_loss_function)
+
+    # Gradients and SGD update operation for training the model.
+    params = tf.trainable_variables()
+    if not forward_only:
+      self.gradient_norms = []
+      self.updates = []
+      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
+      for b in xrange(len(buckets)):
+        gradients = tf.gradients(self.losses[b], params)
+        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
+                                                         max_gradient_norm)
+        self.gradient_norms.append(norm)
+        self.updates.append(opt.apply_gradients(
+            zip(clipped_gradients, params), global_step=self.global_step))
+
+    self.saver = tf.train.Saver(tf.all_variables())
+
+  def step(self, session, encoder_inputs, decoder_inputs, target_weights,
+           bucket_id, forward_only):
+    """Run a step of the model feeding the given inputs.
+
+    Args:
+      session: tensorflow session to use.
+      encoder_inputs: list of numpy int vectors to feed as encoder inputs.
+      decoder_inputs: list of numpy int vectors to feed as decoder inputs.
+      target_weights: list of numpy float vectors to feed as target weights.
+      bucket_id: which bucket of the model to use.
+      forward_only: whether to do the backward step or only forward.
+
+    Returns:
+      A triple consisting of gradient norm (or None if we did not do backward),
+      average perplexity, and the outputs.
+
+    Raises:
+      ValueError: if length of encoder_inputs, decoder_inputs, or
+        target_weights disagrees with bucket size for the specified bucket_id.
+    """
+    # Check if the sizes match.
+    encoder_size, decoder_size = self.buckets[bucket_id]
+    if len(encoder_inputs) != encoder_size:
+      raise ValueError("Encoder length must be equal to the one in bucket,"
+                       " %d != %d." % (len(encoder_inputs), encoder_size))
+    if len(decoder_inputs) != decoder_size:
+      raise ValueError("Decoder length must be equal to the one in bucket,"
+                       " %d != %d." % (len(decoder_inputs), decoder_size))
+    if len(target_weights) != decoder_size:
+      raise ValueError("Weights length must be equal to the one in bucket,"
+                       " %d != %d." % (len(target_weights), decoder_size))
+
+    # Input feed: encoder inputs, decoder inputs, target_weights, as provided.
+    input_feed = {}
+    for l in xrange(encoder_size):
+      input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
+    for l in xrange(decoder_size):
+      input_feed[self.decoder_inputs[l].name] = decoder_inputs[l]
+      input_feed[self.target_weights[l].name] = target_weights[l]
+
+    # Since our targets are decoder inputs shifted by one, we need one more.
+    last_target = self.decoder_inputs[decoder_size].name
+    input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32)
+
+    # Output feed: depends on whether we do a backward step or not.
+    if not forward_only:
+      output_feed = [self.updates[bucket_id],  # Update Op that does SGD.
+                     self.gradient_norms[bucket_id],  # Gradient norm.
+                     self.losses[bucket_id]]  # Loss for this batch.
+    else:
+      output_feed = [self.losses[bucket_id]]  # Loss for this batch.
+      for l in xrange(decoder_size):  # Output logits.
+        output_feed.append(self.outputs[bucket_id][l])
+
+    outputs = session.run(output_feed, input_feed)
+    if not forward_only:
+      return outputs[1], outputs[2], None  # Gradient norm, loss, no outputs.
+    else:
+      return None, outputs[0], outputs[1:]  # No gradient norm, loss, outputs.
+
+  def get_batch(self, data, bucket_id):
+    """Get a random batch of data from the specified bucket, prepare for step.
+
+    To feed data in step(...) it must be a list of batch-major vectors, while
+    data here contains single length-major cases. So the main logic of this
+    function is to re-index data cases to be in the proper format for feeding.
+
+    Args:
+      data: a tuple of size len(self.buckets) in which each element contains
+        lists of pairs of input and output data that we use to create a batch.
+      bucket_id: integer, which bucket to get the batch for.
+
+    Returns:
+      The triple (encoder_inputs, decoder_inputs, target_weights) for
+      the constructed batch that has the proper format to call step(...) later.
+    """
+    encoder_size, decoder_size = self.buckets[bucket_id]
+    encoder_inputs, decoder_inputs = [], []
+
+    # Get a random batch of encoder and decoder inputs from data,
+    # pad them if needed, reverse encoder inputs and add GO to decoder.
+    for _ in xrange(self.batch_size):
+      encoder_input, decoder_input = random.choice(data[bucket_id])
+
+      # Encoder inputs are padded and then reversed.
+      encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input))
+      encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))
+
+      # Decoder inputs get an extra "GO" symbol, and are then padded.
+      decoder_pad_size = decoder_size - len(decoder_input) - 1
+      decoder_inputs.append([data_utils.GO_ID] + decoder_input +
+                            [data_utils.PAD_ID] * decoder_pad_size)
+
+    # Now we create batch-major vectors from the data selected above.
+    batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], []
+
+    # Batch encoder inputs are just re-indexed encoder_inputs.
+    for length_idx in xrange(encoder_size):
+      batch_encoder_inputs.append(
+          np.array([encoder_inputs[batch_idx][length_idx]
+                    for batch_idx in xrange(self.batch_size)], dtype=np.int32))
+
+    # Batch decoder inputs are re-indexed decoder_inputs; we also create weights.
+    for length_idx in xrange(decoder_size):
+      batch_decoder_inputs.append(
+          np.array([decoder_inputs[batch_idx][length_idx]
+                    for batch_idx in xrange(self.batch_size)], dtype=np.int32))
+
+      # Create target_weights to be 0 for targets that are padding.
+      batch_weight = np.ones(self.batch_size, dtype=np.float32)
+      for batch_idx in xrange(self.batch_size):
+        # We set weight to 0 if the corresponding target is a PAD symbol.
+        # The corresponding target is decoder_input shifted by 1 forward.
+        if length_idx < decoder_size - 1:
+          target = decoder_inputs[batch_idx][length_idx + 1]
+        if length_idx == decoder_size - 1 or target == data_utils.PAD_ID:
+          batch_weight[batch_idx] = 0.0
+      batch_weights.append(batch_weight)
+    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
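For orientation, here is a minimal training-loop sketch against the class added above. This is a sketch under stated assumptions, not part of the commit: the bucket list and hyperparameters are hypothetical placeholders, `train_set` stands for data in the format the get_batch docstring describes (one list of (source_ids, target_ids) pairs per bucket), and the TF 0.x-era APIs of this commit (e.g. tf.initialize_all_variables) are assumed.

import tensorflow as tf
from tensorflow.models.rnn.translate import seq2seq_model

buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]  # Hypothetical bucket list.
with tf.Session() as sess:
  model = seq2seq_model.Seq2SeqModel(
      source_vocab_size=40000, target_vocab_size=40000, buckets=buckets,
      size=1024, num_layers=3, max_gradient_norm=5.0, batch_size=64,
      learning_rate=0.5, learning_rate_decay_factor=0.99)
  sess.run(tf.initialize_all_variables())

  bucket_id = 0  # In real training, chosen according to bucket data sizes.
  # get_batch pads, reverses encoder inputs, prepends GO, and re-indexes to
  # the length-major feed format that step() expects.
  encoder_inputs, decoder_inputs, target_weights = model.get_batch(
      train_set, bucket_id)
  # With forward_only=False, step() returns (gradient norm, loss, None).
  grad_norm, loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                  target_weights, bucket_id, False)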
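Decoding reuses the same graph built with forward_only=True. Because the sampled-softmax path creates an output projection, the constructor projects self.outputs back to full-vocabulary logits in that mode, so a greedy decoder can take an argmax per position. Another sketch, with assumptions: `model` and `sess` as above but constructed with forward_only=True, `sentence_ids` a hypothetical list of source-token ids, and data_utils.EOS_ID following the PAD_ID/GO_ID conventions this file already imports.

import numpy as np
from tensorflow.models.rnn.translate import data_utils

model.batch_size = 1  # Batch size is not baked into the graph; decode one case.
# Pick the smallest bucket whose input side fits the sentence.
bucket_id = min(b for b in range(len(buckets))
                if buckets[b][0] > len(sentence_ids))
encoder_inputs, decoder_inputs, target_weights = model.get_batch(
    {bucket_id: [(sentence_ids, [])]}, bucket_id)
# With forward_only=True, step() returns (None, loss, output_logits).
_, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                 target_weights, bucket_id, True)
# Greedy decode: argmax over each (batch, vocab)-shaped logit array.
outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
if data_utils.EOS_ID in outputs:
  outputs = outputs[:outputs.index(data_utils.EOS_ID)]  # Cut at end symbol.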
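The length-major/batch-major re-indexing and the PAD-weight masking in get_batch can be illustrated with plain numpy, independent of TensorFlow. The constants and the two toy cases below are made up; the loop mirrors the weight logic above, including the short-circuit that zeroes the final position.

import numpy as np

PAD_ID, GO_ID = 0, 1  # Stand-ins for the data_utils constants.
decoder_inputs = [[GO_ID, 4, 5, PAD_ID],  # case 0: two real tokens plus padding
                  [GO_ID, 7, 8, 9]]       # case 1: fills the bucket exactly
batch_size, decoder_size = 2, 4

for length_idx in range(decoder_size):
  # One length-major vector per time step, the shape step() wants to feed.
  step_input = np.array([decoder_inputs[b][length_idx]
                         for b in range(batch_size)], dtype=np.int32)
  weight = np.ones(batch_size, dtype=np.float32)
  for b in range(batch_size):
    # Weight is 0 when the shifted-by-one target is PAD (or past the end).
    last = length_idx == decoder_size - 1
    if last or decoder_inputs[b][length_idx + 1] == PAD_ID:
      weight[b] = 0.0
  print(step_input, weight)
# Prints: [1 1] [1. 1.] / [4 7] [1. 1.] / [5 8] [0. 1.] / [0 9] [0. 0.]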