Diffstat (limited to 'tensorflow/python/keras/_impl/keras/layers/recurrent.py')
-rw-r--r-- | tensorflow/python/keras/_impl/keras/layers/recurrent.py | 2383
1 file changed, 574 insertions, 1809 deletions
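For orientation: this change removes the cell-based recurrent API (`RNN`, `StackedRNNCells`, `SimpleRNNCell`, `GRUCell`, `LSTMCell`) and restores the earlier monolithic `Recurrent` base class. The following is a minimal sketch of the two usage styles, adapted from the docstring examples visible in the diff below; the top-level `keras` import path is an assumption (this file lives under `tensorflow.python.keras._impl`), and the shapes are arbitrary:

```python
import keras  # assumed import path; depends on the build

# Cell-based style removed by this diff (from the deleted RNN docstring):
cells = [keras.layers.LSTMCell(32), keras.layers.LSTMCell(32)]
x = keras.Input((10, 64))        # (timesteps, input_dim)
y = keras.layers.RNN(cells)(x)   # cells stacked into one efficient RNN

# Monolithic style restored by this diff (from the Recurrent docstring):
model = keras.models.Sequential()
model.add(keras.layers.LSTM(32, input_shape=(10, 64), return_sequences=True))
model.add(keras.layers.LSTM(32))  # upstream layer must set return_sequences=True
```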
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py index 2bc74d5f80..139523403c 100644 --- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py +++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py @@ -1,4 +1,4 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,209 +29,99 @@ from tensorflow.python.keras._impl.keras import initializers from tensorflow.python.keras._impl.keras import regularizers from tensorflow.python.keras._impl.keras.engine import InputSpec from tensorflow.python.keras._impl.keras.engine import Layer -from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg -from tensorflow.python.platform import tf_logging as logging -class StackedRNNCells(Layer): - """Wrapper allowing a stack of RNN cells to behave as a single cell. +# pylint: disable=access-member-before-definition - Used to implement efficient stacked RNNs. + +def _time_distributed_dense(x, + w, + b=None, + dropout=None, + input_dim=None, + output_dim=None, + timesteps=None, + training=None): + """Apply `y . w + b` for every temporal slice y of x. Arguments: - cells: List of RNN cell instances. + x: input tensor. + w: weight matrix. + b: optional bias vector. + dropout: whether to apply dropout (same dropout mask + for every temporal slice of the input). + input_dim: integer; optional dimensionality of the input. + output_dim: integer; optional dimensionality of the output. + timesteps: integer; optional number of timesteps. + training: training phase tensor or boolean. + + Returns: + Output tensor. + """ + if not input_dim: + input_dim = K.shape(x)[2] + if not timesteps: + timesteps = K.shape(x)[1] + if not output_dim: + output_dim = K.shape(w)[1] + + if dropout is not None and 0. < dropout < 1.: + # apply the same dropout pattern at every timestep + ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim))) + dropout_matrix = K.dropout(ones, dropout) + expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps) + x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training) + + # collapse time dimension and batch dimension together + x = K.reshape(x, (-1, input_dim)) + x = K.dot(x, w) + if b is not None: + x = K.bias_add(x, b) + # reshape to 3D tensor + if K.backend() == 'tensorflow': + x = K.reshape(x, K.stack([-1, timesteps, output_dim])) + x.set_shape([None, None, output_dim]) + else: + x = K.reshape(x, (-1, timesteps, output_dim)) + return x - Examples: - ```python - cells = [ - keras.layers.LSTMCell(output_dim), - keras.layers.LSTMCell(output_dim), - keras.layers.LSTMCell(output_dim), - ] - - inputs = keras.Input((timesteps, input_dim)) - x = keras.layers.RNN(cells)(inputs) - ``` - """ +class Recurrent(Layer): + """Abstract base class for recurrent layers. - def __init__(self, cells, **kwargs): - for cell in cells: - if not hasattr(cell, 'call'): - raise ValueError('All cells must have a `call` method. ' - 'received cells:', cells) - if not hasattr(cell, 'state_size'): - raise ValueError('All cells must have a ' - '`state_size` attribute. ' - 'received cells:', cells) - self.cells = cells - super(StackedRNNCells, self).__init__(**kwargs) - - @property - def state_size(self): - # States are a flat list - # in reverse order of the cell stack. 
- # This allows to preserve the requirement - # `stack.state_size[0] == output_dim`. - # e.g. states of a 2-layer LSTM would be - # `[h2, c2, h1, c1]` - # (assuming one LSTM has states [h, c]) - state_size = [] - for cell in self.cells[::-1]: - if hasattr(cell.state_size, '__len__'): - state_size += list(cell.state_size) - else: - state_size.append(cell.state_size) - return tuple(state_size) - - def call(self, inputs, states, **kwargs): - # Recover per-cell states. - nested_states = [] - for cell in self.cells[::-1]: - if hasattr(cell.state_size, '__len__'): - nested_states.append(states[:len(cell.state_size)]) - states = states[len(cell.state_size):] - else: - nested_states.append([states[0]]) - states = states[1:] - nested_states = nested_states[::-1] - - # Call the cells in order and store the returned states. - new_nested_states = [] - for cell, states in zip(self.cells, nested_states): - inputs, states = cell.call(inputs, states, **kwargs) - new_nested_states.append(states) - - # Format the new states as a flat list - # in reverse cell order. - states = [] - for cell_states in new_nested_states[::-1]: - states += cell_states - return inputs, states + Do not use in a model -- it's not a valid layer! + Use its children classes `LSTM`, `GRU` and `SimpleRNN` instead. - def build(self, input_shape): - for cell in self.cells: - if isinstance(cell, Layer): - cell.build(input_shape) - if hasattr(cell.state_size, '__len__'): - output_dim = cell.state_size[0] - else: - output_dim = cell.state_size - input_shape = (input_shape[0], input_shape[1], output_dim) - self.built = True + All recurrent layers (`LSTM`, `GRU`, `SimpleRNN`) also + follow the specifications of this class and accept + the keyword arguments listed below. - def get_config(self): - cells = [] - for cell in self.cells: - cells.append({ - 'class_name': cell.__class__.__name__, - 'config': cell.get_config() - }) - config = {'cells': cells} - base_config = super(StackedRNNCells, self).get_config() - return dict(list(base_config.items()) + list(config.items())) + Example: - @classmethod - def from_config(cls, config, custom_objects=None): - from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer # pylint: disable=g-import-not-at-top - cells = [] - for cell_config in config.pop('cells'): - cells.append( - deserialize_layer(cell_config, custom_objects=custom_objects)) - return cls(cells, **config) - - @property - def trainable_weights(self): - if not self.trainable: - return [] - weights = [] - for cell in self.cells: - if isinstance(cell, Layer): - weights += cell.trainable_weights - return weights - - @property - def non_trainable_weights(self): - weights = [] - for cell in self.cells: - if isinstance(cell, Layer): - weights += cell.non_trainable_weights - if not self.trainable: - trainable_weights = [] - for cell in self.cells: - if isinstance(cell, Layer): - trainable_weights += cell.trainable_weights - return trainable_weights + weights - return weights - - def get_weights(self): - """Retrieves the weights of the model. - - Returns: - A flat list of Numpy arrays. - """ - weights = [] - for cell in self.cells: - if isinstance(cell, Layer): - weights += cell.weights - return K.batch_get_value(weights) - - def set_weights(self, weights): - """Sets the weights of the model. - - Arguments: - weights: A list of Numpy arrays with shapes and types matching - the output of `model.get_weights()`. 
- """ - tuples = [] - for cell in self.cells: - if isinstance(cell, Layer): - num_param = len(cell.weights) - weights = weights[:num_param] - for sw, w in zip(cell.weights, weights): - tuples.append((sw, w)) - weights = weights[num_param:] - K.batch_set_value(tuples) - - @property - def losses(self): - losses = [] - for cell in self.cells: - if isinstance(cell, Layer): - cell_losses = cell.losses - losses += cell_losses - return losses - - def get_losses_for(self, inputs=None): - losses = [] - for cell in self.cells: - if isinstance(cell, Layer): - cell_losses = cell.get_losses_for(inputs) - losses += cell_losses - return losses - - -class RNN(Layer): - """Base class for recurrent layers. + ```python + # as the first layer in a Sequential model + model = Sequential() + model.add(LSTM(32, input_shape=(10, 64))) + # now model.output_shape == (None, 32) + # note: `None` is the batch dimension. + + # for subsequent layers, no need to specify the input size: + model.add(LSTM(16)) + + # to stack recurrent layers, you must use return_sequences=True + # on any recurrent layer that feeds into another recurrent layer. + # note that you only need to specify the input size on the first layer. + model = Sequential() + model.add(LSTM(64, input_dim=64, input_length=10, return_sequences=True)) + model.add(LSTM(32, return_sequences=True)) + model.add(LSTM(10)) + ``` Arguments: - cell: A RNN cell instance. A RNN cell is a class that has: - - a `call(input_at_t, states_at_t)` method, returning - `(output_at_t, states_at_t_plus_1)`. The call method of the - cell can also take the optional argument `constants`, see - section "Note on passing external constants" below. - - a `state_size` attribute. This can be a single integer - (single state) in which case it is - the size of the recurrent state - (which should be the same as the size of the cell output). - This can also be a list/tuple of integers - (one size per state). In this case, the first entry - (`state_size[0]`) should be the same as - the size of the cell output. - It is also possible for `cell` to be a list of RNN cell instances, - in which cases the cells get stacked on after the other in the RNN, - implementing an efficient stacked RNN. - return_sequences: Boolean. Whether to return the last output. + weights: list of Numpy arrays to set as initial weights. + The list should have 3 elements, of shapes: + `[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`. + return_sequences: Boolean. Whether to return the last output in the output sequence, or the full sequence. return_state: Boolean. Whether to return the last state in addition to the output. @@ -247,9 +137,21 @@ class RNN(Layer): Unrolling can speed-up a RNN, although it tends to be more memory-intensive. Unrolling is only suitable for short sequences. + implementation: one of {0, 1, or 2}. + If set to 0, the RNN will use + an implementation that uses fewer, larger matrix products, + thus running faster on CPU but consuming more memory. + If set to 1, the RNN will use more matrix products, + but smaller ones, thus running slower + (may actually be faster on GPU) while consuming less memory. + If set to 2 (LSTM/GRU only), + the RNN will combine the input gate, + the forget gate and the output gate into a single matrix, + enabling more time-efficient parallelization on the GPU. + Note: RNN dropout must be shared for all gates, + resulting in a slightly reduced regularization. input_dim: dimensionality of the input (integer). 
- This argument (or alternatively, - the keyword argument `input_shape`) + This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model. input_length: Length of input sequences, to be specified when it is constant. @@ -261,7 +163,7 @@ class RNN(Layer): at the level of the first layer (e.g. via the `input_shape` argument) - Input shape: + Input shape:s 3D tensor with shape `(batch_size, timesteps, input_dim)`, (Optional) 2D tensors with shape `(batch_size, output_dim)`. @@ -276,7 +178,7 @@ class RNN(Layer): # Masking This layer supports masking for input data with a variable number of timesteps. To introduce masks to your data, - use an [Embedding](embeddings.md) layer with the `mask_zero` parameter + use an `Embedding` layer with the `mask_zero` parameter set to `True`. # Note on using statefulness in RNNs @@ -310,128 +212,42 @@ class RNN(Layer): calling `reset_states` with the keyword argument `states`. The value of `states` should be a numpy array or list of numpy arrays representing the initial state of the RNN layer. - - # Note on passing external constants to RNNs - You can pass "external" constants to the cell using the `constants` - keyword argument of `RNN.__call__` (as well as `RNN.call`) method. This - requires that the `cell.call` method accepts the same keyword argument - `constants`. Such constants can be used to condition the cell - transformation on additional static inputs (not changing over time), - a.k.a. an attention mechanism. - - Examples: - - ```python - # First, let's define a RNN Cell, as a layer subclass. - - class MinimalRNNCell(keras.layers.Layer): - - def __init__(self, units, **kwargs): - self.units = units - self.state_size = units - super(MinimalRNNCell, self).__init__(**kwargs) - - def build(self, input_shape): - self.kernel = self.add_weight(shape=(input_shape[-1], self.units), - initializer='uniform', - name='kernel') - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - initializer='uniform', - name='recurrent_kernel') - self.built = True - - def call(self, inputs, states): - prev_output = states[0] - h = K.dot(inputs, self.kernel) - output = h + K.dot(prev_output, self.recurrent_kernel) - return output, [output] - - # Let's use this cell in a RNN layer: - - cell = MinimalRNNCell(32) - x = keras.Input((None, 5)) - layer = RNN(cell) - y = layer(x) - - # Here's how to use the cell to build a stacked RNN: - - cells = [MinimalRNNCell(32), MinimalRNNCell(64)] - x = keras.Input((None, 5)) - layer = RNN(cells) - y = layer(x) - ``` """ def __init__(self, - cell, return_sequences=False, return_state=False, go_backwards=False, stateful=False, unroll=False, - activity_regularizer=None, + implementation=0, **kwargs): - if isinstance(cell, (list, tuple)): - cell = StackedRNNCells(cell) - if not hasattr(cell, 'call'): - raise ValueError('`cell` should have a `call` method. 
' - 'The RNN was passed:', cell) - if not hasattr(cell, 'state_size'): - raise ValueError('The RNN cell should have ' - 'an attribute `state_size` ' - '(tuple of integers, ' - 'one integer per RNN state).') - super(RNN, self).__init__( - activity_regularizer=regularizers.get(activity_regularizer), **kwargs) - self.cell = cell + super(Recurrent, self).__init__(**kwargs) self.return_sequences = return_sequences self.return_state = return_state self.go_backwards = go_backwards self.stateful = stateful self.unroll = unroll - + self.implementation = implementation self.supports_masking = True self.input_spec = [InputSpec(ndim=3)] self.state_spec = None - self._states = None - self.constants_spec = None - self._num_constants = None - - @property - def states(self): - if self._states is None: - if isinstance(self.cell.state_size, int): - num_states = 1 - else: - num_states = len(self.cell.state_size) - return [None for _ in range(num_states)] - return self._states - - @states.setter - def states(self, states): - self._states = states + self.dropout = 0 + self.recurrent_dropout = 0 def _compute_output_shape(self, input_shape): if isinstance(input_shape, list): input_shape = input_shape[0] input_shape = tensor_shape.TensorShape(input_shape).as_list() - - if hasattr(self.cell.state_size, '__len__'): - output_dim = self.cell.state_size[0] - else: - output_dim = self.cell.state_size - if self.return_sequences: - output_shape = (input_shape[0], input_shape[1], output_dim) + output_shape = (input_shape[0], input_shape[1], self.units) else: - output_shape = (input_shape[0], output_dim) + output_shape = (input_shape[0], self.units) if self.return_state: - state_shape = [(input_shape[0], output_dim) for _ in self.states] - output_shape = [output_shape] + state_shape - else: - output_shape = output_shape + state_shape = [tensor_shape.TensorShape( + (input_shape[0], self.units)) for _ in self.states] + return [tensor_shape.TensorShape(output_shape)] + state_shape return tensor_shape.TensorShape(output_shape) def compute_mask(self, inputs, mask): @@ -441,123 +257,82 @@ class RNN(Layer): if self.return_state: state_mask = [None for _ in self.states] return [output_mask] + state_mask - else: - return output_mask - - def build(self, input_shape): - # Note input_shape will be list of shapes of initial states and - # constants if these are passed in __call__. 
- if self._num_constants is not None: - constants_shape = input_shape[-self._num_constants:] # pylint: disable=invalid-unary-operand-type - else: - constants_shape = None - - if isinstance(input_shape, list): - input_shape = input_shape[0] - input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list()) + return output_mask - batch_size = input_shape[0] if self.stateful else None - input_dim = input_shape[-1] - self.input_spec[0] = InputSpec(shape=(batch_size, None, input_dim)) - - # allow cell (if layer) to build before we set or validate state_spec - if isinstance(self.cell, Layer): - step_input_shape = (input_shape[0],) + input_shape[2:] - if constants_shape is not None: - self.cell.build([step_input_shape] + constants_shape) - else: - self.cell.build(step_input_shape) + def step(self, inputs, states): + raise NotImplementedError - # set or validate state_spec - if hasattr(self.cell.state_size, '__len__'): - state_size = list(self.cell.state_size) - else: - state_size = [self.cell.state_size] - - if self.state_spec is not None: - # initial_state was passed in call, check compatibility - if [spec.shape[-1] for spec in self.state_spec] != state_size: - raise ValueError( - 'An initial_state was passed that is not compatible with ' - '`cell.state_size`. Received `state_spec`={}; ' - 'However `cell.state_size` is ' - '{}'.format(self.state_spec, self.cell.state_size)) - else: - self.state_spec = [InputSpec(shape=(None, dim)) for dim in state_size] - if self.stateful: - self.reset_states() + def get_constants(self, inputs, training=None): + return [] def get_initial_state(self, inputs): # build an all-zero tensor of shape (samples, output_dim) initial_state = K.zeros_like(inputs) # (samples, timesteps, input_dim) initial_state = K.sum(initial_state, axis=(1, 2)) # (samples,) initial_state = K.expand_dims(initial_state) # (samples, 1) - if hasattr(self.cell.state_size, '__len__'): - return [K.tile(initial_state, [1, dim]) for dim in self.cell.state_size] - else: - return [K.tile(initial_state, [1, self.cell.state_size])] + initial_state = K.tile(initial_state, [1, + self.units]) # (samples, output_dim) + initial_state = [initial_state for _ in range(len(self.states))] + return initial_state - def __call__(self, inputs, initial_state=None, constants=None, **kwargs): - inputs, initial_state, constants = self._standardize_args( - inputs, initial_state, constants) + def preprocess_input(self, inputs, training=None): + return inputs - if initial_state is None and constants is None: - return super(RNN, self).__call__(inputs, **kwargs) + def __call__(self, inputs, initial_state=None, **kwargs): + if (isinstance(inputs, (list, tuple)) and + len(inputs) > 1 + and initial_state is None): + initial_state = inputs[1:] + inputs = inputs[0] - # If any of `initial_state` or `constants` are specified and are Keras - # tensors, then add them to the inputs and temporarily modify the - # input_spec to include them. + # If `initial_state` is specified, + # and if it a Keras tensor, + # then add it to the inputs and temporarily + # modify the input spec to include the state. 
+ if initial_state is None: + return super(Recurrent, self).__call__(inputs, **kwargs) - additional_inputs = [] - additional_specs = [] - if initial_state is not None: - kwargs['initial_state'] = initial_state - additional_inputs += initial_state - self.state_spec = [ - InputSpec(shape=K.int_shape(state)) for state in initial_state - ] - additional_specs += self.state_spec - if constants is not None: - kwargs['constants'] = constants - additional_inputs += constants - self.constants_spec = [ - InputSpec(shape=K.int_shape(constant)) for constant in constants - ] - self._num_constants = len(constants) - additional_specs += self.constants_spec - # at this point additional_inputs cannot be empty - is_keras_tensor = hasattr(additional_inputs[0], '_keras_history') - for tensor in additional_inputs: + if not isinstance(initial_state, (list, tuple)): + initial_state = [initial_state] + + is_keras_tensor = hasattr(initial_state[0], '_keras_history') + for tensor in initial_state: if hasattr(tensor, '_keras_history') != is_keras_tensor: - raise ValueError('The initial state or constants of an RNN' - ' layer cannot be specified with a mix of' - ' Keras tensors and non-Keras tensors') + raise ValueError('The initial state of an RNN layer cannot be' + ' specified with a mix of Keras tensors and' + ' non-Keras tensors') if is_keras_tensor: - # Compute the full input spec, including state and constants - full_input = [inputs] + additional_inputs - full_input_spec = self.input_spec + additional_specs - # Perform the call with temporarily replaced input_spec - original_input_spec = self.input_spec - self.input_spec = full_input_spec - output = super(RNN, self).__call__(full_input, **kwargs) - self.input_spec = original_input_spec + # Compute the full input spec, including state + input_spec = self.input_spec + state_spec = self.state_spec + if not isinstance(input_spec, list): + input_spec = [input_spec] + if not isinstance(state_spec, list): + state_spec = [state_spec] + self.input_spec = input_spec + state_spec + + # Compute the full inputs, including state + inputs = [inputs] + list(initial_state) + + # Perform the call + output = super(Recurrent, self).__call__(inputs, **kwargs) + + # Restore original input spec + self.input_spec = input_spec return output else: - return super(RNN, self).__call__(inputs, **kwargs) - - def call(self, - inputs, - mask=None, - training=None, - initial_state=None, - constants=None): + kwargs['initial_state'] = initial_state + return super(Recurrent, self).__call__(inputs, **kwargs) + + def call(self, inputs, mask=None, training=None, initial_state=None): # input shape: `(samples, time (padded with zeros), input_dim)` # note that the .build() method of subclasses MUST define # self.input_spec and self.state_spec with complete input shapes. 
if isinstance(inputs, list): + initial_state = inputs[1:] inputs = inputs[0] - if initial_state is not None: + elif initial_state is not None: pass elif self.stateful: initial_state = self.states @@ -568,14 +343,13 @@ class RNN(Layer): mask = mask[0] if len(initial_state) != len(self.states): - raise ValueError( - 'Layer has ' + str(len(self.states)) + ' states but was passed ' + - str(len(initial_state)) + ' initial states.') + raise ValueError('Layer has ' + str(len(self.states)) + + ' states but was passed ' + str(len(initial_state)) + + ' initial states.') input_shape = K.int_shape(inputs) - timesteps = input_shape[1] - if self.unroll and timesteps in [None, 1]: + if self.unroll and input_shape[1] is None: raise ValueError('Cannot unroll a RNN if the ' - 'time dimension is undefined or equal to 1. \n' + 'time dimension is undefined. \n' '- If using a Sequential model, ' 'specify the time dimension by passing ' 'an `input_shape` or `batch_input_shape` ' @@ -585,31 +359,15 @@ class RNN(Layer): '- If using the functional API, specify ' 'the time dimension by passing a `shape` ' 'or `batch_shape` argument to your Input layer.') - - kwargs = {} - if has_arg(self.cell.call, 'training'): - kwargs['training'] = training - - if constants: - if not has_arg(self.cell.call, 'constants'): - raise ValueError('RNN cell does not support constants') - - def step(inputs, states): - constants = states[-self._num_constants:] # pylint: disable=invalid-unary-operand-type - states = states[:-self._num_constants] # pylint: disable=invalid-unary-operand-type - return self.cell.call(inputs, states, constants=constants, **kwargs) - else: - - def step(inputs, states): - return self.cell.call(inputs, states, **kwargs) - + constants = self.get_constants(inputs, training=None) + preprocessed_input = self.preprocess_input(inputs, training=None) last_output, outputs, states = K.rnn( - step, - inputs, + self.step, + preprocessed_input, initial_state, - constants=constants, go_backwards=self.go_backwards, mask=mask, + constants=constants, unroll=self.unroll) if self.stateful: updates = [] @@ -617,63 +375,21 @@ class RNN(Layer): updates.append((self.states[i], states[i])) self.add_update(updates, inputs) - if self.return_sequences: - output = outputs - else: - output = last_output - # Properly set learning phase - if getattr(last_output, '_uses_learning_phase', False): - output._uses_learning_phase = True + if 0 < self.dropout + self.recurrent_dropout: + last_output._uses_learning_phase = True + outputs._uses_learning_phase = True + + if not self.return_sequences: + outputs = last_output if self.return_state: if not isinstance(states, (list, tuple)): states = [states] else: states = list(states) - return [output] + states - else: - return output - - def _standardize_args(self, inputs, initial_state, constants): - """Standardize `__call__` arguments to a single list of tensor inputs. - - When running a model loaded from file, the input tensors - `initial_state` and `constants` can be passed to `RNN.__call__` as part - of `inputs` instead of by the dedicated keyword arguments. This method - makes sure the arguments are separated and that `initial_state` and - `constants` are lists of tensors (or None). 
- - Arguments: - inputs: tensor or list/tuple of tensors - initial_state: tensor or list of tensors or None - constants: tensor or list of tensors or None - - Returns: - inputs: tensor - initial_state: list of tensors or None - constants: list of tensors or None - """ - if isinstance(inputs, list): - assert initial_state is None and constants is None - if self._num_constants is not None: - constants = inputs[-self._num_constants:] # pylint: disable=invalid-unary-operand-type - inputs = inputs[:-self._num_constants] # pylint: disable=invalid-unary-operand-type - if len(inputs) > 1: - initial_state = inputs[1:] - inputs = inputs[0] - - def to_list_or_none(x): - if x is None or isinstance(x, list): - return x - if isinstance(x, tuple): - return list(x) - return [x] - - initial_state = to_list_or_none(initial_state) - constants = to_list_or_none(constants) - - return inputs, initial_state, constants + return [outputs] + states + return outputs def reset_states(self, states=None): if not self.stateful: @@ -692,19 +408,10 @@ class RNN(Layer): '`batch_shape` argument to your Input layer.') # initialize state if None if self.states[0] is None: - if hasattr(self.cell.state_size, '__len__'): - self.states = [ - K.zeros((batch_size, dim)) for dim in self.cell.state_size - ] - else: - self.states = [K.zeros((batch_size, self.cell.state_size))] + self.states = [K.zeros((batch_size, self.units)) for _ in self.states] elif states is None: - if hasattr(self.cell.state_size, '__len__'): - for state, dim in zip(self.states, self.cell.state_size): - K.set_value(state, np.zeros((batch_size, dim))) - else: - K.set_value(self.states[0], np.zeros((batch_size, - self.cell.state_size))) + for state in self.states: + K.set_value(state, np.zeros((batch_size, self.units))) else: if not isinstance(states, (list, tuple)): states = [states] @@ -714,16 +421,11 @@ class RNN(Layer): 'but it received ' + str(len(states)) + ' state values. Input received: ' + str(states)) for index, (value, state) in enumerate(zip(states, self.states)): - if hasattr(self.cell.state_size, '__len__'): - dim = self.cell.state_size[index] - else: - dim = self.cell.state_size - if value.shape != (batch_size, dim): - raise ValueError( - 'State ' + str(index) + ' is incompatible with layer ' + - self.name + ': expected shape=' + str( - (batch_size, dim)) + ', found shape=' + str(value.shape)) - # TODO(fchollet): consider batch calls to `set_value`. 
+ if value.shape != (batch_size, self.units): + raise ValueError('State ' + str(index) + + ' is incompatible with layer ' + self.name + + ': expected shape=' + str((batch_size, self.units)) + + ', found shape=' + str(value.shape)) K.set_value(state, value) def get_config(self): @@ -732,94 +434,51 @@ class RNN(Layer): 'return_state': self.return_state, 'go_backwards': self.go_backwards, 'stateful': self.stateful, - 'unroll': self.unroll - } - if self._num_constants is not None: - config['num_constants'] = self._num_constants - - cell_config = self.cell.get_config() - config['cell'] = { - 'class_name': self.cell.__class__.__name__, - 'config': cell_config + 'unroll': self.unroll, + 'implementation': self.implementation } - base_config = super(RNN, self).get_config() + base_config = super(Recurrent, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @classmethod - def from_config(cls, config, custom_objects=None): - from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer # pylint: disable=g-import-not-at-top - cell = deserialize_layer(config.pop('cell'), custom_objects=custom_objects) - num_constants = config.pop('num_constants', None) - layer = cls(cell, **config) - layer._num_constants = num_constants - return layer - - @property - def trainable_weights(self): - if isinstance(self.cell, Layer): - return self.cell.trainable_weights - return [] - - @property - def non_trainable_weights(self): - if isinstance(self.cell, Layer): - return self.cell.non_trainable_weights - return [] - @property - def losses(self): - if isinstance(self.cell, Layer): - return self.cell.losses - return [] - - def get_losses_for(self, inputs=None): - if isinstance(self.cell, Layer): - cell_losses = self.cell.get_losses_for(inputs) - return cell_losses + super(RNN, self).get_losses_for(inputs) - return super(RNN, self).get_losses_for(inputs) - - -class SimpleRNNCell(Layer): - """Cell class for SimpleRNN. +class SimpleRNN(Recurrent): + """Fully-connected RNN where the output is to be fed back to input. Arguments: units: Positive integer, dimensionality of the output space. - activation: Activation function to use - (see [activations](../activations.md)). + activation: Activation function to use. + If you don't specify anything, no activation is applied If you pass None, no activation is applied (ie. "linear" activation: `a(x) = x`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. - (see [initializers](../initializers.md)). + used for the linear transformation of the inputs.. recurrent_initializer: Initializer for the `recurrent_kernel` weights matrix, - used for the linear transformation of the recurrent state. - (see [initializers](../initializers.md)). - bias_initializer: Initializer for the bias vector - (see [initializers](../initializers.md)). + used for the linear transformation of the recurrent state.. + bias_initializer: Initializer for the bias vector. kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix - (see [regularizer](../regularizers.md)). + the `kernel` weights matrix. recurrent_regularizer: Regularizer function applied to - the `recurrent_kernel` weights matrix - (see [regularizer](../regularizers.md)). - bias_regularizer: Regularizer function applied to the bias vector - (see [regularizer](../regularizers.md)). + the `recurrent_kernel` weights matrix. 
+ bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation").. kernel_constraint: Constraint function applied to - the `kernel` weights matrix - (see [constraints](../constraints.md)). + the `kernel` weights matrix. recurrent_constraint: Constraint function applied to - the `recurrent_kernel` weights matrix - (see [constraints](../constraints.md)). - bias_constraint: Constraint function applied to the bias vector - (see [constraints](../constraints.md)). + the `recurrent_kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. dropout: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the inputs. recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the recurrent state. + + References: + - [A Theoretically Grounded Application of Dropout in Recurrent Neural + Networks](http://arxiv.org/abs/1512.05287) """ def __init__(self, @@ -832,13 +491,15 @@ class SimpleRNNCell(Layer): kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, + activity_regularizer=None, kernel_constraint=None, recurrent_constraint=None, bias_constraint=None, dropout=0., recurrent_dropout=0., **kwargs): - super(SimpleRNNCell, self).__init__(**kwargs) + super(SimpleRNN, self).__init__( + activity_regularizer=regularizers.get(activity_regularizer), **kwargs) self.units = units self.activation = activations.get(activation) self.use_bias = use_bias @@ -857,13 +518,23 @@ class SimpleRNNCell(Layer): self.dropout = min(1., max(0., dropout)) self.recurrent_dropout = min(1., max(0., recurrent_dropout)) - self.state_size = self.units - self._dropout_mask = None - self._recurrent_dropout_mask = None + self.state_spec = InputSpec(shape=(None, self.units)) def build(self, input_shape): + if isinstance(input_shape, list): + input_shape = input_shape[0] + input_shape = tensor_shape.TensorShape(input_shape).as_list() + + batch_size = input_shape[0] if self.stateful else None + self.input_dim = input_shape[2] + self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim)) + + self.states = [None] + if self.stateful: + self.reset_states() + self.kernel = self.add_weight( - shape=(input_shape[-1], self.units), + shape=(self.input_dim, self.units), name='kernel', initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, @@ -885,327 +556,146 @@ class SimpleRNNCell(Layer): self.bias = None self.built = True - def _generate_dropout_mask(self, inputs, training=None): - if 0 < self.dropout < 1: - ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1)) - - def dropped_inputs(): - return K.dropout(ones, self.dropout) - - self._dropout_mask = K.in_train_phase( - dropped_inputs, ones, training=training) + def preprocess_input(self, inputs, training=None): + if self.implementation > 0: + return inputs else: - self._dropout_mask = None + input_shape = inputs.get_shape().as_list() + input_dim = input_shape[2] + timesteps = input_shape[1] + return _time_distributed_dense( + inputs, + self.kernel, + self.bias, + self.dropout, + input_dim, + self.units, + timesteps, + training=training) - def _generate_recurrent_dropout_mask(self, inputs, training=None): - if 0 < self.recurrent_dropout < 1: - ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1))) - ones = K.tile(ones, (1, self.units)) - - def dropped_inputs(): - return K.dropout(ones, self.dropout) - - self._recurrent_dropout_mask 
= K.in_train_phase( - dropped_inputs, ones, training=training) + def step(self, inputs, states): + if self.implementation == 0: + h = inputs else: - self._recurrent_dropout_mask = None + if 0 < self.dropout < 1: + h = K.dot(inputs * states[1], self.kernel) + else: + h = K.dot(inputs, self.kernel) + if self.bias is not None: + h = K.bias_add(h, self.bias) - def call(self, inputs, states, training=None): prev_output = states[0] - dp_mask = self._dropout_mask - rec_dp_mask = self._recurrent_dropout_mask - - if dp_mask is not None: - h = K.dot(inputs * dp_mask, self.kernel) - else: - h = K.dot(inputs, self.kernel) - if self.bias is not None: - h = K.bias_add(h, self.bias) - - if rec_dp_mask is not None: - prev_output *= rec_dp_mask + if 0 < self.recurrent_dropout < 1: + prev_output *= states[2] output = h + K.dot(prev_output, self.recurrent_kernel) if self.activation is not None: output = self.activation(output) # Properly set learning phase on output tensor. if 0 < self.dropout + self.recurrent_dropout: - if training is None: - output._uses_learning_phase = True + output._uses_learning_phase = True return output, [output] + def get_constants(self, inputs, training=None): + constants = [] + if self.implementation != 0 and 0 < self.dropout < 1: + input_shape = K.int_shape(inputs) + input_dim = input_shape[-1] + ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1))) + ones = K.tile(ones, (1, int(input_dim))) -class SimpleRNN(RNN): - """Fully-connected RNN where the output is to be fed back to input. - - Arguments: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use - (see [activations](../activations.md)). - If you pass None, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. - (see [initializers](../initializers.md)). - recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, - used for the linear transformation of the recurrent state. - (see [initializers](../initializers.md)). - bias_initializer: Initializer for the bias vector - (see [initializers](../initializers.md)). - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix - (see [regularizer](../regularizers.md)). - recurrent_regularizer: Regularizer function applied to - the `recurrent_kernel` weights matrix - (see [regularizer](../regularizers.md)). - bias_regularizer: Regularizer function applied to the bias vector - (see [regularizer](../regularizers.md)). - activity_regularizer: Regularizer function applied to - the output of the layer (its "activation"). - (see [regularizer](../regularizers.md)). - kernel_constraint: Constraint function applied to - the `kernel` weights matrix - (see [constraints](../constraints.md)). - recurrent_constraint: Constraint function applied to - the `recurrent_kernel` weights matrix - (see [constraints](../constraints.md)). - bias_constraint: Constraint function applied to the bias vector - (see [constraints](../constraints.md)). - dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the inputs. - recurrent_dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the recurrent state. - return_sequences: Boolean. Whether to return the last output. - in the output sequence, or the full sequence. - return_state: Boolean. 
Whether to return the last state - in addition to the output. - go_backwards: Boolean (default False). - If True, process the input sequence backwards and return the - reversed sequence. - stateful: Boolean (default False). If True, the last state - for each sample at index i in a batch will be used as initial - state for the sample of index i in the following batch. - unroll: Boolean (default False). - If True, the network will be unrolled, - else a symbolic loop will be used. - Unrolling can speed-up a RNN, - although it tends to be more memory-intensive. - Unrolling is only suitable for short sequences. - """ - - def __init__(self, - units, - activation='tanh', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - unroll=False, - **kwargs): - if 'implementation' in kwargs: - kwargs.pop('implementation') - logging.warning('The `implementation` argument ' - 'in `SimpleRNN` has been deprecated. ' - 'Please remove it from your layer call.') - cell = SimpleRNNCell( - units, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout) - super(SimpleRNN, self).__init__( - cell, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - unroll=unroll, - activity_regularizer=regularizers.get(activity_regularizer), - **kwargs) - # self.activity_regularizer = regularizers.get(activity_regularizer) - - def call(self, inputs, mask=None, training=None, initial_state=None): - self.cell._generate_dropout_mask(inputs, training=training) - self.cell._generate_recurrent_dropout_mask(inputs, training=training) - return super(SimpleRNN, self).call( - inputs, mask=mask, training=training, initial_state=initial_state) - - @property - def units(self): - return self.cell.units - - @property - def activation(self): - return self.cell.activation - - @property - def use_bias(self): - return self.cell.use_bias - - @property - def kernel_initializer(self): - return self.cell.kernel_initializer - - @property - def recurrent_initializer(self): - return self.cell.recurrent_initializer - - @property - def bias_initializer(self): - return self.cell.bias_initializer - - @property - def kernel_regularizer(self): - return self.cell.kernel_regularizer - - @property - def recurrent_regularizer(self): - return self.cell.recurrent_regularizer - - @property - def bias_regularizer(self): - return self.cell.bias_regularizer - - @property - def kernel_constraint(self): - return self.cell.kernel_constraint + def dropped_inputs(): + return K.dropout(ones, self.dropout) - @property - def recurrent_constraint(self): - return self.cell.recurrent_constraint + dp_mask = K.in_train_phase(dropped_inputs, ones, training=training) + constants.append(dp_mask) + else: + 
constants.append(K.cast_to_floatx(1.)) - @property - def bias_constraint(self): - return self.cell.bias_constraint + if 0 < self.recurrent_dropout < 1: + ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1))) + ones = K.tile(ones, (1, self.units)) - @property - def dropout(self): - return self.cell.dropout + def dropped_inputs(): # pylint: disable=function-redefined + return K.dropout(ones, self.recurrent_dropout) - @property - def recurrent_dropout(self): - return self.cell.recurrent_dropout + rec_dp_mask = K.in_train_phase(dropped_inputs, ones, training=training) + constants.append(rec_dp_mask) + else: + constants.append(K.cast_to_floatx(1.)) + return constants def get_config(self): config = { - 'units': - self.units, - 'activation': - activations.serialize(self.activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), + 'units': self.units, + 'activation': activations.serialize(self.activation), + 'use_bias': self.use_bias, + 'kernel_initializer': initializers.serialize(self.kernel_initializer), 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), + 'bias_initializer': initializers.serialize(self.bias_initializer), + 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), + 'bias_regularizer': regularizers.serialize(self.bias_regularizer), 'activity_regularizer': regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': - constraints.serialize(self.kernel_constraint), + 'kernel_constraint': constraints.serialize(self.kernel_constraint), 'recurrent_constraint': constraints.serialize(self.recurrent_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint), - 'dropout': - self.dropout, - 'recurrent_dropout': - self.recurrent_dropout + 'bias_constraint': constraints.serialize(self.bias_constraint), + 'dropout': self.dropout, + 'recurrent_dropout': self.recurrent_dropout } base_config = super(SimpleRNN, self).get_config() - del base_config['cell'] return dict(list(base_config.items()) + list(config.items())) - @classmethod - def from_config(cls, config): - if 'implementation' in config: - config.pop('implementation') - return cls(**config) +class GRU(Recurrent): + """Gated Recurrent Unit - Cho et al. -class GRUCell(Layer): - """Cell class for the GRU layer. + 2014. Arguments: units: Positive integer, dimensionality of the output space. - activation: Activation function to use - (see [activations](../activations.md)). + activation: Activation function to use. If you pass None, no activation is applied (ie. "linear" activation: `a(x) = x`). recurrent_activation: Activation function to use - for the recurrent step - (see [activations](../activations.md)). + for the recurrent step. use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. - (see [initializers](../initializers.md)). + used for the linear transformation of the inputs.. recurrent_initializer: Initializer for the `recurrent_kernel` weights matrix, - used for the linear transformation of the recurrent state. - (see [initializers](../initializers.md)). 
- bias_initializer: Initializer for the bias vector - (see [initializers](../initializers.md)). + used for the linear transformation of the recurrent state.. + bias_initializer: Initializer for the bias vector. kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix - (see [regularizer](../regularizers.md)). + the `kernel` weights matrix. recurrent_regularizer: Regularizer function applied to - the `recurrent_kernel` weights matrix - (see [regularizer](../regularizers.md)). - bias_regularizer: Regularizer function applied to the bias vector - (see [regularizer](../regularizers.md)). + the `recurrent_kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation").. kernel_constraint: Constraint function applied to - the `kernel` weights matrix - (see [constraints](../constraints.md)). + the `kernel` weights matrix. recurrent_constraint: Constraint function applied to - the `recurrent_kernel` weights matrix - (see [constraints](../constraints.md)). - bias_constraint: Constraint function applied to the bias vector - (see [constraints](../constraints.md)). + the `recurrent_kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. dropout: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the inputs. recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the recurrent state. - implementation: Implementation mode, either 1 or 2. - Mode 1 will structure its operations as a larger number of - smaller dot products and additions, whereas mode 2 will - batch them into fewer, larger operations. These modes will - have different performance profiles on different hardware and - for different applications. 
+ + References: + - [On the Properties of Neural Machine Translation: Encoder-Decoder + Approaches](https://arxiv.org/abs/1409.1259) + - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence + Modeling](http://arxiv.org/abs/1412.3555v1) + - [A Theoretically Grounded Application of Dropout in Recurrent Neural + Networks](http://arxiv.org/abs/1512.05287) """ def __init__(self, @@ -1219,14 +709,15 @@ class GRUCell(Layer): kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, + activity_regularizer=None, kernel_constraint=None, recurrent_constraint=None, bias_constraint=None, dropout=0., recurrent_dropout=0., - implementation=1, **kwargs): - super(GRUCell, self).__init__(**kwargs) + super(GRU, self).__init__( + activity_regularizer=regularizers.get(activity_regularizer), **kwargs) self.units = units self.activation = activations.get(activation) self.recurrent_activation = activations.get(recurrent_activation) @@ -1246,15 +737,22 @@ class GRUCell(Layer): self.dropout = min(1., max(0., dropout)) self.recurrent_dropout = min(1., max(0., recurrent_dropout)) - self.implementation = implementation - self.state_size = self.units - self._dropout_mask = None - self._recurrent_dropout_mask = None + self.state_spec = InputSpec(shape=(None, self.units)) def build(self, input_shape): - input_dim = input_shape[-1] + if isinstance(input_shape, list): + input_shape = input_shape[0] + input_shape = tensor_shape.TensorShape(input_shape).as_list() + batch_size = input_shape[0] if self.stateful else None + self.input_dim = input_shape[2] + self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim)) + + self.states = [None] + if self.stateful: + self.reset_states() + self.kernel = self.add_weight( - shape=(input_dim, self.units * 3), + shape=(self.input_dim, self.units * 3), name='kernel', initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, @@ -1294,83 +792,89 @@ class GRUCell(Layer): self.bias_h = None self.built = True - def _generate_dropout_mask(self, inputs, training=None): - if 0 < self.dropout < 1: - ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1)) + def preprocess_input(self, inputs, training=None): + if self.implementation == 0: + input_shape = inputs.get_shape().as_list() + input_dim = input_shape[2] + timesteps = input_shape[1] + + x_z = _time_distributed_dense( + inputs, + self.kernel_z, + self.bias_z, + self.dropout, + input_dim, + self.units, + timesteps, + training=training) + x_r = _time_distributed_dense( + inputs, + self.kernel_r, + self.bias_r, + self.dropout, + input_dim, + self.units, + timesteps, + training=training) + x_h = _time_distributed_dense( + inputs, + self.kernel_h, + self.bias_h, + self.dropout, + input_dim, + self.units, + timesteps, + training=training) + return K.concatenate([x_z, x_r, x_h], axis=2) + else: + return inputs + + def get_constants(self, inputs, training=None): + constants = [] + if self.implementation != 0 and 0 < self.dropout < 1: + input_shape = K.int_shape(inputs) + input_dim = input_shape[-1] + ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1))) + ones = K.tile(ones, (1, int(input_dim))) def dropped_inputs(): return K.dropout(ones, self.dropout) - self._dropout_mask = [ + dp_mask = [ K.in_train_phase(dropped_inputs, ones, training=training) for _ in range(3) ] + constants.append(dp_mask) else: - self._dropout_mask = None + constants.append([K.cast_to_floatx(1.) 
for _ in range(3)]) - def _generate_recurrent_dropout_mask(self, inputs, training=None): if 0 < self.recurrent_dropout < 1: ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1))) ones = K.tile(ones, (1, self.units)) - def dropped_inputs(): - return K.dropout(ones, self.dropout) + def dropped_inputs(): # pylint: disable=function-redefined + return K.dropout(ones, self.recurrent_dropout) - self._recurrent_dropout_mask = [ + rec_dp_mask = [ K.in_train_phase(dropped_inputs, ones, training=training) for _ in range(3) ] + constants.append(rec_dp_mask) else: - self._recurrent_dropout_mask = None + constants.append([K.cast_to_floatx(1.) for _ in range(3)]) + return constants - def call(self, inputs, states, training=None): + def step(self, inputs, states): h_tm1 = states[0] # previous memory + dp_mask = states[1] # dropout matrices for recurrent units + rec_dp_mask = states[2] - # dropout matrices for input units - dp_mask = self._dropout_mask - # dropout matrices for recurrent units - rec_dp_mask = self._recurrent_dropout_mask - - if self.implementation == 1: - if 0. < self.dropout < 1.: - inputs_z = inputs * dp_mask[0] - inputs_r = inputs * dp_mask[1] - inputs_h = inputs * dp_mask[2] - else: - inputs_z = inputs - inputs_r = inputs - inputs_h = inputs - x_z = K.dot(inputs_z, self.kernel_z) - x_r = K.dot(inputs_r, self.kernel_r) - x_h = K.dot(inputs_h, self.kernel_h) - if self.use_bias: - x_z = K.bias_add(x_z, self.bias_z) - x_r = K.bias_add(x_r, self.bias_r) - x_h = K.bias_add(x_h, self.bias_h) - - if 0. < self.recurrent_dropout < 1.: - h_tm1_z = h_tm1 * rec_dp_mask[0] - h_tm1_r = h_tm1 * rec_dp_mask[1] - h_tm1_h = h_tm1 * rec_dp_mask[2] - else: - h_tm1_z = h_tm1 - h_tm1_r = h_tm1 - h_tm1_h = h_tm1 - z = self.recurrent_activation( - x_z + K.dot(h_tm1_z, self.recurrent_kernel_z)) - r = self.recurrent_activation( - x_r + K.dot(h_tm1_r, self.recurrent_kernel_r)) - - hh = self.activation(x_h + K.dot(r * h_tm1_h, self.recurrent_kernel_h)) - else: - if 0. < self.dropout < 1.: - inputs *= dp_mask[0] - matrix_x = K.dot(inputs, self.kernel) + if self.implementation == 2: + matrix_x = K.dot(inputs * dp_mask[0], self.kernel) if self.use_bias: matrix_x = K.bias_add(matrix_x, self.bias) - if 0. 
< self.recurrent_dropout < 1.: - h_tm1 *= rec_dp_mask[0] - matrix_inner = K.dot(h_tm1, self.recurrent_kernel[:, :2 * self.units]) + matrix_inner = K.dot(h_tm1 * rec_dp_mask[0], + self.recurrent_kernel[:, :2 * self.units]) x_z = matrix_x[:, :self.units] x_r = matrix_x[:, self.units:2 * self.units] @@ -1381,323 +885,116 @@ class GRUCell(Layer): r = self.recurrent_activation(x_r + recurrent_r) x_h = matrix_x[:, 2 * self.units:] - recurrent_h = K.dot(r * h_tm1, self.recurrent_kernel[:, 2 * self.units:]) + recurrent_h = K.dot(r * h_tm1 * rec_dp_mask[0], + self.recurrent_kernel[:, 2 * self.units:]) hh = self.activation(x_h + recurrent_h) + else: + if self.implementation == 0: + x_z = inputs[:, :self.units] + x_r = inputs[:, self.units:2 * self.units] + x_h = inputs[:, 2 * self.units:] + elif self.implementation == 1: + x_z = K.dot(inputs * dp_mask[0], self.kernel_z) + x_r = K.dot(inputs * dp_mask[1], self.kernel_r) + x_h = K.dot(inputs * dp_mask[2], self.kernel_h) + if self.use_bias: + x_z = K.bias_add(x_z, self.bias_z) + x_r = K.bias_add(x_r, self.bias_r) + x_h = K.bias_add(x_h, self.bias_h) + else: + raise ValueError('Unknown `implementation` mode.') + z = self.recurrent_activation(x_z + K.dot(h_tm1 * rec_dp_mask[0], + self.recurrent_kernel_z)) + r = self.recurrent_activation(x_r + K.dot(h_tm1 * rec_dp_mask[1], + self.recurrent_kernel_r)) + + hh = self.activation(x_h + K.dot(r * h_tm1 * rec_dp_mask[2], + self.recurrent_kernel_h)) h = z * h_tm1 + (1 - z) * hh if 0 < self.dropout + self.recurrent_dropout: - if training is None: - h._uses_learning_phase = True + h._uses_learning_phase = True return h, [h] - -class GRU(RNN): - # pylint: disable=line-too-long - """Gated Recurrent Unit - Cho et al. - - 2014. - - Arguments: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use - (see [activations](../activations.md)). - If you pass None, no activation is applied - (ie. "linear" activation: `a(x) = x`). - recurrent_activation: Activation function to use - for the recurrent step - (see [activations](../activations.md)). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. - (see [initializers](../initializers.md)). - recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, - used for the linear transformation of the recurrent state. - (see [initializers](../initializers.md)). - bias_initializer: Initializer for the bias vector - (see [initializers](../initializers.md)). - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix - (see [regularizer](../regularizers.md)). - recurrent_regularizer: Regularizer function applied to - the `recurrent_kernel` weights matrix - (see [regularizer](../regularizers.md)). - bias_regularizer: Regularizer function applied to the bias vector - (see [regularizer](../regularizers.md)). - activity_regularizer: Regularizer function applied to - the output of the layer (its "activation"). - (see [regularizer](../regularizers.md)). - kernel_constraint: Constraint function applied to - the `kernel` weights matrix - (see [constraints](../constraints.md)). - recurrent_constraint: Constraint function applied to - the `recurrent_kernel` weights matrix - (see [constraints](../constraints.md)). - bias_constraint: Constraint function applied to the bias vector - (see [constraints](../constraints.md)). - dropout: Float between 0 and 1. 
- Fraction of the units to drop for - the linear transformation of the inputs. - recurrent_dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the recurrent state. - implementation: Implementation mode, either 1 or 2. - Mode 1 will structure its operations as a larger number of - smaller dot products and additions, whereas mode 2 will - batch them into fewer, larger operations. These modes will - have different performance profiles on different hardware and - for different applications. - return_sequences: Boolean. Whether to return the last output. - in the output sequence, or the full sequence. - return_state: Boolean. Whether to return the last state - in addition to the output. - go_backwards: Boolean (default False). - If True, process the input sequence backwards and return the - reversed sequence. - stateful: Boolean (default False). If True, the last state - for each sample at index i in a batch will be used as initial - state for the sample of index i in the following batch. - unroll: Boolean (default False). - If True, the network will be unrolled, - else a symbolic loop will be used. - Unrolling can speed-up a RNN, - although it tends to be more memory-intensive. - Unrolling is only suitable for short sequences. - - References: - - [On the Properties of Neural Machine Translation: Encoder-Decoder Approaches](https://arxiv.org/abs/1409.1259) - - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling](http://arxiv.org/abs/1412.3555v1) - - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287) - """ - # pylint: enable=line-too-long - - def __init__(self, - units, - activation='tanh', - recurrent_activation='hard_sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - implementation=1, - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - unroll=False, - **kwargs): - if implementation == 0: - logging.warning('`implementation=0` has been deprecated, ' - 'and now defaults to `implementation=1`.' 
- 'Please update your layer call.') - cell = GRUCell( - units, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - implementation=implementation) - super(GRU, self).__init__( - cell, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - unroll=unroll, - **kwargs) - self.activity_regularizer = regularizers.get(activity_regularizer) - - def call(self, inputs, mask=None, training=None, initial_state=None): - self.cell._generate_dropout_mask(inputs, training=training) - self.cell._generate_recurrent_dropout_mask(inputs, training=training) - return super(GRU, self).call( - inputs, mask=mask, training=training, initial_state=initial_state) - - @property - def units(self): - return self.cell.units - - @property - def activation(self): - return self.cell.activation - - @property - def recurrent_activation(self): - return self.cell.recurrent_activation - - @property - def use_bias(self): - return self.cell.use_bias - - @property - def kernel_initializer(self): - return self.cell.kernel_initializer - - @property - def recurrent_initializer(self): - return self.cell.recurrent_initializer - - @property - def bias_initializer(self): - return self.cell.bias_initializer - - @property - def kernel_regularizer(self): - return self.cell.kernel_regularizer - - @property - def recurrent_regularizer(self): - return self.cell.recurrent_regularizer - - @property - def bias_regularizer(self): - return self.cell.bias_regularizer - - @property - def kernel_constraint(self): - return self.cell.kernel_constraint - - @property - def recurrent_constraint(self): - return self.cell.recurrent_constraint - - @property - def bias_constraint(self): - return self.cell.bias_constraint - - @property - def dropout(self): - return self.cell.dropout - - @property - def recurrent_dropout(self): - return self.cell.recurrent_dropout - - @property - def implementation(self): - return self.cell.implementation - def get_config(self): config = { - 'units': - self.units, - 'activation': - activations.serialize(self.activation), + 'units': self.units, + 'activation': activations.serialize(self.activation), 'recurrent_activation': activations.serialize(self.recurrent_activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), + 'use_bias': self.use_bias, + 'kernel_initializer': initializers.serialize(self.kernel_initializer), 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), + 'bias_initializer': initializers.serialize(self.bias_initializer), + 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), + 'bias_regularizer': regularizers.serialize(self.bias_regularizer), 'activity_regularizer': 
regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint':
- constraints.serialize(self.kernel_constraint),
+ 'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint':
constraints.serialize(self.recurrent_constraint),
- 'bias_constraint':
- constraints.serialize(self.bias_constraint),
- 'dropout':
- self.dropout,
- 'recurrent_dropout':
- self.recurrent_dropout,
- 'implementation':
- self.implementation
+ 'bias_constraint': constraints.serialize(self.bias_constraint),
+ 'dropout': self.dropout,
+ 'recurrent_dropout': self.recurrent_dropout
}
base_config = super(GRU, self).get_config()
- del base_config['cell']
return dict(list(base_config.items()) + list(config.items()))
- @classmethod
- def from_config(cls, config):
- if 'implementation' in config and config['implementation'] == 0:
- config['implementation'] = 1
- return cls(**config)
+class LSTM(Recurrent):
+ """Long Short-Term Memory unit - Hochreiter 1997.
-class LSTMCell(Layer):
- """Cell class for the LSTM layer.
+ For a step-by-step description of the algorithm, see
+ [this tutorial](http://deeplearning.net/tutorial/lstm.html).
Arguments:
units: Positive integer, dimensionality of the output space.
- activation: Activation function to use
- (see [activations](../activations.md)).
+ activation: Activation function to use.
If you pass None, no activation is applied
(i.e. "linear" activation: `a(x) = x`).
recurrent_activation: Activation function to use
- for the recurrent step
- (see [activations](../activations.md)).
+ for the recurrent step.
use_bias: Boolean, whether the layer uses a bias vector.
kernel_initializer: Initializer for the `kernel` weights matrix,
- used for the linear transformation of the inputs.
- (see [initializers](../initializers.md)).
+ used for the linear transformation of the inputs.
recurrent_initializer: Initializer for the `recurrent_kernel`
weights matrix,
- used for the linear transformation of the recurrent state.
- (see [initializers](../initializers.md)).
- bias_initializer: Initializer for the bias vector
- (see [initializers](../initializers.md)).
+ used for the linear transformation of the recurrent state.
+ bias_initializer: Initializer for the bias vector.
unit_forget_bias: Boolean.
If True, add 1 to the bias of the forget gate at initialization.
Setting it to true will also force `bias_initializer="zeros"`.
This is recommended in [Jozefowicz et
al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
kernel_regularizer: Regularizer function applied to
- the `kernel` weights matrix
- (see [regularizer](../regularizers.md)).
+ the `kernel` weights matrix.
recurrent_regularizer: Regularizer function applied to
- the `recurrent_kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- bias_regularizer: Regularizer function applied to the bias vector
- (see [regularizer](../regularizers.md)).
+ the `recurrent_kernel` weights matrix.
+ bias_regularizer: Regularizer function applied to the bias vector.
+ activity_regularizer: Regularizer function applied to
+ the output of the layer (its "activation").
kernel_constraint: Constraint function applied to
- the `kernel` weights matrix
- (see [constraints](../constraints.md)).
+ the `kernel` weights matrix.
recurrent_constraint: Constraint function applied to
- the `recurrent_kernel` weights matrix
- (see [constraints](../constraints.md)).
+ the `recurrent_kernel` weights matrix.
+ bias_constraint: Constraint function applied to the bias vector.
dropout: Float between 0 and 1.
Fraction of the units to drop for
the linear transformation of the inputs.
recurrent_dropout: Float between 0 and 1.
Fraction of the units to drop for
the linear transformation of the recurrent state.
- implementation: Implementation mode, either 1 or 2.
- Mode 1 will structure its operations as a larger number of
- smaller dot products and additions, whereas mode 2 will
- batch them into fewer, larger operations. These modes will
- have different performance profiles on different hardware and
- for different applications.
+
+ References:
+ - [Long short-term
+ memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
+ (original 1997 paper)
+ - [Supervised sequence labeling with recurrent neural
+ networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
+ - [A Theoretically Grounded Application of Dropout in Recurrent Neural
+ Networks](http://arxiv.org/abs/1512.05287)
"""
def __init__(self,
@@ -1712,14 +1009,15 @@ class LSTMCell(Layer):
kernel_regularizer=None,
recurrent_regularizer=None,
bias_regularizer=None,
+ activity_regularizer=None,
kernel_constraint=None,
recurrent_constraint=None,
bias_constraint=None,
dropout=0.,
recurrent_dropout=0.,
- implementation=1,
**kwargs):
- super(LSTMCell, self).__init__(**kwargs)
+ super(LSTM, self).__init__(
+ activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
self.units = units
self.activation = activations.get(activation)
self.recurrent_activation = activations.get(recurrent_activation)
@@ -1740,15 +1038,25 @@ class LSTMCell(Layer):
self.dropout = min(1., max(0., dropout))
self.recurrent_dropout = min(1., max(0., recurrent_dropout))
- self.implementation = implementation
- self.state_size = (self.units, self.units)
- self._dropout_mask = None
- self._recurrent_dropout_mask = None
+ self.state_spec = [
+ InputSpec(shape=(None, self.units)),
+ InputSpec(shape=(None, self.units))
+ ]
def build(self, input_shape):
- input_dim = input_shape[-1]
+ if isinstance(input_shape, list):
+ input_shape = input_shape[0]
+ input_shape = tensor_shape.TensorShape(input_shape).as_list()
+ batch_size = input_shape[0] if self.stateful else None
+ self.input_dim = input_shape[2]
+ self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim))
+
+ self.states = [None, None]
+ if self.stateful:
+ self.reset_states()
+
self.kernel = self.add_weight(
- shape=(input_dim, self.units * 4),
+ shape=(self.input_dim, self.units * 4),
name='kernel',
initializer=self.kernel_initializer,
regularizer=self.kernel_regularizer,
@@ -1804,90 +1112,96 @@ class LSTMCell(Layer):
self.bias_o = None
self.built = True
- def _generate_dropout_mask(self, inputs, training=None):
- if 0 < self.dropout < 1:
- ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
+ def preprocess_input(self, inputs, training=None):
+ if self.implementation == 0:
+ input_shape = inputs.get_shape().as_list()
+ input_dim = input_shape[2]
+ timesteps = input_shape[1]
+
+ x_i = _time_distributed_dense(
+ inputs,
+ self.kernel_i,
+ self.bias_i,
+ self.dropout,
+ input_dim,
+ self.units,
+ timesteps,
+ training=training)
+ x_f = _time_distributed_dense(
+ inputs,
+ self.kernel_f,
+ self.bias_f,
+ self.dropout,
+ input_dim,
+ self.units,
+ timesteps,
+ training=training)
+ x_c = _time_distributed_dense(
+ inputs,
+ self.kernel_c,
+ self.bias_c,
+ self.dropout,
+ input_dim,
+ self.units,
+ timesteps,
+ training=training)
+ x_o = _time_distributed_dense(
+ inputs,
+ self.kernel_o,
self.bias_o, + self.dropout, + input_dim, + self.units, + timesteps, + training=training) + return K.concatenate([x_i, x_f, x_c, x_o], axis=2) + else: + return inputs + + def get_constants(self, inputs, training=None): + constants = [] + if self.implementation != 0 and 0 < self.dropout < 1: + input_shape = K.int_shape(inputs) + input_dim = input_shape[-1] + ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1))) + ones = K.tile(ones, (1, int(input_dim))) def dropped_inputs(): return K.dropout(ones, self.dropout) - self._dropout_mask = [ + dp_mask = [ K.in_train_phase(dropped_inputs, ones, training=training) for _ in range(4) ] + constants.append(dp_mask) else: - self._dropout_mask = None + constants.append([K.cast_to_floatx(1.) for _ in range(4)]) - def _generate_recurrent_dropout_mask(self, inputs, training=None): if 0 < self.recurrent_dropout < 1: ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1))) ones = K.tile(ones, (1, self.units)) - def dropped_inputs(): - return K.dropout(ones, self.dropout) + def dropped_inputs(): # pylint: disable=function-redefined + return K.dropout(ones, self.recurrent_dropout) - self._recurrent_dropout_mask = [ + rec_dp_mask = [ K.in_train_phase(dropped_inputs, ones, training=training) for _ in range(4) ] + constants.append(rec_dp_mask) else: - self._recurrent_dropout_mask = None - - def call(self, inputs, states, training=None): - # dropout matrices for input units - dp_mask = self._dropout_mask - # dropout matrices for recurrent units - rec_dp_mask = self._recurrent_dropout_mask - - h_tm1 = states[0] # previous memory state - c_tm1 = states[1] # previous carry state - - if self.implementation == 1: - if 0 < self.dropout < 1.: - inputs_i = inputs * dp_mask[0] - inputs_f = inputs * dp_mask[1] - inputs_c = inputs * dp_mask[2] - inputs_o = inputs * dp_mask[3] - else: - inputs_i = inputs - inputs_f = inputs - inputs_c = inputs - inputs_o = inputs - x_i = K.dot(inputs_i, self.kernel_i) - x_f = K.dot(inputs_f, self.kernel_f) - x_c = K.dot(inputs_c, self.kernel_c) - x_o = K.dot(inputs_o, self.kernel_o) - if self.use_bias: - x_i = K.bias_add(x_i, self.bias_i) - x_f = K.bias_add(x_f, self.bias_f) - x_c = K.bias_add(x_c, self.bias_c) - x_o = K.bias_add(x_o, self.bias_o) - - if 0 < self.recurrent_dropout < 1.: - h_tm1_i = h_tm1 * rec_dp_mask[0] - h_tm1_f = h_tm1 * rec_dp_mask[1] - h_tm1_c = h_tm1 * rec_dp_mask[2] - h_tm1_o = h_tm1 * rec_dp_mask[3] - else: - h_tm1_i = h_tm1 - h_tm1_f = h_tm1 - h_tm1_c = h_tm1 - h_tm1_o = h_tm1 - i = self.recurrent_activation( - x_i + K.dot(h_tm1_i, self.recurrent_kernel_i)) - f = self.recurrent_activation( - x_f + K.dot(h_tm1_f, self.recurrent_kernel_f)) - c = f * c_tm1 + i * self.activation( - x_c + K.dot(h_tm1_c, self.recurrent_kernel_c)) - o = self.recurrent_activation( - x_o + K.dot(h_tm1_o, self.recurrent_kernel_o)) - else: - if 0. < self.dropout < 1.: - inputs *= dp_mask[0] - z = K.dot(inputs, self.kernel) - if 0. < self.recurrent_dropout < 1.: - h_tm1 *= rec_dp_mask[0] - z += K.dot(h_tm1, self.recurrent_kernel) + constants.append([K.cast_to_floatx(1.) 
for _ in range(4)])
+ return constants
+
+ def step(self, inputs, states):
+ h_tm1 = states[0]
+ c_tm1 = states[1]
+ dp_mask = states[2]
+ rec_dp_mask = states[3]
+
+ if self.implementation == 2:
+ z = K.dot(inputs * dp_mask[0], self.kernel)
+ z += K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel)
if self.use_bias:
z = K.bias_add(z, self.bias)
@@ -1900,606 +1214,57 @@ class LSTMCell(Layer):
f = self.recurrent_activation(z1)
c = f * c_tm1 + i * self.activation(z2)
o = self.recurrent_activation(z3)
+ else:
+ if self.implementation == 0:
+ x_i = inputs[:, :self.units]
+ x_f = inputs[:, self.units:2 * self.units]
+ x_c = inputs[:, 2 * self.units:3 * self.units]
+ x_o = inputs[:, 3 * self.units:]
+ elif self.implementation == 1:
+ x_i = K.dot(inputs * dp_mask[0], self.kernel_i) + self.bias_i
+ x_f = K.dot(inputs * dp_mask[1], self.kernel_f) + self.bias_f
+ x_c = K.dot(inputs * dp_mask[2], self.kernel_c) + self.bias_c
+ x_o = K.dot(inputs * dp_mask[3], self.kernel_o) + self.bias_o
+ else:
+ raise ValueError('Unknown `implementation` mode.')
+ i = self.recurrent_activation(x_i + K.dot(h_tm1 * rec_dp_mask[0],
+ self.recurrent_kernel_i))
+ f = self.recurrent_activation(x_f + K.dot(h_tm1 * rec_dp_mask[1],
+ self.recurrent_kernel_f))
+ c = f * c_tm1 + i * self.activation(
+ x_c + K.dot(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c))
+ o = self.recurrent_activation(x_o + K.dot(h_tm1 * rec_dp_mask[3],
+ self.recurrent_kernel_o))
h = o * self.activation(c)
if 0 < self.dropout + self.recurrent_dropout:
- if training is None:
- h._uses_learning_phase = True
+ h._uses_learning_phase = True
return h, [h, c]
-
-class LSTM(RNN):
- # pylint: disable=line-too-long
- """Long Short-Term Memory layer - Hochreiter 1997.
-
- Arguments:
- units: Positive integer, dimensionality of the output space.
- activation: Activation function to use
- (see [activations](../activations.md)).
- If you pass None, no activation is applied
- (i.e. "linear" activation: `a(x) = x`).
- recurrent_activation: Activation function to use
- for the recurrent step
- (see [activations](../activations.md)).
- use_bias: Boolean, whether the layer uses a bias vector.
- kernel_initializer: Initializer for the `kernel` weights matrix,
- used for the linear transformation of the inputs.
- (see [initializers](../initializers.md)).
- recurrent_initializer: Initializer for the `recurrent_kernel`
- weights matrix,
- used for the linear transformation of the recurrent state.
- (see [initializers](../initializers.md)).
- bias_initializer: Initializer for the bias vector
- (see [initializers](../initializers.md)).
- unit_forget_bias: Boolean.
- If True, add 1 to the bias of the forget gate at initialization.
- Setting it to true will also force `bias_initializer="zeros"`.
- This is recommended in [Jozefowicz et
- al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
- kernel_regularizer: Regularizer function applied to
- the `kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- recurrent_regularizer: Regularizer function applied to
- the `recurrent_kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- bias_regularizer: Regularizer function applied to the bias vector
- (see [regularizer](../regularizers.md)).
- activity_regularizer: Regularizer function applied to
- the output of the layer (its "activation").
- (see [regularizer](../regularizers.md)).
- kernel_constraint: Constraint function applied to
- the `kernel` weights matrix
- (see [constraints](../constraints.md)).
- recurrent_constraint: Constraint function applied to
- the `recurrent_kernel` weights matrix
- (see [constraints](../constraints.md)).
- bias_constraint: Constraint function applied to the bias vector
- (see [constraints](../constraints.md)).
- dropout: Float between 0 and 1.
- Fraction of the units to drop for
- the linear transformation of the inputs.
- recurrent_dropout: Float between 0 and 1.
- Fraction of the units to drop for
- the linear transformation of the recurrent state.
- implementation: Implementation mode, either 1 or 2.
- Mode 1 will structure its operations as a larger number of
- smaller dot products and additions, whereas mode 2 will
- batch them into fewer, larger operations. These modes will
- have different performance profiles on different hardware and
- for different applications.
- return_sequences: Boolean. Whether to return the last output
- in the output sequence, or the full sequence.
- return_state: Boolean. Whether to return the last state
- in addition to the output.
- go_backwards: Boolean (default False).
- If True, process the input sequence backwards and return the
- reversed sequence.
- stateful: Boolean (default False). If True, the last state
- for each sample at index i in a batch will be used as initial
- state for the sample of index i in the following batch.
- unroll: Boolean (default False).
- If True, the network will be unrolled,
- else a symbolic loop will be used.
- Unrolling can speed up an RNN,
- although it tends to be more memory-intensive.
- Unrolling is only suitable for short sequences.
-
- References:
- - [Long short-term memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
- - [Learning to forget: Continual prediction with LSTM](http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015)
- - [Supervised sequence labeling with recurrent neural networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
- - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
- """
- # pylint: enable=line-too-long
-
- def __init__(self,
- units,
- activation='tanh',
- recurrent_activation='hard_sigmoid',
- use_bias=True,
- kernel_initializer='glorot_uniform',
- recurrent_initializer='orthogonal',
- bias_initializer='zeros',
- unit_forget_bias=True,
- kernel_regularizer=None,
- recurrent_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- kernel_constraint=None,
- recurrent_constraint=None,
- bias_constraint=None,
- dropout=0.,
- recurrent_dropout=0.,
- implementation=1,
- return_sequences=False,
- return_state=False,
- go_backwards=False,
- stateful=False,
- unroll=False,
- **kwargs):
- if implementation == 0:
- logging.warning('`implementation=0` has been deprecated, '
- 'and now defaults to `implementation=1`. '
- 'Please update your layer call.') - cell = LSTMCell( - units, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - unit_forget_bias=unit_forget_bias, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - implementation=implementation) - super(LSTM, self).__init__( - cell, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - unroll=unroll, - **kwargs) - self.activity_regularizer = regularizers.get(activity_regularizer) - - def call(self, inputs, mask=None, training=None, initial_state=None): - self.cell._generate_dropout_mask(inputs, training=training) - self.cell._generate_recurrent_dropout_mask(inputs, training=training) - return super(LSTM, self).call( - inputs, mask=mask, training=training, initial_state=initial_state) - - @property - def units(self): - return self.cell.units - - @property - def activation(self): - return self.cell.activation - - @property - def recurrent_activation(self): - return self.cell.recurrent_activation - - @property - def use_bias(self): - return self.cell.use_bias - - @property - def kernel_initializer(self): - return self.cell.kernel_initializer - - @property - def recurrent_initializer(self): - return self.cell.recurrent_initializer - - @property - def bias_initializer(self): - return self.cell.bias_initializer - - @property - def unit_forget_bias(self): - return self.cell.unit_forget_bias - - @property - def kernel_regularizer(self): - return self.cell.kernel_regularizer - - @property - def recurrent_regularizer(self): - return self.cell.recurrent_regularizer - - @property - def bias_regularizer(self): - return self.cell.bias_regularizer - - @property - def kernel_constraint(self): - return self.cell.kernel_constraint - - @property - def recurrent_constraint(self): - return self.cell.recurrent_constraint - - @property - def bias_constraint(self): - return self.cell.bias_constraint - - @property - def dropout(self): - return self.cell.dropout - - @property - def recurrent_dropout(self): - return self.cell.recurrent_dropout - - @property - def implementation(self): - return self.cell.implementation - def get_config(self): config = { - 'units': - self.units, - 'activation': - activations.serialize(self.activation), + 'units': self.units, + 'activation': activations.serialize(self.activation), 'recurrent_activation': activations.serialize(self.recurrent_activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), + 'use_bias': self.use_bias, + 'kernel_initializer': initializers.serialize(self.kernel_initializer), 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'unit_forget_bias': - self.unit_forget_bias, - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), + 'bias_initializer': initializers.serialize(self.bias_initializer), + 'unit_forget_bias': self.unit_forget_bias, + 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), 'recurrent_regularizer': 
regularizers.serialize(self.recurrent_regularizer),
- 'bias_regularizer':
- regularizers.serialize(self.bias_regularizer),
+ 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer':
regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint':
- constraints.serialize(self.kernel_constraint),
+ 'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint':
constraints.serialize(self.recurrent_constraint),
- 'bias_constraint':
- constraints.serialize(self.bias_constraint),
- 'dropout':
- self.dropout,
- 'recurrent_dropout':
- self.recurrent_dropout,
- 'implementation':
- self.implementation
+ 'bias_constraint': constraints.serialize(self.bias_constraint),
+ 'dropout': self.dropout,
+ 'recurrent_dropout': self.recurrent_dropout
}
base_config = super(LSTM, self).get_config()
- del base_config['cell']
- return dict(list(base_config.items()) + list(config.items()))
-
- @classmethod
- def from_config(cls, config):
- if 'implementation' in config and config['implementation'] == 0:
- config['implementation'] = 1
- return cls(**config)
-
-
-class Recurrent(Layer):
- """Deprecated abstract base class for recurrent layers.
-
- It still exists because it is leveraged by the convolutional-recurrent layers.
- It will be removed entirely in the future.
- It was never part of the public API.
- Do not use.
-
- Arguments:
- weights: list of Numpy arrays to set as initial weights.
- The list should have 3 elements, of shapes:
- `[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`.
- return_sequences: Boolean. Whether to return the last output
- in the output sequence, or the full sequence.
- return_state: Boolean. Whether to return the last state
- in addition to the output.
- go_backwards: Boolean (default False).
- If True, process the input sequence backwards and return the
- reversed sequence.
- stateful: Boolean (default False). If True, the last state
- for each sample at index i in a batch will be used as initial
- state for the sample of index i in the following batch.
- unroll: Boolean (default False).
- If True, the network will be unrolled,
- else a symbolic loop will be used.
- Unrolling can speed up an RNN,
- although it tends to be more memory-intensive.
- Unrolling is only suitable for short sequences.
- implementation: one of {0, 1, or 2}.
- If set to 0, the RNN will use
- an implementation that uses fewer, larger matrix products,
- thus running faster on CPU but consuming more memory.
- If set to 1, the RNN will use more matrix products,
- but smaller ones, thus running slower
- (may actually be faster on GPU) while consuming less memory.
- If set to 2 (LSTM/GRU only),
- the RNN will combine the input gate,
- the forget gate and the output gate into a single matrix,
- enabling more time-efficient parallelization on the GPU
- (see the standalone sketch after the end of this diff).
- Note: RNN dropout must be shared for all gates,
- resulting in a slightly reduced regularization.
- input_dim: dimensionality of the input (integer).
- This argument (or alternatively, the keyword argument `input_shape`)
- is required when using this layer as the first layer in a model.
- input_length: Length of input sequences, to be specified
- when it is constant.
- This argument is required if you are going to connect
- `Flatten` then `Dense` layers upstream
- (without it, the shape of the dense outputs cannot be computed).
- Note that if the recurrent layer is not the first layer - in your model, you would need to specify the input length - at the level of the first layer - (e.g. via the `input_shape` argument) - - Input shape: - 3D tensor with shape `(batch_size, timesteps, input_dim)`, - (Optional) 2D tensors with shape `(batch_size, output_dim)`. - - Output shape: - - if `return_state`: a list of tensors. The first tensor is - the output. The remaining tensors are the last states, - each with shape `(batch_size, units)`. - - if `return_sequences`: 3D tensor with shape - `(batch_size, timesteps, units)`. - - else, 2D tensor with shape `(batch_size, units)`. - - # Masking - This layer supports masking for input data with a variable number - of timesteps. To introduce masks to your data, - use an `Embedding` layer with the `mask_zero` parameter - set to `True`. - - # Note on using statefulness in RNNs - You can set RNN layers to be 'stateful', which means that the states - computed for the samples in one batch will be reused as initial states - for the samples in the next batch. This assumes a one-to-one mapping - between samples in different successive batches. - - To enable statefulness: - - specify `stateful=True` in the layer constructor. - - specify a fixed batch size for your model, by passing - if sequential model: - `batch_input_shape=(...)` to the first layer in your model. - else for functional model with 1 or more Input layers: - `batch_shape=(...)` to all the first layers in your model. - This is the expected shape of your inputs - *including the batch size*. - It should be a tuple of integers, e.g. `(32, 10, 100)`. - - specify `shuffle=False` when calling fit(). - - To reset the states of your model, call `.reset_states()` on either - a specific layer, or on your entire model. - - # Note on specifying the initial state of RNNs - You can specify the initial state of RNN layers symbolically by - calling them with the keyword argument `initial_state`. The value of - `initial_state` should be a tensor or list of tensors representing - the initial state of the RNN layer. - - You can specify the initial state of RNN layers numerically by - calling `reset_states` with the keyword argument `states`. The value of - `states` should be a numpy array or list of numpy arrays representing - the initial state of the RNN layer. 
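The note above on symbolic initial states is easy to get wrong from prose alone, so a minimal sketch is worth spelling out. Shapes and variable names here are illustrative only, and `keras` stands for the public API that re-exports these layers:

```python
# A symbolic initial state becomes an extra model input.
import keras

timesteps, input_dim, units = 10, 8, 32  # made-up sizes

inputs = keras.Input((timesteps, input_dim))
# A GRU carries one state tensor; an LSTM would expect a list of two
# tensors (h and c), each of shape (batch_size, units).
initial_h = keras.Input((units,))
outputs = keras.layers.GRU(units)(inputs, initial_state=initial_h)

model = keras.models.Model([inputs, initial_h], outputs)
```

The numerical route is the one described in the last paragraph: make the layer stateful and call `reset_states(states=...)` with concrete numpy arrays of shape `(batch_size, units)`.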
- """ - - def __init__(self, - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - unroll=False, - implementation=0, - **kwargs): - super(Recurrent, self).__init__(**kwargs) - self.return_sequences = return_sequences - self.return_state = return_state - self.go_backwards = go_backwards - self.stateful = stateful - self.unroll = unroll - self.implementation = implementation - self.supports_masking = True - self.input_spec = [InputSpec(ndim=3)] - self.state_spec = None - self.dropout = 0 - self.recurrent_dropout = 0 - - def _compute_output_shape(self, input_shape): - if isinstance(input_shape, list): - input_shape = input_shape[0] - input_shape = tensor_shape.TensorShape(input_shape).as_list() - if self.return_sequences: - output_shape = (input_shape[0], input_shape[1], self.units) - else: - output_shape = (input_shape[0], self.units) - - if self.return_state: - state_shape = [tensor_shape.TensorShape( - (input_shape[0], self.units)) for _ in self.states] - return [tensor_shape.TensorShape(output_shape)] + state_shape - return tensor_shape.TensorShape(output_shape) - - def compute_mask(self, inputs, mask): - if isinstance(mask, list): - mask = mask[0] - output_mask = mask if self.return_sequences else None - if self.return_state: - state_mask = [None for _ in self.states] - return [output_mask] + state_mask - return output_mask - - def step(self, inputs, states): - raise NotImplementedError - - def get_constants(self, inputs, training=None): - return [] - - def get_initial_state(self, inputs): - # build an all-zero tensor of shape (samples, output_dim) - initial_state = K.zeros_like(inputs) # (samples, timesteps, input_dim) - initial_state = K.sum(initial_state, axis=(1, 2)) # (samples,) - initial_state = K.expand_dims(initial_state) # (samples, 1) - initial_state = K.tile(initial_state, [1, - self.units]) # (samples, output_dim) - initial_state = [initial_state for _ in range(len(self.states))] - return initial_state - - def preprocess_input(self, inputs, training=None): - return inputs - - def __call__(self, inputs, initial_state=None, **kwargs): - if (isinstance(inputs, (list, tuple)) and - len(inputs) > 1 - and initial_state is None): - initial_state = inputs[1:] - inputs = inputs[0] - - # If `initial_state` is specified, - # and if it a Keras tensor, - # then add it to the inputs and temporarily - # modify the input spec to include the state. 
- if initial_state is None:
- return super(Recurrent, self).__call__(inputs, **kwargs)
-
- if not isinstance(initial_state, (list, tuple)):
- initial_state = [initial_state]
-
- is_keras_tensor = hasattr(initial_state[0], '_keras_history')
- for tensor in initial_state:
- if hasattr(tensor, '_keras_history') != is_keras_tensor:
- raise ValueError('The initial state of an RNN layer cannot be'
- ' specified with a mix of Keras tensors and'
- ' non-Keras tensors')
-
- if is_keras_tensor:
- # Compute the full input spec, including state
- input_spec = self.input_spec
- state_spec = self.state_spec
- if not isinstance(input_spec, list):
- input_spec = [input_spec]
- if not isinstance(state_spec, list):
- state_spec = [state_spec]
- self.input_spec = input_spec + state_spec
-
- # Compute the full inputs, including state
- inputs = [inputs] + list(initial_state)
-
- # Perform the call
- output = super(Recurrent, self).__call__(inputs, **kwargs)
-
- # Restore original input spec
- self.input_spec = input_spec
- return output
- else:
- kwargs['initial_state'] = initial_state
- return super(Recurrent, self).__call__(inputs, **kwargs)
-
- def call(self, inputs, mask=None, training=None, initial_state=None):
- # input shape: `(samples, time (padded with zeros), input_dim)`
- # note that the .build() method of subclasses MUST define
- # self.input_spec and self.state_spec with complete input shapes.
- if isinstance(inputs, list):
- initial_state = inputs[1:]
- inputs = inputs[0]
- elif initial_state is not None:
- pass
- elif self.stateful:
- initial_state = self.states
- else:
- initial_state = self.get_initial_state(inputs)
-
- if isinstance(mask, list):
- mask = mask[0]
-
- if len(initial_state) != len(self.states):
- raise ValueError('Layer has ' + str(len(self.states)) +
- ' states but was passed ' + str(len(initial_state)) +
- ' initial states.')
- input_shape = K.int_shape(inputs)
- if self.unroll and input_shape[1] is None:
- raise ValueError('Cannot unroll an RNN if the '
- 'time dimension is undefined. \n'
- '- If using a Sequential model, '
- 'specify the time dimension by passing '
- 'an `input_shape` or `batch_input_shape` '
- 'argument to your first layer. If your '
- 'first layer is an Embedding, you can '
- 'also use the `input_length` argument.\n'
- '- If using the functional API, specify '
- 'the time dimension by passing a `shape` '
- 'or `batch_shape` argument to your Input layer.')
- constants = self.get_constants(inputs, training=training)
- preprocessed_input = self.preprocess_input(inputs, training=training)
- last_output, outputs, states = K.rnn(
- self.step,
- preprocessed_input,
- initial_state,
- go_backwards=self.go_backwards,
- mask=mask,
- constants=constants,
- unroll=self.unroll)
- if self.stateful:
- updates = []
- for i in range(len(states)):
- updates.append((self.states[i], states[i]))
- self.add_update(updates, inputs)
-
- # Properly set learning phase
- if 0 < self.dropout + self.recurrent_dropout:
- last_output._uses_learning_phase = True
- outputs._uses_learning_phase = True
-
- if not self.return_sequences:
- outputs = last_output
-
- if self.return_state:
- if not isinstance(states, (list, tuple)):
- states = [states]
- else:
- states = list(states)
- return [outputs] + states
- return outputs
-
- def reset_states(self, states=None):
- if not self.stateful:
- raise AttributeError('Layer must be stateful.')
- batch_size = self.input_spec[0].shape[0]
- if not batch_size:
- raise ValueError('If a RNN is stateful, it needs to know '
- 'its batch size.
Specify the batch size '
- 'of your input tensors: \n'
- '- If using a Sequential model, '
- 'specify the batch size by passing '
- 'a `batch_input_shape` '
- 'argument to your first layer.\n'
- '- If using the functional API, specify '
- 'the batch size by passing a '
- '`batch_shape` argument to your Input layer.')
- # initialize state if None
- if self.states[0] is None:
- self.states = [K.zeros((batch_size, self.units)) for _ in self.states]
- elif states is None:
- for state in self.states:
- K.set_value(state, np.zeros((batch_size, self.units)))
- else:
- if not isinstance(states, (list, tuple)):
- states = [states]
- if len(states) != len(self.states):
- raise ValueError('Layer ' + self.name + ' expects ' +
- str(len(self.states)) + ' states, '
- 'but it received ' + str(len(states)) +
- ' state values. Input received: ' + str(states))
- for index, (value, state) in enumerate(zip(states, self.states)):
- if value.shape != (batch_size, self.units):
- raise ValueError('State ' + str(index) +
- ' is incompatible with layer ' + self.name +
- ': expected shape=' + str((batch_size, self.units)) +
- ', found shape=' + str(value.shape))
- K.set_value(state, value)
-
- def get_config(self):
- config = {
- 'return_sequences': self.return_sequences,
- 'return_state': self.return_state,
- 'go_backwards': self.go_backwards,
- 'stateful': self.stateful,
- 'unroll': self.unroll,
- 'implementation': self.implementation
- }
- base_config = super(Recurrent, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
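Since the `implementation` modes described throughout this diff differ only in how the gate projections are batched, a standalone sketch of the fused mode-2 LSTM step may help. This is plain NumPy, with a logistic sigmoid standing in for the default `hard_sigmoid` recurrent activation; every name in it is illustrative rather than part of the Keras API:

```python
# NumPy sketch of the implementation=2 LSTM step: one fused
# (input_dim, 4 * units) kernel instead of four per-gate kernels.
import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def lstm_step(x, h_tm1, c_tm1, kernel, recurrent_kernel, bias):
    # Single fused projection, then a split into the i, f, c, o
    # quarters, mirroring z0..z3 in the step() method above.
    z = np.dot(x, kernel) + np.dot(h_tm1, recurrent_kernel) + bias
    z0, z1, z2, z3 = np.split(z, 4, axis=-1)
    i = sigmoid(z0)                  # input gate
    f = sigmoid(z1)                  # forget gate
    c = f * c_tm1 + i * np.tanh(z2)  # new carry (cell) state
    o = sigmoid(z3)                  # output gate
    h = o * np.tanh(c)               # new hidden state
    return h, c


batch, input_dim, units = 2, 8, 4
rng = np.random.RandomState(0)
h, c = lstm_step(
    rng.randn(batch, input_dim),
    np.zeros((batch, units)),
    np.zeros((batch, units)),
    rng.randn(input_dim, 4 * units),
    rng.randn(units, 4 * units),
    np.zeros(4 * units))
```

Mode 1 performs the same arithmetic as four separate `units`-wide products, so the modes trade speed and memory rather than changing results (up to how the dropout masks are shared across gates).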
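Finally, the stateful contract that `reset_states` above enforces is easiest to see end to end. A minimal sketch with made-up shapes, again through the public `keras` API:

```python
# Stateful workflow: fixed batch size, no shuffling, explicit resets.
import numpy as np
import keras

batch_size, timesteps, input_dim = 32, 10, 8  # made-up sizes

model = keras.models.Sequential([
    keras.layers.LSTM(16, stateful=True,
                      batch_input_shape=(batch_size, timesteps, input_dim)),
    keras.layers.Dense(1),
])
model.compile(optimizer='rmsprop', loss='mse')

x = np.random.random((batch_size, timesteps, input_dim))
y = np.random.random((batch_size, 1))

# shuffle=False preserves the sample-index-to-state alignment that
# statefulness assumes between consecutive batches.
model.fit(x, y, batch_size=batch_size, epochs=2, shuffle=False)

# States persist across batches until explicitly cleared.
model.reset_states()
```

If the batch size cannot be inferred, `reset_states` raises the `ValueError` shown above, which is why `batch_input_shape` (rather than `input_shape`) is required for stateful layers.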