Backport changes from GitHub master.

PiperOrigin-RevId: 157603238
author: Francois Chollet <fchollet@google.com> 2017-05-31 10:33:22 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-05-31 10:38:01 -0700
commit: d21bf7d7502f447e5f967a479282b32b5845ba8b (patch)
tree: 455de8814f480728c77d919a8d2a170e2a47b300 /tensorflow/contrib/keras
parent: 43bfc138c9676fb54945fbede977b90a0c0aea79 (diff)
34 files changed, 1027 insertions, 550 deletions
diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD
index b38bcd1e8f..f7f56f6fcf 100644
--- a/tensorflow/contrib/keras/BUILD
+++ b/tensorflow/contrib/keras/BUILD
@@ -123,6 +123,7 @@ py_library(
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
+        "//tensorflow/python:platform",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
diff --git a/tensorflow/contrib/keras/python/keras/__init__.py b/tensorflow/contrib/keras/python/keras/__init__.py
index ec316253db..1c1485c0cd 100644
--- a/tensorflow/contrib/keras/python/keras/__init__.py
+++ b/tensorflow/contrib/keras/python/keras/__init__.py
@@ -35,6 +35,6 @@ from tensorflow.contrib.keras.python.keras import preprocessing
 from tensorflow.contrib.keras.python.keras import regularizers
 from tensorflow.contrib.keras.python.keras import utils
 from tensorflow.contrib.keras.python.keras import wrappers
+from tensorflow.contrib.keras.python.keras.layers import Input
 
-
-__version__ = '2.0.2-tf'
+__version__ = '2.0.4-tf'
diff --git a/tensorflow/contrib/keras/python/keras/activations.py b/tensorflow/contrib/keras/python/keras/activations.py
index 67762c83ba..35d15e74c2 100644
--- a/tensorflow/contrib/keras/python/keras/activations.py
+++ b/tensorflow/contrib/keras/python/keras/activations.py
@@ -21,7 +21,9 @@ from __future__ import print_function
 import six
 
 from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.engine import Layer
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.platform import tf_logging as logging
 
 
 def softmax(x, axis=-1):
@@ -99,6 +101,12 @@ def get(identifier):
     identifier = str(identifier)
     return deserialize(identifier)
   elif callable(identifier):
+    if isinstance(identifier, Layer):
+      logging.warning(
+          'Do not pass a layer instance (such as {identifier}) as the '
+          'activation argument of another layer. Instead, advanced '
+          'activation layers should be used just like any other '
+          'layer in a model.'.format(identifier=identifier.__class__.__name__))
     return identifier
   else:
     raise ValueError('Could not interpret '
diff --git a/tensorflow/contrib/keras/python/keras/applications/inception_v3.py b/tensorflow/contrib/keras/python/keras/applications/inception_v3.py
index 3fc16c88ca..f77e4a8341 100644
--- a/tensorflow/contrib/keras/python/keras/applications/inception_v3.py
+++ b/tensorflow/contrib/keras/python/keras/applications/inception_v3.py
@@ -29,8 +29,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import warnings
-
 from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras import layers
 from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
@@ -47,7 +45,6 @@ from tensorflow.contrib.keras.python.keras.layers import Input
 from tensorflow.contrib.keras.python.keras.layers import MaxPooling2D
 from tensorflow.contrib.keras.python.keras.models import Model
 from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
-from tensorflow.contrib.keras.python.keras.utils.layer_utils import convert_all_kernels_in_model
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels.h5'
@@ -371,16 +368,6 @@ def InceptionV3(include_top=True,
 
   # load weights
   if weights == 'imagenet':
-    if K.image_data_format() == 'channels_first':
-      if K.backend() == 'tensorflow':
-        warnings.warn('You are using the TensorFlow backend, yet you '
-                      'are using the Theano '
-                      'image data format convention '
-                      '(`image_data_format="channels_first"`). '
-                      'For best performance, set '
-                      '`image_data_format="channels_last"` in '
-                      'your Keras config '
-                      'at ~/.keras/keras.json.')
     if include_top:
       weights_path = get_file(
           'inception_v3_weights_tf_dim_ordering_tf_kernels.h5',
@@ -394,8 +381,6 @@ def InceptionV3(include_top=True,
           cache_subdir='models',
           md5_hash='bcbd6486424b2319ff4ef7d526e38f63')
     model.load_weights(weights_path)
-    if K.backend() == 'theano':
-      convert_all_kernels_in_model(model)
   return model
 
 
diff --git a/tensorflow/contrib/keras/python/keras/applications/resnet50.py b/tensorflow/contrib/keras/python/keras/applications/resnet50.py
index 12f7ca424e..ce7d0bb046 100644
--- a/tensorflow/contrib/keras/python/keras/applications/resnet50.py
+++ b/tensorflow/contrib/keras/python/keras/applications/resnet50.py
@@ -26,8 +26,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import warnings
-
 from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras import layers
 from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
@@ -46,7 +44,6 @@ from tensorflow.contrib.keras.python.keras.layers import Input
 from tensorflow.contrib.keras.python.keras.layers import MaxPooling2D
 from tensorflow.contrib.keras.python.keras.layers import ZeroPadding2D
 from tensorflow.contrib.keras.python.keras.models import Model
-from tensorflow.contrib.keras.python.keras.utils import layer_utils
 from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
 
 
@@ -172,7 +169,7 @@ def ResNet50(include_top=True,
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
-          or `(3, 224, 244)` (with `channels_first` data format).
+          or `(3, 224, 224)` (with `channels_first` data format).
           It should have exactly 3 inputs channels,
           and width and height should be no smaller than 197.
           E.g. `(200, 200, 3)` would be one valid value.
@@ -286,24 +283,4 @@ def ResNet50(include_top=True,
           cache_subdir='models',
           md5_hash='a268eb855778b3df3c7506639542a6af')
     model.load_weights(weights_path)
-    if K.backend() == 'theano':
-      layer_utils.convert_all_kernels_in_model(model)
-
-    if K.image_data_format() == 'channels_first':
-      if include_top:
-        maxpool = model.get_layer(name='avg_pool')
-        shape = maxpool.output_shape[1:]
-        dense = model.get_layer(name='fc1000')
-        layer_utils.convert_dense_weights_data_format(dense, shape,
-                                                      'channels_first')
-
-      if K.backend() == 'tensorflow':
-        warnings.warn('You are using the TensorFlow backend, yet you '
-                      'are using the Theano '
-                      'image data format convention '
-                      '(`image_data_format="channels_first"`). '
-                      'For best performance, set '
-                      '`image_data_format="channels_last"` in '
-                      'your Keras config '
-                      'at ~/.keras/keras.json.')
   return model
diff --git a/tensorflow/contrib/keras/python/keras/applications/vgg16.py b/tensorflow/contrib/keras/python/keras/applications/vgg16.py
index 7fc393055f..89bbb040e6 100644
--- a/tensorflow/contrib/keras/python/keras/applications/vgg16.py
+++ b/tensorflow/contrib/keras/python/keras/applications/vgg16.py
@@ -25,8 +25,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import warnings
-
 from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
@@ -77,7 +75,7 @@ def VGG16(include_top=True,
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
-          or `(3, 224, 244)` (with `channels_first` data format).
+          or `(3, 224, 224)` (with `channels_first` data format).
           It should have exactly 3 inputs channels,
           and width and height should be no smaller than 48.
           E.g. `(200, 200, 3)` would be one valid value.
@@ -210,14 +208,4 @@ def VGG16(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
-
-      if K.backend() == 'tensorflow':
-        warnings.warn('You are using the TensorFlow backend, yet you '
-                      'are using the Theano '
-                      'image data format convention '
-                      '(`image_data_format="channels_first"`). '
-                      'For best performance, set '
-                      '`image_data_format="channels_last"` in '
-                      'your Keras config '
-                      'at ~/.keras/keras.json.')
   return model
diff --git a/tensorflow/contrib/keras/python/keras/applications/vgg19.py b/tensorflow/contrib/keras/python/keras/applications/vgg19.py
index f7c2921b5c..522a516ecf 100644
--- a/tensorflow/contrib/keras/python/keras/applications/vgg19.py
+++ b/tensorflow/contrib/keras/python/keras/applications/vgg19.py
@@ -25,8 +25,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import warnings
-
 from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
@@ -77,7 +75,7 @@ def VGG19(include_top=True,
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
-          or `(3, 224, 244)` (with `channels_first` data format).
+          or `(3, 224, 224)` (with `channels_first` data format).
           It should have exactly 3 inputs channels,
           and width and height should be no smaller than 48.
           E.g. `(200, 200, 3)` would be one valid value.
@@ -216,14 +214,4 @@ def VGG19(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
-
-      if K.backend() == 'tensorflow':
-        warnings.warn('You are using the TensorFlow backend, yet you '
-                      'are using the Theano '
-                      'image data format convention '
-                      '(`image_data_format="channels_first"`). '
-                      'For best performance, set '
-                      '`image_data_format="channels_last"` in '
-                      'your Keras config '
-                      'at ~/.keras/keras.json.')
   return model
diff --git a/tensorflow/contrib/keras/python/keras/applications/xception.py b/tensorflow/contrib/keras/python/keras/applications/xception.py
index 3b08e73514..49fb6008f6 100644
--- a/tensorflow/contrib/keras/python/keras/applications/xception.py
+++ b/tensorflow/contrib/keras/python/keras/applications/xception.py
@@ -36,8 +36,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import warnings
-
 from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras import layers
 from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
@@ -54,6 +52,7 @@ from tensorflow.contrib.keras.python.keras.layers import MaxPooling2D
 from tensorflow.contrib.keras.python.keras.layers import SeparableConv2D
 from tensorflow.contrib.keras.python.keras.models import Model
 from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
 
 
 TF_WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels.h5'
@@ -127,7 +126,7 @@ def Xception(include_top=True,
     raise RuntimeError('The Xception model is only available with '
                        'the TensorFlow backend.')
   if K.image_data_format() != 'channels_last':
-    warnings.warn(
+    logging.warning(
         'The Xception model is only available for the '
         'input data format "channels_last" '
         '(width, height, channels). '
diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py
index ed2b251b31..84d0dacce9 100644
--- a/tensorflow/contrib/keras/python/keras/backend.py
+++ b/tensorflow/contrib/keras/python/keras/backend.py
@@ -23,7 +23,6 @@ from __future__ import print_function
 
 import json
 import os
-import warnings
 
 import numpy as np
 
@@ -52,6 +51,7 @@ from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-im
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_module
 from tensorflow.python.training import moving_averages
+from tensorflow.python.util import tf_inspect
 
 
 py_all = all
@@ -386,6 +386,17 @@ def set_session(session):
 
 
 def _convert_string_dtype(dtype):
+  """Get the type from a string.
+
+  Arguments:
+      dtype: A string representation of a type.
+
+  Returns:
+      The type requested.
+
+  Raises:
+      ValueError: if `dtype` is not supported.
+  """
   if dtype == 'float16':
     return dtypes_module.float16
   if dtype == 'float32':
@@ -407,6 +418,15 @@ def _convert_string_dtype(dtype):
 
 
 def _to_tensor(x, dtype):
+  """Convert the input `x` to a tensor of type `dtype`.
+
+  Arguments:
+      x: An object to be converted (numpy array, list, tensors).
+      dtype: The destination type.
+
+  Returns:
+      A tensor.
+  """
   x = ops.convert_to_tensor(x)
   if x.dtype != dtype:
     x = math_ops.cast(x, dtype)
@@ -521,6 +541,17 @@ def _initialize_variables():
 
 
 def constant(value, dtype=None, shape=None, name=None):
+  """Creates a constant tensor.
+
+  Arguments:
+      value: A constant value (or list)
+      dtype: The type of the elements of the resulting tensor.
+      shape: Optional dimensions of resulting tensor.
+      name: Optional name for the tensor.
+
+  Returns:
+      A Constant Tensor.
+  """
   if dtype is None:
     dtype = floatx()
   return constant_op.constant(value, dtype=dtype, shape=shape, name=name)
@@ -833,6 +864,18 @@ def ones_like(x, dtype=None, name=None):
   return array_ops.ones_like(x, dtype=dtype, name=name)
 
 
+def identity(x):
+  """Returns a tensor with the same content as the input tensor.
+
+  Arguments:
+      x: The input tensor.
+
+  Returns:
+      A tensor of the same shape, type and content.
+  """
+  return array_ops.identity(x)
+
+
 def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None):
   """Instantiates a variable with values drawn from a uniform distribution.
 
@@ -971,14 +1014,42 @@ def update(x, new_x):
 
 
 def update_add(x, increment):
+  """Update the value of `x` by adding `increment`.
+
+  Arguments:
+      x: A Variable.
+      increment: A tensor of same shape as `x`.
+
+  Returns:
+      The variable `x` updated.
+  """
   return state_ops.assign_add(x, increment)
 
 
 def update_sub(x, decrement):
+  """Update the value of `x` by subtracting `decrement`.
+
+  Arguments:
+      x: A Variable.
+      decrement: A tensor of same shape as `x`.
+
+  Returns:
+      The variable `x` updated.
+  """
   return state_ops.assign_sub(x, decrement)
 
 
 def moving_average_update(x, value, momentum):
+  """Compute the moving average of a variable.
+
+  Arguments:
+      x: A Variable.
+      value: A tensor with the same shape as `x`.
+      momentum: The moving average momentum.
+
+  Returns:
+      An Operation to update the variable.
+  """
   return moving_averages.assign_moving_average(
       x, value, momentum, zero_debias=False)
 
@@ -1110,6 +1181,20 @@ def batch_dot(x, y, axes=None):
   """
   if isinstance(axes, int):
     axes = (axes, axes)
+  x_ndim = ndim(x)
+  y_ndim = ndim(y)
+  if x_ndim > y_ndim:
+    diff = x_ndim - y_ndim
+    y = array_ops.reshape(y,
+                          array_ops.concat(
+                              [array_ops.shape(y), [1] * (diff)], axis=0))
+  elif y_ndim > x_ndim:
+    diff = y_ndim - x_ndim
+    x = array_ops.reshape(x,
+                          array_ops.concat(
+                              [array_ops.shape(x), [1] * (diff)], axis=0))
+  else:
+    diff = 0
   if ndim(x) == 2 and ndim(y) == 2:
     if axes[0] == axes[1]:
       out = math_ops.reduce_sum(math_ops.multiply(x, y), axes[0])
@@ -1124,6 +1209,12 @@ def batch_dot(x, y, axes=None):
       adj_x = None
       adj_y = None
     out = math_ops.matmul(x, y, adjoint_a=adj_x, adjoint_b=adj_y)
+  if diff:
+    if x_ndim > y_ndim:
+      idx = x_ndim + y_ndim - 3
+    else:
+      idx = x_ndim - 1
+    out = array_ops.squeeze(out, list(range(idx, idx + diff)))
   if ndim(out) == 1:
     out = expand_dims(out, 1)
   return out
@@ -1485,6 +1576,28 @@ def log(x):
   return math_ops.log(x)
 
 
+def logsumexp(x, axis=None, keepdims=False):
+  """Computes log(sum(exp(elements across dimensions of a tensor))).
+
+  This function is more numerically stable than log(sum(exp(x))).
+  It avoids overflows caused by taking the exp of large inputs and
+  underflows caused by taking the log of small inputs.
+
+  Arguments:
+      x: A tensor or variable.
+      axis: An integer, the axis to reduce over.
+      keepdims: A boolean, whether to keep the dimensions or not.
+          If `keepdims` is `False`, the rank of the tensor is reduced
+          by 1. If `keepdims` is `True`, the reduced dimension is
+          retained with length 1.
+
+  Returns:
+      The reduced tensor.
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  return math_ops.reduce_logsumexp(x, axis=axis, keep_dims=keepdims)
+
+
 def round(x):
   """Element-wise rounding to the closest integer.
 
@@ -1986,14 +2099,14 @@ def batch_flatten(x):
 
 
 def expand_dims(x, axis=-1):
-  """Adds a 1-sized dimension at index "dim".
+  """Adds a 1-sized dimension at index "axis".
 
   Arguments:
       x: A tensor or variable.
       axis: Position where to add a new axis.
 
   Returns:
-      A tensor with expended dimensions.
+      A tensor with expanded dimensions.
   """
   return array_ops.expand_dims(x, axis)
 
@@ -2247,9 +2360,11 @@ class Function(object):
       inputs: Feed placeholders to the computation graph.
       outputs: Output tensors to fetch.
       updates: Additional update ops to be run at function call.
+      name: a name to help users identify what this function does.
   """
 
-  def __init__(self, inputs, outputs, updates=None):
+  def __init__(self, inputs, outputs, updates=None, name=None,
+               **session_kwargs):
     updates = updates or []
     if not isinstance(inputs, (list, tuple)):
       raise TypeError('`inputs` to a TensorFlow backend function '
@@ -2272,6 +2387,8 @@ class Function(object):
           # assumed already an op
           updates_ops.append(update)
       self.updates_op = control_flow_ops.group(*updates_ops)
+    self.name = name
+    self.session_kwargs = session_kwargs
 
   def __call__(self, inputs):
     if not isinstance(inputs, (list, tuple)):
@@ -2285,7 +2402,10 @@ class Function(object):
         value = (indices, sparse_coo.data, sparse_coo.shape)
       feed_dict[tensor] = value
     session = get_session()
-    updated = session.run(self.outputs + [self.updates_op], feed_dict=feed_dict)
+    updated = session.run(
+        self.outputs + [self.updates_op],
+        feed_dict=feed_dict,
+        **self.session_kwargs)
     return updated[:len(self.outputs)]
 
 
@@ -2296,18 +2416,22 @@ def function(inputs, outputs, updates=None, **kwargs):
       inputs: List of placeholder tensors.
       outputs: List of output tensors.
       updates: List of update ops.
-      **kwargs: Not used with TensorFlow.
+      **kwargs: Passed to `tf.Session.run`.
 
   Returns:
       Output values as Numpy arrays.
+
+  Raises:
+      ValueError: if invalid kwargs are passed in.
   """
   if kwargs:
-    msg = [
-        'Expected no kwargs, you passed %s' % len(kwargs),
-        'kwargs passed to function are ignored with Tensorflow backend'
-    ]
-    warnings.warn('\n'.join(msg))
-  return Function(inputs, outputs, updates=updates)
+    for key in kwargs:
+      if (key not in tf_inspect.getargspec(session_module.Session.run)[0] and
+          key not in tf_inspect.getargspec(Function.__init__)[0]):
+        msg = ('Invalid argument "%s" passed to K.function with Tensorflow '
+               'backend') % key
+        raise ValueError(msg)
+  return Function(inputs, outputs, updates=updates, **kwargs)
 
 
 def gradients(loss, variables):
@@ -2452,7 +2576,8 @@ def rnn(step_function,
           # (see earlier comment for tile explanation)
           tiled_mask_t = array_ops.tile(mask_t,
                                         array_ops.stack(
-                                            [1, array_ops.shape(new_state)[1]]))
+                                            [1,
+                                             array_ops.shape(new_state)[1]]))
           return_states.append(array_ops.where(tiled_mask_t, new_state, state))
         states = return_states
         successive_outputs.append(output)
@@ -2931,6 +3056,16 @@ def in_top_k(predictions, targets, k):
 
 
 def _preprocess_deconv_output_shape(x, shape, data_format):
+  """Get the output_shape for the deconvolution.
+
+  Arguments:
+      x: input tensor.
+      shape: output shape.
+      data_format: string, one of 'channels_last', 'channels_first'.
+
+  Returns:
+      The output shape.
+  """
   if data_format == 'channels_first':
     shape = (shape[0], shape[2], shape[3], shape[1])
 
@@ -2941,6 +3076,15 @@ def _preprocess_deconv_output_shape(x, shape, data_format):
 
 
 def _preprocess_conv2d_input(x, data_format):
+  """Transpose and cast the input before the conv2d.
+
+  Arguments:
+      x: input tensor.
+      data_format: string, one of 'channels_last', 'channels_first'.
+
+  Returns:
+      A tensor.
+  """
   if dtype(x) == 'float64':
     x = math_ops.cast(x, 'float32')
   if data_format == 'channels_first':
@@ -2953,6 +3097,15 @@ def _preprocess_conv2d_input(x, data_format):
 
 
 def _preprocess_conv3d_input(x, data_format):
+  """Transpose and cast the input before the conv3d.
+
+  Arguments:
+      x: input tensor.
+      data_format: string, one of 'channels_last', 'channels_first'.
+
+  Returns:
+      A tensor.
+  """
   if dtype(x) == 'float64':
     x = math_ops.cast(x, 'float32')
   if data_format == 'channels_first':
@@ -2961,6 +3114,15 @@ def _preprocess_conv3d_input(x, data_format):
 
 
 def _preprocess_conv2d_kernel(kernel, data_format):
+  """Transpose and cast the kernel before the conv2d.
+
+  Arguments:
+      kernel: kernel tensor.
+      data_format: string, one of 'channels_last', 'channels_first'.
+
+  Returns:
+      A tensor.
+  """
   if dtype(kernel) == 'float64':
     kernel = math_ops.cast(kernel, 'float32')
   if data_format == 'channels_first':
@@ -2969,6 +3131,15 @@ def _preprocess_conv2d_kernel(kernel, data_format):
 
 
 def _preprocess_conv3d_kernel(kernel, data_format):
+  """Transpose and cast the kernel before the conv3d.
+
+  Arguments:
+      kernel: kernel tensor.
+      data_format: string, one of 'channels_last', 'channels_first'.
+
+  Returns:
+      A tensor.
+  """
   if dtype(kernel) == 'float64':
     kernel = math_ops.cast(kernel, 'float32')
   if data_format == 'channels_first':
@@ -2977,16 +3148,37 @@ def _preprocess_conv3d_kernel(kernel, data_format):
 
 
 def _preprocess_padding(padding):
+  """Convert keras' padding to tensorflow's padding.
+
+  Arguments:
+      padding: string, one of 'same', 'valid'.
+
+  Returns:
+      a string, one of 'SAME', 'VALID'.
+
+  Raises:
+      ValueError: if `padding` is invalid.
+  """
   if padding == 'same':
     padding = 'SAME'
   elif padding == 'valid':
     padding = 'VALID'
   else:
-    raise ValueError('Invalid border mode:', padding)
+    raise ValueError('Invalid padding:', padding)
   return padding
 
 
 def _postprocess_conv2d_output(x, data_format):
+  """Transpose and cast the output from conv2d if needed.
+
+  Arguments:
+      x: A tensor.
+      data_format: string, one of "channels_last", "channels_first".
+
+  Returns:
+      A tensor.
+  """
+
   if data_format == 'channels_first':
     x = array_ops.transpose(x, (0, 3, 1, 2))
 
@@ -2996,6 +3188,15 @@ def _postprocess_conv2d_output(x, data_format):
 
 
 def _postprocess_conv3d_output(x, data_format):
+  """Transpose and cast the output from conv3d if needed.
+
+  Arguments:
+      x: A tensor.
+      data_format: string, one of "channels_last", "channels_first".
+
+  Returns:
+      A tensor.
+  """
   if data_format == 'channels_first':
     x = array_ops.transpose(x, (0, 4, 1, 2, 3))
 
diff --git a/tensorflow/contrib/keras/python/keras/callbacks.py b/tensorflow/contrib/keras/python/keras/callbacks.py
index a533e0fbda..d0587a549b 100644
--- a/tensorflow/contrib/keras/python/keras/callbacks.py
+++ b/tensorflow/contrib/keras/python/keras/callbacks.py
@@ -25,14 +25,15 @@ import csv
 import json
 import os
 import time
-import warnings
 
 import numpy as np
+import six
 
 from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
 from tensorflow.contrib.tensorboard.plugins import projector
 from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary as tf_summary
 from tensorflow.python.training import saver as saver_lib
 
@@ -110,7 +111,7 @@ class CallbackList(object):
     delta_t_median = np.median(self._delta_ts_batch_begin)
     if (self._delta_t_batch > 0. and
         delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
-      warnings.warn(
+      logging.warning(
           'Method on_batch_begin() is slow compared '
           'to the batch update (%f). Check your callbacks.' % delta_t_median)
     self._t_enter_batch = time.time()
@@ -133,7 +134,7 @@ class CallbackList(object):
     delta_t_median = np.median(self._delta_ts_batch_end)
     if (self._delta_t_batch > 0. and
         (delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1)):
-      warnings.warn(
+      logging.warning(
           'Method on_batch_end() is slow compared '
           'to the batch update (%f). Check your callbacks.' % delta_t_median)
 
@@ -245,6 +246,21 @@ class BaseLogger(Callback):
           logs[k] = self.totals[k] / self.seen
 
 
+class TerminateOnNaN(Callback):
+  """Callback that terminates training on a NaN or infinite loss."""
+
+  def __init__(self):
+    super(TerminateOnNaN, self).__init__()
+
+  def on_batch_end(self, batch, logs=None):
+    logs = logs or {}
+    loss = logs.get('loss')
+    if loss is not None:
+      if np.isnan(loss) or np.isinf(loss):
+        print('Batch %d: Invalid loss, terminating training' % (batch))
+        self.model.stop_training = True
+
+
 class ProgbarLogger(Callback):
   """Callback that prints metrics to stdout.
 
@@ -380,8 +396,8 @@ class ModelCheckpoint(Callback):
     self.epochs_since_last_save = 0
 
     if mode not in ['auto', 'min', 'max']:
-      warnings.warn('ModelCheckpoint mode %s is unknown, '
-                    'fallback to auto mode.' % (mode), RuntimeWarning)
+      logging.warning('ModelCheckpoint mode %s is unknown, '
+                      'fallback to auto mode.' % (mode))
       mode = 'auto'
 
     if mode == 'min':
@@ -407,8 +423,8 @@ class ModelCheckpoint(Callback):
       if self.save_best_only:
         current = logs.get(self.monitor)
         if current is None:
-          warnings.warn('Can save best model only with %s available, '
-                        'skipping.' % (self.monitor), RuntimeWarning)
+          logging.warning('Can save best model only with %s available, '
+                          'skipping.' % (self.monitor))
         else:
           if self.monitor_op(current, self.best):
             if self.verbose > 0:
@@ -469,8 +485,8 @@ class EarlyStopping(Callback):
     self.stopped_epoch = 0
 
     if mode not in ['auto', 'min', 'max']:
-      warnings.warn('EarlyStopping mode %s is unknown, '
-                    'fallback to auto mode.' % (self.mode), RuntimeWarning)
+      logging.warning('EarlyStopping mode %s is unknown, '
+                      'fallback to auto mode.' % (self.mode))
       mode = 'auto'
 
     if mode == 'min':
@@ -489,14 +505,15 @@ class EarlyStopping(Callback):
       self.min_delta *= -1
 
   def on_train_begin(self, logs=None):
-    self.wait = 0  # Allow instances to be re-used
+    # Allow instances to be re-used
+    self.wait = 0
+    self.stopped_epoch = 0
     self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
   def on_epoch_end(self, epoch, logs=None):
     current = logs.get(self.monitor)
     if current is None:
-      warnings.warn('Early stopping requires %s available!' % (self.monitor),
-                    RuntimeWarning)
+      logging.warning('Early stopping requires %s available!' % (self.monitor))
 
     if self.monitor_op(current - self.min_delta, self.best):
       self.best = current
@@ -526,8 +543,7 @@ class RemoteMonitor(Callback):
       field: String; JSON field under which the data will be stored.
       headers: Dictionary; optional custom HTTP headers.
           Defaults to:
-          `{'Accept': 'application/json',
-            'Content-Type': 'application/json'}`
+          `{'Accept': 'application/json', 'Content-Type': 'application/json'}`
   """
 
   def __init__(self,
@@ -559,8 +575,8 @@ class RemoteMonitor(Callback):
           self.root + self.path, {self.field: json.dumps(send)},
           headers=self.headers)
     except requests.exceptions.RequestException:
-      warnings.warn('Warning: could not reach RemoteMonitor '
-                    'root server at ' + str(self.root))
+      logging.warning('Warning: could not reach RemoteMonitor '
+                      'root server at ' + str(self.root))
 
 
 class LearningRateScheduler(Callback):
@@ -595,17 +611,34 @@ class TensorBoard(Callback):
   metrics, as well as activation histograms for the different
   layers in your model.
 
+  TensorBoard is a visualization tool provided with TensorFlow.
+
+  If you have installed TensorFlow with pip, you should be able
+  to launch TensorBoard from the command line:
+
+  ```
+  tensorboard --logdir=/full_path_to_your_logs
+  ```
+
+  You can find more information about TensorBoard
+  [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
+
   Arguments:
       log_dir: the path of the directory where to save the log
-          files to be parsed by Tensorboard.
+          files to be parsed by TensorBoard.
       histogram_freq: frequency (in epochs) at which to compute activation
-          histograms for the layers of the model. If set to 0,
-          histograms won't be computed.
-      write_graph: whether to visualize the graph in Tensorboard.
+          and weight histograms for the layers of the model. If set to 0,
+          histograms won't be computed. Validation data (or split) must be
+          specified for histogram visualizations.
+      write_graph: whether to visualize the graph in TensorBoard.
           The log file can become quite large when
           write_graph is set to True.
+      write_grads: whether to visualize gradient histograms in TensorBoard.
+          `histogram_freq` must be greater than 0.
+      batch_size: size of batch of inputs to feed to the network
+          for histogram computation.
       write_images: whether to write model weights to visualize as
-          image in Tensorboard.
+          image in TensorBoard.
       embeddings_freq: frequency (in epochs) at which selected embedding
           layers will be saved.
       embeddings_layer_names: a list of names of layers to keep eye on. If
@@ -622,7 +655,9 @@ class TensorBoard(Callback):
   def __init__(self,
                log_dir='./logs',
                histogram_freq=0,
+               batch_size=32,
                write_graph=True,
+               write_grads=False,
                write_images=False,
                embeddings_freq=0,
                embeddings_layer_names=None,
@@ -632,27 +667,47 @@ class TensorBoard(Callback):
     self.histogram_freq = histogram_freq
     self.merged = None
     self.write_graph = write_graph
+    self.write_grads = write_grads
     self.write_images = write_images
     self.embeddings_freq = embeddings_freq
     self.embeddings_layer_names = embeddings_layer_names
     self.embeddings_metadata = embeddings_metadata or {}
+    self.batch_size = batch_size
 
   def set_model(self, model):
     self.model = model
     self.sess = K.get_session()
     if self.histogram_freq and self.merged is None:
       for layer in self.model.layers:
-
         for weight in layer.weights:
           tf_summary.histogram(weight.name, weight)
+          if self.write_grads:
+            grads = model.optimizer.get_gradients(model.total_loss, weight)
+            tf_summary.histogram('{}_grad'.format(weight.name), grads)
           if self.write_images:
             w_img = array_ops.squeeze(weight)
-            shape = w_img.get_shape()
-            if len(shape) > 1 and shape[0] > shape[1]:
-              w_img = array_ops.transpose(w_img)
-            if len(shape) == 1:
-              w_img = array_ops.expand_dims(w_img, 0)
-            w_img = array_ops.expand_dims(array_ops.expand_dims(w_img, 0), -1)
+            shape = K.int_shape(w_img)
+            if len(shape) == 2:  # dense layer kernel case
+              if shape[0] > shape[1]:
+                w_img = array_ops.transpose(w_img)
+                shape = K.int_shape(w_img)
+              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
+            elif len(shape) == 3:  # convnet case
+              if K.image_data_format() == 'channels_last':
+                # switch to channels_first to display
+                # every kernel as a separate image
+                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
+                shape = K.int_shape(w_img)
+              w_img = array_ops.reshape(w_img,
+                                        [shape[0], shape[1], shape[2], 1])
+            elif len(shape) == 1:  # bias case
+              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
+            else:
+              # not possible to handle 3D convnets etc.
+              continue
+
+            shape = K.int_shape(w_img)
+            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
             tf_summary.image(weight.name, w_img)
 
         if hasattr(layer, 'output'):
@@ -665,8 +720,6 @@ class TensorBoard(Callback):
       self.writer = tf_summary.FileWriter(self.log_dir)
 
     if self.embeddings_freq:
-      self.saver = saver_lib.Saver()
-
       embeddings_layer_names = self.embeddings_layer_names
 
       if not embeddings_layer_names:
@@ -680,6 +733,8 @@ class TensorBoard(Callback):
           for layer in self.model.layers if layer.name in embeddings_layer_names
       }
 
+      self.saver = saver_lib.Saver(list(embeddings.values()))
+
       embeddings_metadata = {}
 
       if not isinstance(self.embeddings_metadata, str):
@@ -691,15 +746,13 @@ class TensorBoard(Callback):
         }
 
       config = projector.ProjectorConfig()
-      self.embeddings_logs = []
+      self.embeddings_ckpt_path = os.path.join(self.log_dir,
+                                               'keras_embedding.ckpt')
 
       for layer_name, tensor in embeddings.items():
         embedding = config.embeddings.add()
         embedding.tensor_name = tensor.name
 
-        self.embeddings_logs.append(
-            os.path.join(self.log_dir, layer_name + '.ckpt'))
-
         if layer_name in embeddings_metadata:
           embedding.metadata_path = embeddings_metadata[layer_name]
 
@@ -710,24 +763,34 @@ class TensorBoard(Callback):
 
     if self.validation_data and self.histogram_freq:
       if epoch % self.histogram_freq == 0:
-        # TODO(fchollet): implement batched calls to sess.run
-        # (current call will likely go OOM on GPU)
+
+        val_data = self.validation_data
+        tensors = (
+            self.model.inputs + self.model.targets + self.model.sample_weights)
+
         if self.model.uses_learning_phase:
-          cut_v_data = len(self.model.inputs)
-          val_data = self.validation_data[:cut_v_data] + [0]
-          tensors = self.model.inputs + [K.learning_phase()]
-        else:
-          val_data = self.validation_data
-          tensors = self.model.inputs
-        feed_dict = dict(zip(tensors, val_data))
-        result = self.sess.run([self.merged], feed_dict=feed_dict)
-        summary_str = result[0]
-        self.writer.add_summary(summary_str, epoch)
-
-    if self.embeddings_freq and self.embeddings_logs:
+          tensors += [K.learning_phase()]
+
+        assert len(val_data) == len(tensors)
+        val_size = val_data[0].shape[0]
+        i = 0
+        while i < val_size:
+          step = min(self.batch_size, val_size - i)
+          if self.model.uses_learning_phase:
+            # Slice every data array; the trailing learning-phase flag is not.
+            batch_val = [x[i:i + step] for x in val_data[:-1]]
+            batch_val.append(val_data[-1])
+          else:
+            batch_val = [x[i:i + step] for x in val_data]
+          feed_dict = dict(zip(tensors, batch_val))
+          result = self.sess.run([self.merged], feed_dict=feed_dict)
+          summary_str = result[0]
+          self.writer.add_summary(summary_str, epoch)
+          i += self.batch_size
+
+    if self.embeddings_freq and self.embeddings_ckpt_path:
       if epoch % self.embeddings_freq == 0:
-        for log in self.embeddings_logs:
-          self.saver.save(self.sess, log, epoch)
+        self.saver.save(self.sess, self.embeddings_ckpt_path, epoch)
 
     for name, value in logs.items():
       if name in ['batch', 'size']:
@@ -752,11 +815,12 @@ class ReduceLROnPlateau(Callback):
   of epochs, the learning rate is reduced.
 
   Example:
-      ```python
-          reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
-                                        patience=5, min_lr=0.001)
-          model.fit(X_train, Y_train, callbacks=[reduce_lr])
-      ```
+
+  ```python
+  reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
+                                patience=5, min_lr=0.001)
+  model.fit(X_train, Y_train, callbacks=[reduce_lr])
+  ```
 
   Arguments:
       monitor: quantity to be monitored.
@@ -810,8 +874,8 @@ class ReduceLROnPlateau(Callback):
     """Resets wait counter and cooldown counter.
     """
     if self.mode not in ['auto', 'min', 'max']:
-      warnings.warn('Learning Rate Plateau Reducing mode %s is unknown, '
-                    'fallback to auto mode.' % (self.mode), RuntimeWarning)
+      logging.warning('Learning Rate Plateau Reducing mode %s is unknown, '
+                      'fallback to auto mode.' % (self.mode))
       self.mode = 'auto'
     if (self.mode == 'min' or
         (self.mode == 'auto' and 'acc' not in self.monitor)):
@@ -832,8 +896,8 @@ class ReduceLROnPlateau(Callback):
     logs['lr'] = K.get_value(self.model.optimizer.lr)
     current = logs.get(self.monitor)
     if current is None:
-      warnings.warn('Learning Rate Plateau Reducing requires %s available!' %
-                    self.monitor, RuntimeWarning)
+      logging.warning('Learning Rate Plateau Reducing requires %s available!' %
+                      self.monitor)
     else:
       if self.in_cooldown():
         self.cooldown_counter -= 1
@@ -868,8 +932,8 @@ class CSVLogger(Callback):
 
   Example:
       ```python
-          csv_logger = CSVLogger('training.log')
-          model.fit(X_train, Y_train, callbacks=[csv_logger])
+      csv_logger = CSVLogger('training.log')
+      model.fit(X_train, Y_train, callbacks=[csv_logger])
       ```
 
   Arguments:
@@ -886,23 +950,26 @@ class CSVLogger(Callback):
     self.writer = None
     self.keys = None
     self.append_header = True
+    self.file_flags = 'b' if six.PY2 and os.name == 'nt' else ''
     super(CSVLogger, self).__init__()
 
   def on_train_begin(self, logs=None):
     if self.append:
       if os.path.exists(self.filename):
-        with open(self.filename) as f:
+        with open(self.filename, 'r' + self.file_flags) as f:
           self.append_header = not bool(len(f.readline()))
-      self.csv_file = open(self.filename, 'a')
+      self.csv_file = open(self.filename, 'a' + self.file_flags)
     else:
-      self.csv_file = open(self.filename, 'w')
+      self.csv_file = open(self.filename, 'w' + self.file_flags)
 
   def on_epoch_end(self, epoch, logs=None):
     logs = logs or {}
 
     def handle_value(k):
       is_zero_dim_ndarray = isinstance(k, np.ndarray) and k.ndim == 0
-      if isinstance(k, Iterable) and not is_zero_dim_ndarray:
+      if isinstance(k, six.string_types):
+        return k
+      elif isinstance(k, Iterable) and not is_zero_dim_ndarray:
         return '"[%s]"' % (', '.join(map(str, k)))
       else:
         return k
@@ -936,6 +1003,7 @@ class LambdaCallback(Callback):
   This callback is constructed with anonymous functions that will be called
   at the appropriate time. Note that the callbacks expects positional
   arguments, as:
+
    - `on_epoch_begin` and `on_epoch_end` expect two positional arguments:
       `epoch`, `logs`
    - `on_batch_begin` and `on_batch_end` expect two positional arguments:
diff --git a/tensorflow/contrib/keras/python/keras/callbacks_test.py b/tensorflow/contrib/keras/python/keras/callbacks_test.py
index 412f736e16..15a7304b60 100644
--- a/tensorflow/contrib/keras/python/keras/callbacks_test.py
+++ b/tensorflow/contrib/keras/python/keras/callbacks_test.py
@@ -436,6 +436,35 @@ class KerasCallbacksTest(test.TestCase):
 
       os.remove(filepath)
 
+  def test_TerminateOnNaN(self):
+    np.random.seed(1337)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+    cbks = [keras.callbacks.TerminateOnNaN()]
+    model = keras.models.Sequential()
+    initializer = keras.initializers.Constant(value=1e5)
+    for _ in range(5):
+      model.add(keras.layers.Dense(2,
+                                   input_dim=INPUT_DIM,
+                                   activation='relu',
+                                   kernel_initializer=initializer))
+    model.add(keras.layers.Dense(NUM_CLASSES))
+    model.compile(loss='mean_squared_error',
+                  optimizer='rmsprop')
+
+    history = model.fit(x_train, y_train, batch_size=BATCH_SIZE,
+                        validation_data=(x_test, y_test),
+                        callbacks=cbks, epochs=20)
+    loss = history.history['loss']
+    assert len(loss) == 1
+    assert loss[0] == np.inf
+
   def test_TensorBoard(self):
     np.random.seed(1337)
 
@@ -479,7 +508,9 @@ class KerasCallbacksTest(test.TestCase):
           metrics=['accuracy'])
 
       tsb = keras.callbacks.TensorBoard(
-          log_dir=temp_dir, histogram_freq=1, write_images=True)
+          log_dir=temp_dir, histogram_freq=1, write_images=True,
+          write_grads=True, embeddings_freq=1,
+          embeddings_layer_names=['dense_1'], batch_size=5)
       cbks = [tsb]
 
       # fit with validation data
diff --git a/tensorflow/contrib/keras/python/keras/datasets/imdb.py b/tensorflow/contrib/keras/python/keras/datasets/imdb.py
index 5c087fe63f..04ab154f9f 100644
--- a/tensorflow/contrib/keras/python/keras/datasets/imdb.py
+++ b/tensorflow/contrib/keras/python/keras/datasets/imdb.py
@@ -118,7 +118,7 @@ def load_data(path='imdb.npz',
     for x in xs:
       nx = []
       for w in x:
-        if w >= num_words or w < skip_top:
+        if skip_top <= w < num_words:
           nx.append(w)
       new_xs.append(nx)
     xs = new_xs
diff --git a/tensorflow/contrib/keras/python/keras/datasets/reuters.py b/tensorflow/contrib/keras/python/keras/datasets/reuters.py
index b1c22fee63..2904eb5bf6 100644
--- a/tensorflow/contrib/keras/python/keras/datasets/reuters.py
+++ b/tensorflow/contrib/keras/python/keras/datasets/reuters.py
@@ -104,7 +104,7 @@ def load_data(path='reuters.npz',
     for x in xs:
       nx = []
       for w in x:
-        if w >= num_words or w < skip_top:
+        if skip_top <= w < num_words:
           nx.append(w)
       new_xs.append(nx)
     xs = new_xs
diff --git a/tensorflow/contrib/keras/python/keras/engine/topology.py b/tensorflow/contrib/keras/python/keras/engine/topology.py
index 8bc0c412b5..7561ef78f3 100644
--- a/tensorflow/contrib/keras/python/keras/engine/topology.py
+++ b/tensorflow/contrib/keras/python/keras/engine/topology.py
@@ -23,7 +23,6 @@ import copy
 import json
 import os
 import re
-import warnings
 
 import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
@@ -35,6 +34,7 @@ from tensorflow.contrib.keras.python.keras.utils.layer_utils import print_summar
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import base as tf_base_layers
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_inspect
 
 
@@ -50,43 +50,7 @@ except ImportError:
   yaml = None
 # pylint: enable=g-import-not-at-top
 
-
-class InputSpec(object):
-  """Specifies the ndim, dtype and shape of every input to a layer.
-
-  Every layer should expose (if appropriate) an `input_spec` attribute:
-  a list of instances of InputSpec (one per input tensor).
-
-  A None entry in a shape is compatible with any dimension,
-  a None shape is compatible with any shape.
-
-  Arguments:
-      dtype: Expected datatype of the input.
-      shape: Shape tuple, expected shape of the input
-          (may include None for unchecked axes).
-      ndim: Integer, expected rank of the input.
-      max_ndim: Integer, maximum rank of the input.
-      min_ndim: Integer, minimum rank of the input.
-      axes: Dictionary mapping integer axes to
-          a specific dimension value.
-  """
-
-  def __init__(self,
-               dtype=None,
-               shape=None,
-               ndim=None,
-               max_ndim=None,
-               min_ndim=None,
-               axes=None):
-    self.dtype = dtype
-    self.shape = shape
-    if shape is not None:
-      self.ndim = len(shape)
-    else:
-      self.ndim = ndim
-    self.max_ndim = max_ndim
-    self.min_ndim = min_ndim
-    self.axes = axes or {}
+InputSpec = tf_base_layers.InputSpec  # pylint: disable=invalid-name
 
 
 class Node(object):
@@ -435,6 +399,20 @@ class Layer(tf_base_layers.Layer):
     with K.name_scope(self.name):
       output_mask = self.compute_mask(inputs, previous_mask)
 
+    # If the layer returns tensors from its inputs, unmodified,
+    # we copy them to avoid loss of tensor metadata.
+    output_ls = _to_list(output)
+    inputs_ls = _to_list(inputs)
+    output_ls_copy = []
+    for x in output_ls:
+      if x in inputs_ls:
+        x = K.identity(x)
+      output_ls_copy.append(x)
+    if len(output_ls_copy) == 1:
+      output = output_ls_copy[0]
+    else:
+      output = output_ls_copy
+
     # Add an inbound node to the layer, so that it keeps track
     # of the call and of all new variables created during the call.
     # This also updates the layer history of the output tensor(s).
@@ -1085,7 +1063,7 @@ def Input(  # pylint: disable=invalid-name
   attributes that allow us to build a Keras model
   just by knowing the inputs and outputs of the model.
 
-  For instance, if a, b and c and Keras tensors,
+  For instance, if a, b and c are Keras tensors,
   it becomes possible to do:
   `model = Model(input=[a, b], output=c)`
 
@@ -1259,16 +1237,16 @@ class Container(Layer):
       if len(layer.inbound_nodes) > 1 or (
           layer.inbound_nodes and layer.inbound_nodes[0].inbound_layers):
         cls_name = self.__class__.__name__
-        warnings.warn(cls_name + ' inputs must come from '
-                      'a Keras Input layer, '
-                      'they cannot be the output of '
-                      'a previous non-Input layer. '
-                      'Here, a tensor specified as '
-                      'input to "' + self.name + '" was not an Input tensor, '
-                      'it was generated by layer ' + layer.name + '.\n'
-                      'Note that input tensors are '
-                      'instantiated via `tensor = Input(shape)`.\n'
-                      'The tensor that caused the issue was: ' + str(x.name))
+        logging.warning(cls_name + ' inputs must come from '
+                        'a Keras Input layer, '
+                        'they cannot be the output of '
+                        'a previous non-Input layer. '
+                        'Here, a tensor specified as '
+                        'input to "' + self.name + '" was not an Input tensor, '
+                        'it was generated by layer ' + layer.name + '.\n'
+                        'Note that input tensors are '
+                        'instantiated via `tensor = Input(shape)`.\n'
+                        'The tensor that caused the issue was: ' + str(x.name))
     for x in self.outputs:
       if not hasattr(x, '_keras_history'):
         cls_name = self.__class__.__name__
@@ -1338,76 +1316,96 @@ class Container(Layer):
     nodes_depths = {}  # dict {node: depth value}
     layers_depths = {}  # dict {layer: depth value}
     layer_indices = {}  # dict {layer: index in traversal}
-
-    def make_node_marker(node, depth):
-      return str(id(node)) + '-' + str(depth)
+    nodes_in_decreasing_depth = []
 
     def build_map_of_graph(tensor,
-                           seen_nodes=None,
-                           depth=0,
+                           finished_nodes,
+                           nodes_in_progress,
                            layer=None,
                            node_index=None,
                            tensor_index=None):
       """Builds a map of the graph of layers.
 
-      This recursively updates the maps `nodes_depths`,
-      `layers_depths` and the set `container_nodes`.
-
-      Does not try to detect cycles in the graph.
+      This recursively updates the map `layer_indices`,
+      the list `nodes_in_decreasing_depth` and the set `container_nodes`.
 
       Arguments:
           tensor: Some tensor in a graph.
-          seen_nodes: Set of node ids ("{layer.name}_ib-{node_index}")
-              of nodes seen so far. Useful to prevent infinite loops.
-          depth: Current depth in the graph (0 = last output).
+          finished_nodes: Set of nodes whose subgraphs have been traversed
+              completely. Useful to prevent duplicated work.
+          nodes_in_progress: Set of nodes that are currently active on the
+              recursion stack. Useful to detect cycles.
           layer: Layer from which `tensor` comes from. If not provided,
               will be obtained from `tensor._keras_history`.
           node_index: Node index from which `tensor` comes from.
           tensor_index: Tensor_index from which `tensor` comes from.
+
+      Raises:
+          RuntimeError: if a cycle is detected.
       """
-      seen_nodes = seen_nodes or set()
       if not layer or node_index is None or tensor_index is None:
         layer, node_index, tensor_index = tensor._keras_history
       node = layer.inbound_nodes[node_index]
 
       # Prevent cycles.
-      seen_nodes.add(make_node_marker(node, depth))
+      if node in nodes_in_progress:
+        raise RuntimeError('The tensor ' + str(tensor) + ' at layer "' +
+                           layer.name + '" is part of a cycle.')
+
+      # Don't repeat work for shared subgraphs
+      if node in finished_nodes:
+        return
 
       node_key = layer.name + '_ib-' + str(node_index)
       # Update container_nodes.
       container_nodes.add(node_key)
-      # Update nodes_depths.
-      node_depth = nodes_depths.get(node)
-      if node_depth is None:
-        nodes_depths[node] = depth
-      else:
-        nodes_depths[node] = max(depth, node_depth)
-      # Update layers_depths.
-      previously_seen_depth = layers_depths.get(layer)
-      if previously_seen_depth is None:
-        current_depth = depth
-      else:
-        current_depth = max(depth, previously_seen_depth)
-      layers_depths[layer] = current_depth
+
+      # Store the traversal order for layer sorting.
       if layer not in layer_indices:
         layer_indices[layer] = len(layer_indices)
 
+      nodes_in_progress.add(node)
+
       # Propagate to all previous tensors connected to this node.
       for i in range(len(node.inbound_layers)):
         x = node.input_tensors[i]
         layer = node.inbound_layers[i]
         node_index = node.node_indices[i]
         tensor_index = node.tensor_indices[i]
-        next_node = layer.inbound_nodes[node_index]
-        # use node_marker to prevent cycles
-        node_marker = make_node_marker(next_node, current_depth + 1)
-        if node_marker not in seen_nodes:
-          build_map_of_graph(x, seen_nodes, current_depth + 1, layer,
-                             node_index, tensor_index)
+        build_map_of_graph(x, finished_nodes, nodes_in_progress, layer,
+                           node_index, tensor_index)
+
+      finished_nodes.add(node)
+      nodes_in_progress.remove(node)
+
+      nodes_in_decreasing_depth.append(node)
 
+    finished_nodes = set()
+    nodes_in_progress = set()
     for x in self.outputs:
-      seen_nodes = set()
-      build_map_of_graph(x, seen_nodes, depth=0)
+      build_map_of_graph(x, finished_nodes, nodes_in_progress)
+
+    for node in reversed(nodes_in_decreasing_depth):
+      # If the depth is not set, the node has no outbound nodes (depth 0).
+      depth = nodes_depths.setdefault(node, 0)
+
+      # Update the depth of the corresponding layer
+      previous_depth = layers_depths.get(node.outbound_layer, 0)
+      # If we've seen this layer before at a higher depth,
+      # we should use that depth instead of the node depth.
+      # This is necessary for shared layers that have inputs at different
+      # depth levels in the graph.
+      depth = max(depth, previous_depth)
+      layers_depths[node.outbound_layer] = depth
+      nodes_depths[node] = depth
+
+      # Update the depth of inbound nodes.
+      for i in range(len(node.inbound_layers)):
+        inbound_layer = node.inbound_layers[i]
+        node_index = node.node_indices[i]
+        inbound_node = inbound_layer.inbound_nodes[node_index]
+        previous_depth = nodes_depths.get(inbound_node, 0)
+        nodes_depths[inbound_node] = max(depth + 1, previous_depth)
 
     # Build a dict {depth: list of nodes with this depth}
     nodes_by_depth = {}
@@ -2043,11 +2041,12 @@ class Container(Layer):
               json.dumps(node.arguments)
               kwargs = node.arguments
             except TypeError:
-              warnings.warn('Layer ' + layer.name +
-                            ' was passed non-serializable keyword arguments: ' +
-                            str(node.arguments) + '. They will not be included '
-                            'in the serialized model (and thus will be missing '
-                            'at deserialization time).')
+              logging.warning(
+                  'Layer ' + layer.name +
+                  ' was passed non-serializable keyword arguments: ' +
+                  str(node.arguments) + '. They will not be included '
+                  'in the serialized model (and thus will be missing '
+                  'at deserialization time).')
               kwargs = {}
           else:
             kwargs = {}
@@ -2527,6 +2526,21 @@ def preprocess_weights_for_loading(layer,
       A list of weights values (Numpy arrays).
   """
   if original_keras_version == '1':
+    if layer.__class__.__name__ == 'Bidirectional':
+      num_weights_per_layer = len(weights) // 2
+
+      forward_weights = preprocess_weights_for_loading(
+          layer.forward_layer, weights[:num_weights_per_layer],
+          original_keras_version, original_backend)
+      backward_weights = preprocess_weights_for_loading(
+          layer.backward_layer, weights[num_weights_per_layer:],
+          original_keras_version, original_backend)
+      weights = forward_weights + backward_weights
+
+    if layer.__class__.__name__ == 'TimeDistributed':
+      weights = preprocess_weights_for_loading(
+          layer.layer, weights, original_keras_version, original_backend)
+
     if layer.__class__.__name__ == 'Conv1D':
       shape = weights[0].shape
       # Handle Keras 1.1 format
@@ -2595,13 +2609,16 @@ def preprocess_weights_for_loading(layer,
           recurrent_kernel = np.transpose(recurrent_kernel, (2, 3, 1, 0))
         weights = [kernel, recurrent_kernel, bias]
 
-  if original_backend and K.backend() != original_backend:
-    conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose']
-    if layer.__class__.__name__ in conv_layers:
-      weights[0] = conv_utils.convert_kernel(weights[0])
-    if layer.__class__.__name__ == 'ConvLSTM2D':
+  conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
+  if layer.__class__.__name__ in conv_layers:
+    if original_backend and K.backend() != original_backend:
       weights[0] = conv_utils.convert_kernel(weights[0])
-      weights[1] = conv_utils.convert_kernel(weights[1])
+      if layer.__class__.__name__ == 'ConvLSTM2D':
+        weights[1] = conv_utils.convert_kernel(weights[1])
+    if K.int_shape(layer.weights[0]) != weights[0].shape:
+      weights[0] = np.transpose(weights[0], (3, 2, 0, 1))
+      if layer.__class__.__name__ == 'ConvLSTM2D':
+        weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
   return weights
 
 
diff --git a/tensorflow/contrib/keras/python/keras/engine/training.py b/tensorflow/contrib/keras/python/keras/engine/training.py
index 96d1c2f262..09459fd713 100644
--- a/tensorflow/contrib/keras/python/keras/engine/training.py
+++ b/tensorflow/contrib/keras/python/keras/engine/training.py
@@ -23,7 +23,6 @@ import copy
 import multiprocessing
 import threading
 import time
-import warnings
 
 import numpy as np
 import six
@@ -35,6 +34,7 @@ from tensorflow.contrib.keras.python.keras import metrics as metrics_module
 from tensorflow.contrib.keras.python.keras import optimizers
 from tensorflow.contrib.keras.python.keras.engine.topology import Container
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.platform import tf_logging as logging
 
 
 # pylint: disable=g-import-not-at-top
@@ -72,6 +72,8 @@ def _standardize_input_data(data,
   Raises:
       ValueError: in case of improperly formatted user-provided data.
   """
+  if not names:
+    return []
   if data is None:
     return [None for _ in range(len(names))]
   if isinstance(data, dict):
@@ -85,7 +87,7 @@ def _standardize_input_data(data,
     if len(data) != len(names):
       if data and hasattr(data[0], 'shape'):
         raise ValueError(
-            'Error when checking ' + exception_prefix +
+            'Error when checking model ' + exception_prefix +
             ': the list of Numpy arrays '
             'that you are passing to your model '
             'is not the size the model expected. '
@@ -96,7 +98,7 @@ def _standardize_input_data(data,
         if len(names) == 1:
           data = [np.asarray(data)]
         else:
-          raise ValueError('Error when checking ' + exception_prefix +
+          raise ValueError('Error when checking model ' + exception_prefix +
                            ': you are passing a list as '
                            'input to your model, '
                            'but the model expects '
@@ -106,15 +108,15 @@ def _standardize_input_data(data,
     arrays = data
   else:
     if not hasattr(data, 'shape'):
-      raise TypeError('Error when checking ' + exception_prefix +
+      raise TypeError('Error when checking model ' + exception_prefix +
                       ': data should be a Numpy array, '
                       'or list/dict of Numpy arrays. '
                       'Found: ' + str(data)[:200] + '...')
-    if len(names) != 1:
+    if len(names) > 1:
       # Case: model expects multiple inputs but only received
       # a single Numpy array.
-      raise ValueError('The model expects ' + str(len(names)) +
-                       ' input arrays, but only received one array. '
+      raise ValueError('The model expects %d %s arrays, but only received one '
+                       'array. ' % (len(names), exception_prefix) +
                        'Found: array with shape ' + str(data.shape))
     arrays = [data]
 
@@ -682,7 +684,8 @@ class Model(Container):
               loss,
               metrics=None,
               loss_weights=None,
-              sample_weight_mode=None):
+              sample_weight_mode=None,
+              **kwargs):
     """Configures the model for training.
 
     Arguments:
@@ -692,6 +695,8 @@ class Model(Container):
             See [losses](/losses).
             If the model has multiple outputs, you can use a different loss
             on each output by passing a dictionary or a list of losses.
+            The loss value that will be minimized by the model
+            will then be the sum of all individual losses.
         metrics: list of metrics to be evaluated by the model
             during training and testing.
             Typically you will use `metrics=['accuracy']`.
@@ -701,6 +706,9 @@ class Model(Container):
         loss_weights: Optional list or dictionary specifying scalar
             coefficients (Python floats) to weight the loss contributions
             of different model outputs.
+            The loss value that will be minimized by the model
+            will then be the *weighted sum* of all individual losses,
+            weighted by the `loss_weights` coefficients.
             If a list, it is expected to have a 1:1 mapping
             to the model's outputs. If a tensor, it is expected to map
             output names (strings) to scalar coefficients.
@@ -710,6 +718,7 @@ class Model(Container):
             If the model has multiple outputs, you can use a different
             `sample_weight_mode` on each output by passing a
             dictionary or a list of modes.
+        **kwargs: Additional arguments passed to `tf.Session.run`.
 
     Raises:
         ValueError: In case of invalid arguments for
@@ -733,7 +742,7 @@ class Model(Container):
       loss_functions = []
       for name in self.output_names:
         if name not in loss:
-          warnings.warn(
+          logging.warning(
               'Output "' + name + '" missing from loss dictionary. '
               'We assume this was done on purpose, '
               'and we will not be expecting '
@@ -975,6 +984,7 @@ class Model(Container):
     self.train_function = None
     self.test_function = None
     self.predict_function = None
+    self._function_kwargs = kwargs
 
     # Collected trainable weights and sort them deterministically.
     trainable_weights = self.trainable_weights
@@ -997,7 +1007,10 @@ class Model(Container):
       updates = self.updates + training_updates
       # Gets loss and metrics. Updates weights at each call.
       self.train_function = K.function(
-          inputs, [self.total_loss] + self.metrics_tensors, updates=updates)
+          inputs, [self.total_loss] + self.metrics_tensors,
+          updates=updates,
+          name='train_function',
+          **self._function_kwargs)
 
   def _make_test_function(self):
     if not hasattr(self, 'test_function'):
@@ -1011,11 +1024,14 @@ class Model(Container):
       # Does update the network states.
       self.test_function = K.function(
           inputs, [self.total_loss] + self.metrics_tensors,
-          updates=self.state_updates)
+          updates=self.state_updates,
+          name='test_function',
+          **self._function_kwargs)
 
   def _make_predict_function(self):
     if not hasattr(self, 'predict_function'):
       self.predict_function = None
+      self._function_kwargs = {}
     if self.predict_function is None:
       if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
         inputs = self._feed_inputs + [K.learning_phase()]
@@ -1024,7 +1040,11 @@ class Model(Container):
       # Gets network outputs. Does not update weights.
       # Does update the network states.
       self.predict_function = K.function(
-          inputs, self.outputs, updates=self.state_updates)
+          inputs,
+          self.outputs,
+          updates=self.state_updates,
+          name='predict_function',
+          **self._function_kwargs)
 
   def _fit_loop(self,
                 f,
@@ -1124,7 +1144,7 @@ class Model(Container):
         batch_ids = index_array[batch_start:batch_end]
         try:
           if isinstance(ins[-1], float):
-            # do not slice the training phase flag
+            # Do not slice the training phase flag.
             ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
           else:
             ins_batch = _slice_arrays(ins, batch_ids)
@@ -1143,16 +1163,16 @@ class Model(Container):
           batch_logs[l] = o
 
         callbacks.on_batch_end(batch_index, batch_logs)
+        if callback_model.stop_training:
+          break
 
-        if batch_index == len(batches) - 1:  # last batch
-          # validation
+        if batch_index == len(batches) - 1:  # Last batch.
           if do_validation:
-            # replace with self._evaluate
             val_outs = self._test_loop(
                 val_f, val_ins, batch_size=batch_size, verbose=0)
             if not isinstance(val_outs, list):
               val_outs = [val_outs]
-            # same labels assumed
+            # Same labels assumed.
             for l, o in zip(out_labels, val_outs):
               epoch_logs['val_' + l] = o
       callbacks.on_epoch_end(epoch, epoch_logs)
@@ -1192,7 +1212,7 @@ class Model(Container):
     for batch_index, (batch_start, batch_end) in enumerate(batches):
       batch_ids = index_array[batch_start:batch_end]
       if ins and isinstance(ins[-1], float):
-        # do not slice the training phase flag
+        # Do not slice the training phase flag.
         ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
       else:
         ins_batch = _slice_arrays(ins, batch_ids)
@@ -1246,7 +1266,7 @@ class Model(Container):
     for batch_index, (batch_start, batch_end) in enumerate(batches):
       batch_ids = index_array[batch_start:batch_end]
       if isinstance(ins[-1], float):
-        # do not slice the training phase flag
+        # Do not slice the training phase flag.
         ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
       else:
         ins_batch = _slice_arrays(ins, batch_ids)
@@ -1297,13 +1317,13 @@ class Model(Container):
         self._feed_input_names,
         self._feed_input_shapes,
         check_batch_axis=False,
-        exception_prefix='model input')
+        exception_prefix='input')
     y = _standardize_input_data(
         y,
         self._feed_output_names,
         output_shapes,
         check_batch_axis=False,
-        exception_prefix='model target')
+        exception_prefix='target')
     sample_weights = _standardize_sample_weights(sample_weight,
                                                  self._feed_output_names)
     class_weights = _standardize_class_weights(class_weight,
@@ -1325,6 +1345,20 @@ class Model(Container):
                          str(x[0].shape[0]) + ' samples')
     return x, y, sample_weights
 
+  def _get_deduped_metrics_names(self):
+    out_labels = self.metrics_names
+
+    # Rename duplicated metric names
+    # (can happen with an output layer shared among multiple dataflows).
+    deduped_out_labels = []
+    for i, label in enumerate(out_labels):
+      new_label = label
+      if out_labels.count(label) > 1:
+        dup_idx = out_labels[:i].count(label)
+        new_label += '_' + str(dup_idx + 1)
+      deduped_out_labels.append(new_label)
+    return deduped_out_labels
+
   def fit(self,
           x=None,
           y=None,
@@ -1354,7 +1388,7 @@ class Model(Container):
         batch_size: integer. Number of samples per gradient update.
         epochs: integer, the number of times to iterate
             over the training data arrays.
-            verbose: 0, 1, or 2. Verbosity mode.
+        verbose: 0, 1, or 2. Verbosity mode.
             0 = silent, 1 = verbose, 2 = one log line per epoch.
         callbacks: list of callbacks to be called during training.
             See [callbacks](/callbacks).
@@ -1396,7 +1430,7 @@ class Model(Container):
         ValueError: In case of mismatch between the provided input data
             and what the model expects.
     """
-    # validate user data
+    # Validate user data.
     x, y, sample_weights = self._standardize_user_data(
         x,
         y,
@@ -1404,7 +1438,7 @@ class Model(Container):
         class_weight=class_weight,
         check_batch_axis=False,
         batch_size=batch_size)
-    # prepare validation data
+    # Prepare validation data.
     if validation_data:
       do_validation = True
       if len(validation_data) == 2:
@@ -1450,7 +1484,7 @@ class Model(Container):
       val_f = None
       val_ins = None
 
-    # prepare input arrays and training function
+    # Prepare input arrays and training function.
     if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
       ins = x + y + sample_weights + [1.]
     else:
@@ -1458,19 +1492,8 @@ class Model(Container):
     self._make_train_function()
     f = self.train_function
 
-    # prepare display labels
-    out_labels = self.metrics_names
-
-    # rename duplicated metrics name
-    # (can happen with an output layer shared among multiple dataflows)
-    deduped_out_labels = []
-    for i, label in enumerate(out_labels):
-      new_label = label
-      if out_labels.count(label) > 1:
-        dup_idx = out_labels[:i].count(label)
-        new_label += '_' + str(dup_idx + 1)
-      deduped_out_labels.append(new_label)
-    out_labels = deduped_out_labels
+    # Prepare display labels.
+    out_labels = self._get_deduped_metrics_names()
 
     if do_validation:
       callback_metrics = copy.copy(out_labels) + [
@@ -1479,7 +1502,7 @@ class Model(Container):
     else:
       callback_metrics = copy.copy(out_labels)
 
-    # delegate logic to _fit_loop
+    # Delegate logic to `_fit_loop`.
     return self._fit_loop(
         f,
         ins,
@@ -1521,14 +1544,14 @@ class Model(Container):
         and/or metrics). The attribute `model.metrics_names` will give you
         the display labels for the scalar outputs.
     """
-    # validate user data
+    # Validate user data.
     x, y, sample_weights = self._standardize_user_data(
         x,
         y,
         sample_weight=sample_weight,
         check_batch_axis=False,
         batch_size=batch_size)
-    # prepare inputs, delegate logic to _test_loop
+    # Prepare inputs, delegate logic to `_test_loop`.
     if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
       ins = x + y + sample_weights + [0.]
     else:
@@ -1557,7 +1580,7 @@ class Model(Container):
             or in case a stateful model receives a number of samples
             that is not a multiple of the batch size.
     """
-    # validate user data
+    # Validate user data.
     x = _standardize_input_data(
         x,
         self._feed_input_names,
@@ -1572,7 +1595,7 @@ class Model(Container):
                          str(x[0].shape[0]) + ' samples. '
                          'Batch size: ' + str(batch_size) + '.')
 
-    # prepare inputs, delegate logic to _predict_loop
+    # Prepare inputs, delegate logic to `_predict_loop`.
     if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
       ins = x + [0.]
     else:
@@ -1720,7 +1743,7 @@ class Model(Container):
             All arrays should contain the same number of samples.
             The generator is expected to loop over its data
             indefinitely. An epoch finishes when `steps_per_epoch`
-            samples have been seen by the model.
+            batches have been seen by the model.
         steps_per_epoch: Total number of steps (batches of samples)
             to yield from `generator` before declaring one epoch
             finished and starting the next epoch. It should typically
@@ -1792,7 +1815,8 @@ class Model(Container):
                        'you must specify a value for '
                        '`validation_steps`.')
 
-    out_labels = self.metrics_names
+    # Prepare display labels.
+    out_labels = self._get_deduped_metrics_names()
     callback_metrics = out_labels + ['val_' + n for n in out_labels]
 
     # prepare callbacks
@@ -1829,8 +1853,11 @@ class Model(Container):
                          'or `(val_x, val_y)`. Found: ' + str(validation_data))
       val_x, val_y, val_sample_weights = self._standardize_user_data(
           val_x, val_y, val_sample_weight)
+      val_data = val_x + val_y + val_sample_weights
+      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+        val_data += [0.]
       for cbk in callbacks:
-        cbk.validation_data = val_x + [val_y, val_sample_weights]
+        cbk.validation_data = val_data
     enqueuer = None
 
     try:
diff --git a/tensorflow/contrib/keras/python/keras/integration_test.py b/tensorflow/contrib/keras/python/keras/integration_test.py
index f25e8eeaac..bcd844201c 100644
--- a/tensorflow/contrib/keras/python/keras/integration_test.py
+++ b/tensorflow/contrib/keras/python/keras/integration_test.py
@@ -80,7 +80,7 @@ class KerasIntegrationTest(test.TestCase):
 
   def test_temporal_classification_declarative(self):
     with self.test_session():
-      np.random.seed(1337)
+      np.random.seed(1336)
       (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
           train_samples=200,
           test_samples=100,
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py
index 30325b7148..9ab2e72bf1 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py
@@ -124,9 +124,12 @@ class ConvRecurrent2D(Recurrent):
     self.return_sequences = return_sequences
     self.go_backwards = go_backwards
     self.stateful = stateful
-    self.input_spec = InputSpec(ndim=5)
+    self.input_spec = [InputSpec(ndim=5)]
+    self.state_spec = None
 
   def _compute_output_shape(self, input_shape):
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       rows = input_shape[3]
@@ -344,11 +347,14 @@ class ConvLSTM2D(ConvRecurrent2D):
 
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
+    self.state_spec = [InputSpec(ndim=4), InputSpec(ndim=4)]
 
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    # TODO(fchollet): better handling of input spec
-    self.input_spec = InputSpec(shape=input_shape)
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+    batch_size = input_shape[0] if self.stateful else None
+    self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:])
 
     if self.stateful:
       self.reset_states()
@@ -364,6 +370,13 @@ class ConvLSTM2D(ConvRecurrent2D):
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
     input_dim = input_shape[channel_axis]
+    state_shape = [None] * 4
+    state_shape[channel_axis] = input_dim
+    state_shape = tuple(state_shape)
+    self.state_spec = [
+        InputSpec(shape=state_shape),
+        InputSpec(shape=state_shape)
+    ]
     kernel_shape = self.kernel_size + (input_dim, self.filters * 4)
     self.kernel_shape = kernel_shape
     recurrent_kernel_shape = self.kernel_size + (self.filters, self.filters * 4)
@@ -417,7 +430,7 @@ class ConvLSTM2D(ConvRecurrent2D):
       self.bias_o = None
     self.built = True
 
-  def get_initial_states(self, inputs):
+  def get_initial_state(self, inputs):
     # (samples, timesteps, rows, cols, filters)
     initial_state = K.zeros_like(inputs)
     # (samples, rows, cols, filters)
@@ -433,8 +446,9 @@ class ConvLSTM2D(ConvRecurrent2D):
   def reset_states(self):
     if not self.stateful:
       raise RuntimeError('Layer must be stateful.')
-    input_shape = self.input_spec.shape
+    input_shape = self.input_spec[0].shape
     output_shape = self._compute_output_shape(input_shape)
+
     if not input_shape[0]:
       raise ValueError('If a RNN is stateful, a complete '
                        'input_shape must be provided '
@@ -453,8 +467,8 @@ class ConvLSTM2D(ConvRecurrent2D):
                   np.zeros((input_shape[0], out_row, out_col, out_filter)))
     else:
       self.states = [
-          K.zeros((input_shape[0], out_row, out_col, out_filter)), K.zeros(
-              (input_shape[0], out_row, out_col, out_filter))
+          K.zeros((input_shape[0], out_row, out_col, out_filter)),
+          K.zeros((input_shape[0], out_row, out_col, out_filter))
       ]
 
   def get_constants(self, inputs, training=None):
diff --git a/tensorflow/contrib/keras/python/keras/layers/lstm_test.py b/tensorflow/contrib/keras/python/keras/layers/lstm_test.py
index 0e1d148bd8..90bf95a781 100644
--- a/tensorflow/contrib/keras/python/keras/layers/lstm_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/lstm_test.py
@@ -189,6 +189,109 @@ class LSTMLayerTest(test.TestCase):
       l2 = layer_class.from_config(l1.get_config())
       assert l1.get_config() == l2.get_config()
 
+  def test_specify_initial_state_keras_tensor(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    with self.test_session():
+      # Test with Keras tensor
+      inputs = keras.Input((timesteps, embedding_dim))
+      initial_state = [keras.Input((units,)) for _ in range(num_states)]
+      layer = keras.layers.LSTM(units)
+      if len(initial_state) == 1:
+        output = layer(inputs, initial_state=initial_state[0])
+      else:
+        output = layer(inputs, initial_state=initial_state)
+      assert initial_state[0] in layer.inbound_nodes[0].input_tensors
+
+      model = keras.models.Model([inputs] + initial_state, output)
+      model.compile(loss='categorical_crossentropy', optimizer='adam')
+
+      inputs = np.random.random((num_samples, timesteps, embedding_dim))
+      initial_state = [np.random.random((num_samples, units))
+                       for _ in range(num_states)]
+      targets = np.random.random((num_samples, units))
+      model.train_on_batch([inputs] + initial_state, targets)
+
+  def test_specify_initial_state_non_keras_tensor(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    with self.test_session():
+      # Test with non-Keras tensor
+      inputs = keras.Input((timesteps, embedding_dim))
+      initial_state = [keras.backend.random_normal_variable(
+          (num_samples, units), 0, 1)
+                       for _ in range(num_states)]
+      layer = keras.layers.LSTM(units)
+      output = layer(inputs, initial_state=initial_state)
+
+      model = keras.models.Model(inputs, output)
+      model.compile(loss='categorical_crossentropy', optimizer='adam')
+
+      inputs = np.random.random((num_samples, timesteps, embedding_dim))
+      targets = np.random.random((num_samples, units))
+      model.train_on_batch(inputs, targets)
+
+  def test_reset_states_with_values(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    with self.test_session():
+      layer = keras.layers.LSTM(units, stateful=True)
+      layer.build((num_samples, timesteps, embedding_dim))
+      layer.reset_states()
+      assert len(layer.states) == num_states
+      assert layer.states[0] is not None
+      np.testing.assert_allclose(
+          keras.backend.eval(layer.states[0]),
+          np.zeros(keras.backend.int_shape(layer.states[0])),
+          atol=1e-4)
+      state_shapes = [keras.backend.int_shape(state) for state in layer.states]
+      values = [np.ones(shape) for shape in state_shapes]
+      if len(values) == 1:
+        values = values[0]
+      layer.reset_states(values)
+      np.testing.assert_allclose(
+          keras.backend.eval(layer.states[0]),
+          np.ones(keras.backend.int_shape(layer.states[0])),
+          atol=1e-4)
+
+      # Test with invalid data
+      with self.assertRaises(ValueError):
+        layer.reset_states([1] * (len(layer.states) + 1))
+
+  def test_specify_state_with_masking(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    with self.test_session():
+      inputs = keras.Input((timesteps, embedding_dim))
+      _ = keras.layers.Masking()(inputs)
+      initial_state = [keras.Input((units,)) for _ in range(num_states)]
+      output = keras.layers.LSTM(units)(inputs, initial_state=initial_state)
+
+      model = keras.models.Model([inputs] + initial_state, output)
+      model.compile(loss='categorical_crossentropy', optimizer='adam')
+
+      inputs = np.random.random((num_samples, timesteps, embedding_dim))
+      initial_state = [np.random.random((num_samples, units))
+                       for _ in range(num_states)]
+      targets = np.random.random((num_samples, units))
+      model.train_on_batch([inputs] + initial_state, targets)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/merge.py b/tensorflow/contrib/keras/python/keras/layers/merge.py
index b4bb9935fd..84c03fdebd 100644
--- a/tensorflow/contrib/keras/python/keras/layers/merge.py
+++ b/tensorflow/contrib/keras/python/keras/layers/merge.py
@@ -139,7 +139,8 @@ class _Merge(Layer):
             batch_size = x_shape[0]
             new_shape = K.concatenate([x_shape[1:], K.expand_dims(batch_size)])
             x_transposed = K.reshape(x,
-                                     K.stack([batch_size, K.prod(x_shape[1:])]))
+                                     K.stack([batch_size,
+                                              K.prod(x_shape[1:])]))
             x_transposed = K.permute_dimensions(x_transposed, (1, 0))
             x_transposed = K.reshape(x_transposed, new_shape)
             reshaped_inputs.append(x_transposed)
diff --git a/tensorflow/contrib/keras/python/keras/layers/recurrent.py b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
index e608921add..5e8c23ed3e 100644
--- a/tensorflow/contrib/keras/python/keras/layers/recurrent.py
+++ b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
@@ -197,11 +197,16 @@ class Recurrent(Layer):
       To reset the states of your model, call `.reset_states()` on either
       a specific layer, or on your entire model.
 
-  # Note on specifying initial states in RNNs
-      You can specify the initial state of RNN layers by calling them with
-      the keyword argument `initial_state`. The value of `initial_state`
-      should be a tensor or list of tensors representing the initial state
-      of the RNN layer.
+  # Note on specifying the initial state of RNNs
+      You can specify the initial state of RNN layers symbolically by
+      calling them with the keyword argument `initial_state`. The value of
+      `initial_state` should be a tensor or list of tensors representing
+      the initial state of the RNN layer.
+
+      You can specify the initial state of RNN layers numerically by
+      calling `reset_states` with the keyword argument `states`. The value of
+      `states` should be a numpy array or list of numpy arrays representing
+      the initial state of the RNN layer.
   """
 
   def __init__(self,
@@ -218,7 +223,7 @@ class Recurrent(Layer):
     self.unroll = unroll
     self.implementation = implementation
     self.supports_masking = True
-    self.input_spec = InputSpec(ndim=3)
+    self.input_spec = [InputSpec(ndim=3)]
     self.state_spec = None
     self.dropout = 0
     self.recurrent_dropout = 0
@@ -235,6 +240,8 @@ class Recurrent(Layer):
 
   def compute_mask(self, inputs, mask):
     if self.return_sequences:
+      if isinstance(mask, list):
+        return mask[0]
       return mask
     else:
       return None
@@ -245,15 +252,15 @@ class Recurrent(Layer):
   def get_constants(self, inputs, training=None):
     return []
 
-  def get_initial_states(self, inputs):
+  def get_initial_state(self, inputs):
     # build an all-zero tensor of shape (samples, output_dim)
     initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
     initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
     initial_state = K.expand_dims(initial_state)  # (samples, 1)
     initial_state = K.tile(initial_state, [1,
                                            self.units])  # (samples, output_dim)
-    initial_states = [initial_state for _ in range(len(self.states))]
-    return initial_states
+    initial_state = [initial_state for _ in range(len(self.states))]
+    return initial_state
 
   def preprocess_input(self, inputs, training=None):
     return inputs
@@ -263,50 +270,62 @@ class Recurrent(Layer):
     # and if it a Keras tensor,
     # then add it to the inputs and temporarily
     # modify the input spec to include the state.
-    if initial_state is not None:
-      if hasattr(initial_state, '_keras_history'):
-        # Compute the full input spec, including state
-        input_spec = self.input_spec
-        state_spec = self.state_spec
-        if not isinstance(state_spec, list):
-          state_spec = [state_spec]
-        self.input_spec = [input_spec] + state_spec
-
-        # Compute the full inputs, including state
-        if not isinstance(initial_state, (list, tuple)):
-          initial_state = [initial_state]
-        inputs = [inputs] + list(initial_state)
-
-        # Perform the call
-        output = super(Recurrent, self).__call__(inputs, **kwargs)
-
-        # Restore original input spec
-        self.input_spec = input_spec
-        return output
-      else:
-        kwargs['initial_state'] = initial_state
-    return super(Recurrent, self).__call__(inputs, **kwargs)
+    if initial_state is None:
+      return super(Recurrent, self).__call__(inputs, **kwargs)
+
+    if not isinstance(initial_state, (list, tuple)):
+      initial_state = [initial_state]
+
+    is_keras_tensor = hasattr(initial_state[0], '_keras_history')
+    for tensor in initial_state:
+      if hasattr(tensor, '_keras_history') != is_keras_tensor:
+        raise ValueError('The initial state of an RNN layer cannot be'
+                         ' specified with a mix of Keras tensors and'
+                         ' non-Keras tensors')
+
+    if is_keras_tensor:
+      # Compute the full input spec, including state
+      input_spec = self.input_spec
+      state_spec = self.state_spec
+      if not isinstance(input_spec, list):
+        input_spec = [input_spec]
+      if not isinstance(state_spec, list):
+        state_spec = [state_spec]
+      self.input_spec = input_spec + state_spec
+
+      # Compute the full inputs, including state
+      inputs = [inputs] + list(initial_state)
+
+      # Perform the call
+      output = super(Recurrent, self).__call__(inputs, **kwargs)
+
+      # Restore original input spec
+      self.input_spec = input_spec
+      return output
+    else:
+      kwargs['initial_state'] = initial_state
+      return super(Recurrent, self).__call__(inputs, **kwargs)
 
-  def call(self, inputs, mask=None, initial_state=None, training=None):
+  def call(self, inputs, mask=None, training=None, initial_state=None):
     # input shape: `(samples, time (padded with zeros), input_dim)`
     # note that the .build() method of subclasses MUST define
     # self.input_spec and self.state_spec with complete input shapes.
-    if initial_state is not None:
-      if not isinstance(initial_state, (list, tuple)):
-        initial_states = [initial_state]
-      else:
-        initial_states = list(initial_state)
     if isinstance(inputs, list):
-      initial_states = inputs[1:]
+      initial_state = inputs[1:]
       inputs = inputs[0]
+    elif initial_state is not None:
+      pass
     elif self.stateful:
-      initial_states = self.states
+      initial_state = self.states
     else:
-      initial_states = self.get_initial_states(inputs)
+      initial_state = self.get_initial_state(inputs)
+
+    if isinstance(mask, list):
+      mask = mask[0]
 
-    if len(initial_states) != len(self.states):
+    if len(initial_state) != len(self.states):
       raise ValueError('Layer has ' + str(len(self.states)) +
-                       ' states but was passed ' + str(len(initial_states)) +
+                       ' states but was passed ' + str(len(initial_state)) +
                        ' initial states.')
     input_shape = K.int_shape(inputs)
     if self.unroll and input_shape[1] is None:
@@ -326,7 +345,7 @@ class Recurrent(Layer):
     last_output, outputs, states = K.rnn(
         self.step,
         preprocessed_input,
-        initial_states,
+        initial_state,
         go_backwards=self.go_backwards,
         mask=mask,
         constants=constants,
@@ -347,13 +366,10 @@ class Recurrent(Layer):
     else:
       return last_output
 
-  def reset_states(self, states_value=None):
+  def reset_states(self, states=None):
     if not self.stateful:
       raise AttributeError('Layer must be stateful.')
-    if not self.input_spec:
-      raise RuntimeError('Layer has never been called '
-                         'and thus has no states.')
-    batch_size = self.input_spec.shape[0]
+    batch_size = self.input_spec[0].shape[0]
     if not batch_size:
       raise ValueError('If a RNN is stateful, it needs to know '
                        'its batch size. Specify the batch size '
@@ -365,28 +381,27 @@ class Recurrent(Layer):
                        '- If using the functional API, specify '
                        'the time dimension by passing a '
                        '`batch_shape` argument to your Input layer.')
-    if states_value is not None:
-      if not isinstance(states_value, (list, tuple)):
-        states_value = [states_value]
-      if len(states_value) != len(self.states):
-        raise ValueError('The layer has ' + str(len(self.states)) +
-                         ' states, but the `states_value` '
-                         'argument passed '
-                         'only has ' + str(len(states_value)) + ' entries')
+    # Create the state variables (zero-filled) if they do not exist yet.
     if self.states[0] is None:
       self.states = [K.zeros((batch_size, self.units)) for _ in self.states]
-      if not states_value:
-        return
-    for i, state in enumerate(self.states):
-      if states_value:
-        value = states_value[i]
+    elif states is None:
+      for state in self.states:
+        K.set_value(state, np.zeros((batch_size, self.units)))
+    else:
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      if len(states) != len(self.states):
+        raise ValueError('Layer ' + self.name + ' expects ' +
+                         str(len(self.states)) + ' states, '
+                         'but it received ' + str(len(states)) +
+                         ' state values. Input received: ' + str(states))
+      for index, (value, state) in enumerate(zip(states, self.states)):
         if value.shape != (batch_size, self.units):
-          raise ValueError('Expected state #' + str(i) + ' to have shape ' +
-                           str((batch_size, self.units)) +
-                           ' but got array with shape ' + str(value.shape))
-      else:
-        value = np.zeros((batch_size, self.units))
-      K.set_value(state, value)
+          raise ValueError('State ' + str(index) +
+                           ' is incompatible with layer ' + self.name +
+                           ': expected shape=' + str((batch_size, self.units)) +
+                           ', found shape=' + str(value.shape))
+        K.set_value(state, value)
 
   def get_config(self):
     config = {
@@ -477,6 +492,7 @@ class SimpleRNN(Recurrent):
 
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
+    self.state_spec = InputSpec(shape=(None, self.units))
 
   def build(self, input_shape):
     if isinstance(input_shape, list):
@@ -485,8 +501,7 @@ class SimpleRNN(Recurrent):
 
     batch_size = input_shape[0] if self.stateful else None
     self.input_dim = input_shape[2]
-    self.input_spec = InputSpec(shape=(batch_size, None, self.input_dim))
-    self.state_spec = InputSpec(shape=(batch_size, self.units))
+    self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim))
 
     self.states = [None]
     if self.stateful:
@@ -707,16 +722,15 @@ class GRU(Recurrent):
 
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
+    self.state_spec = InputSpec(shape=(None, self.units))
 
   def build(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    self.input_spec = InputSpec(shape=input_shape)
     batch_size = input_shape[0] if self.stateful else None
     self.input_dim = input_shape[2]
-    self.input_spec = InputSpec(shape=(batch_size, None, self.input_dim))
-    self.state_spec = InputSpec(shape=(batch_size, self.units))
+    self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim))
 
     self.states = [None]
     if self.stateful:
@@ -1020,19 +1034,18 @@ class LSTM(Recurrent):
 
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
+    self.state_spec = [
+        InputSpec(shape=(None, self.units)),
+        InputSpec(shape=(None, self.units))
+    ]
 
   def build(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    self.input_spec = InputSpec(shape=input_shape)
     batch_size = input_shape[0] if self.stateful else None
     self.input_dim = input_shape[2]
-    self.input_spec = InputSpec(shape=(batch_size, None, self.input_dim))
-    self.state_spec = [
-        InputSpec(shape=(batch_size, self.units)), InputSpec(
-            shape=(batch_size, self.units))
-    ]
+    self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim))
 
     self.states = [None, None]
     if self.stateful:
@@ -1052,16 +1065,22 @@ class LSTM(Recurrent):
         constraint=self.recurrent_constraint)
 
     if self.use_bias:
+      if self.unit_forget_bias:
+
+        def bias_initializer(_, *args, **kwargs):
+          return K.concatenate([
+              self.bias_initializer((self.units,), *args, **kwargs),
+              initializers.Ones()((self.units,), *args, **kwargs),
+              self.bias_initializer((self.units * 2,), *args, **kwargs),
+          ])
+      else:
+        bias_initializer = self.bias_initializer
       self.bias = self.add_weight(
           shape=(self.units * 4,),
           name='bias',
-          initializer=self.bias_initializer,
+          initializer=bias_initializer,
           regularizer=self.bias_regularizer,
           constraint=self.bias_constraint)
-      if self.unit_forget_bias:
-        bias_value = np.zeros((self.units * 4,))
-        bias_value[self.units:self.units * 2] = 1.
-        K.set_value(self.bias, bias_value)
     else:
       self.bias = None
 
diff --git a/tensorflow/contrib/keras/python/keras/layers/wrappers.py b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
index 092501cb11..dbc79fb193 100644
--- a/tensorflow/contrib/keras/python/keras/layers/wrappers.py
+++ b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
@@ -132,13 +132,18 @@ class TimeDistributed(Wrapper):
       model = Sequential()
       model.add(TimeDistributed(Dense(8), input_shape=(10, 16)))
       # now model.output_shape == (None, 10, 8)
+  ```
+
+  The output will then have shape `(32, 10, 8)`.
 
-      # subsequent layers: no need for input_shape
+  In subsequent layers, there is no need for the `input_shape`:
+
+  ```python
       model.add(TimeDistributed(Dense(32)))
       # now model.output_shape == (None, 10, 32)
   ```
 
-  The output will then have shape `(32, 10, 8)`.
+  The output will then have shape `(32, 10, 32)`.
 
   `TimeDistributed` can be used with arbitrary layers, not just `Dense`,
   for instance with a `Conv2D` layer:
@@ -186,12 +191,7 @@ class TimeDistributed(Wrapper):
         output = self.layer.call(x)
         return output, []
 
-      _, outputs, _ = K.rnn(
-          step,
-          inputs,
-          initial_states=[],
-          input_length=input_shape[1],
-          unroll=False)
+      _, outputs, _ = K.rnn(step, inputs, initial_states=[], unroll=False)
       y = outputs
     else:
       # No batch size specified, therefore the layer will be able
diff --git a/tensorflow/contrib/keras/python/keras/losses.py b/tensorflow/contrib/keras/python/keras/losses.py
index 54b8fa429d..777ec440ac 100644
--- a/tensorflow/contrib/keras/python/keras/losses.py
+++ b/tensorflow/contrib/keras/python/keras/losses.py
@@ -52,6 +52,20 @@ def hinge(y_true, y_pred):
   return K.mean(K.maximum(1. - y_true * y_pred, 0.), axis=-1)
 
 
+def categorical_hinge(y_true, y_pred):
+  pos = K.sum(y_true * y_pred, axis=-1)  # y_pred weighted by y_true
+  neg = K.max((1. - y_true) * y_pred, axis=-1)  # max y_pred under 1 - y_true
+  return K.maximum(neg - pos + 1., 0.)  # hinge: clamp (neg - pos + 1) at 0
+
+
+def logcosh(y_true, y_pred):
+  """Mean of log(cosh(y_pred - y_true)) over the last axis."""
+  # log(cosh(x)) == x + softplus(-2x) - log(2); avoids exp() overflow.
+  x = y_pred - y_true
+  ln2 = 0.6931471805599453  # log(2), kept as a Python float
+  return K.mean(x + K.softplus(-2. * x) - ln2, axis=-1)
+
+
 def categorical_crossentropy(y_true, y_pred):
   return K.categorical_crossentropy(y_pred, y_true)
 
diff --git a/tensorflow/contrib/keras/python/keras/losses_test.py b/tensorflow/contrib/keras/python/keras/losses_test.py
index fd4458cce2..6bdcc0b5ff 100644
--- a/tensorflow/contrib/keras/python/keras/losses_test.py
+++ b/tensorflow/contrib/keras/python/keras/losses_test.py
@@ -34,7 +34,9 @@ ALL_LOSSES = [keras.losses.mean_squared_error,
               keras.losses.binary_crossentropy,
               keras.losses.kullback_leibler_divergence,
               keras.losses.poisson,
-              keras.losses.cosine_proximity]
+              keras.losses.cosine_proximity,
+              keras.losses.logcosh,
+              keras.losses.categorical_hinge]
 
 
 class KerasLossesTest(test.TestCase):
@@ -73,6 +75,14 @@ class KerasLossesTest(test.TestCase):
     new_fn = keras.losses.deserialize(config)
     self.assertEqual(fn, new_fn)
 
+  def test_categorical_hinge(self):
+    y_pred = keras.backend.variable(np.array([[0.3, 0.2, 0.1],
+                                              [0.1, 0.2, 0.7]]))
+    y_true = keras.backend.variable(np.array([[0, 1, 0], [1, 0, 0]]))
+    expected_loss = ((0.3 - 0.2 + 1) + (0.7 - 0.1 + 1)) / 2.0
+    loss = keras.backend.eval(keras.losses.categorical_hinge(y_true, y_pred))
+    self.assertAllClose(expected_loss, np.mean(loss))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/keras/python/keras/metrics.py b/tensorflow/contrib/keras/python/keras/metrics.py
index 59d380f73b..93c8684f91 100644
--- a/tensorflow/contrib/keras/python/keras/metrics.py
+++ b/tensorflow/contrib/keras/python/keras/metrics.py
@@ -27,6 +27,7 @@ from tensorflow.contrib.keras.python.keras.losses import categorical_crossentrop
 from tensorflow.contrib.keras.python.keras.losses import cosine_proximity
 from tensorflow.contrib.keras.python.keras.losses import hinge
 from tensorflow.contrib.keras.python.keras.losses import kullback_leibler_divergence
+from tensorflow.contrib.keras.python.keras.losses import logcosh
 from tensorflow.contrib.keras.python.keras.losses import mean_absolute_error
 from tensorflow.contrib.keras.python.keras.losses import mean_absolute_percentage_error
 from tensorflow.contrib.keras.python.keras.losses import mean_squared_error
diff --git a/tensorflow/contrib/keras/python/keras/models.py b/tensorflow/contrib/keras/python/keras/models.py
index 1c041091fc..0ae373da3c 100644
--- a/tensorflow/contrib/keras/python/keras/models.py
+++ b/tensorflow/contrib/keras/python/keras/models.py
@@ -23,7 +23,6 @@ from __future__ import print_function
 import copy
 import json
 import os
-import warnings
 
 import numpy as np
 
@@ -36,6 +35,7 @@ from tensorflow.contrib.keras.python.keras.engine.topology import Layer
 from tensorflow.contrib.keras.python.keras.engine.training import Model
 from tensorflow.contrib.keras.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
 
 
 # pylint: disable=g-import-not-at-top
@@ -133,7 +133,7 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
 
   if include_optimizer and hasattr(model, 'optimizer'):
     if isinstance(model.optimizer, optimizers.TFOptimizer):
-      warnings.warn(
+      logging.warning(
           'TensorFlow optimizers do not '
           'make it possible to access '
           'optimizer attributes or optimizer state '
@@ -189,7 +189,7 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
   f.close()
 
 
-def load_model(filepath, custom_objects=None):
+def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=redefined-builtin
   """Loads a model saved via `save_model`.
 
   Arguments:
@@ -197,12 +197,16 @@ def load_model(filepath, custom_objects=None):
       custom_objects: Optional dictionary mapping names
           (strings) to custom classes or functions to be
           considered during deserialization.
+      compile: Boolean, whether to compile the model
+          after loading.
 
   Returns:
       A Keras model instance. If an optimizer was found
       as part of the saved model, the model is already
       compiled. Otherwise, the model is uncompiled and
-      a warning will be displayed.
+      a warning will be displayed. When `compile` is set
+      to False, the compilation is omitted without any
+      warning.
 
   Raises:
       ImportError: if h5py is not available.
@@ -264,11 +268,16 @@ def load_model(filepath, custom_objects=None):
   # set weights
   topology.load_weights_from_hdf5_group(f['model_weights'], model.layers)
 
+  # Early return if compilation is not required.
+  if not compile:
+    f.close()
+    return model
+
   # instantiate optimizer
   training_config = f.attrs.get('training_config')
   if training_config is None:
-    warnings.warn('No training configuration found in save file: '
-                  'the model was *not* compiled. Compile it manually.')
+    logging.warning('No training configuration found in save file: '
+                    'the model was *not* compiled. Compile it manually.')
     f.close()
     return model
   training_config = json.loads(training_config.decode('utf-8'))
@@ -320,9 +329,12 @@ def model_from_config(config, custom_objects=None):
 
   Returns:
       A Keras model instance (uncompiled).
+
+  Raises:
+      TypeError: if `config` is a list instead of a dictionary.
   """
   if isinstance(config, list):
-    raise TypeError('`model_fom_config` expects a dictionary, not a list. '
+    raise TypeError('`model_from_config` expects a dictionary, not a list. '
                     'Maybe you meant to use '
                     '`Sequential.from_config(config)`?')
   return layer_module.deserialize(config, custom_objects=custom_objects)
@@ -730,7 +742,7 @@ class Sequential(Model):
         optimizer: str (name of optimizer) or optimizer object.
             See [optimizers](/optimizers).
         loss: str (name of objective function) or objective function.
-            See [objectives](/objectives).
+            See [losses](/losses).
         metrics: list of metrics to be evaluated by the model
             during training and testing.
             Typically you will use `metrics=['accuracy']`.
@@ -739,7 +751,8 @@ class Sequential(Model):
             sample weighting (2D weights), set this to "temporal".
             "None" defaults to sample-wise weights (1D).
         **kwargs: for Theano backend, these are passed into K.function.
-            Ignored for Tensorflow backend.
+            When using the Tensorflow backend, these are passed into
+            `tf.Session.run`.
 
     Example:
         ```python
@@ -762,11 +775,14 @@ class Sequential(Model):
         **kwargs)
     self.optimizer = self.model.optimizer
     self.loss = self.model.loss
+    self.total_loss = self.model.total_loss
     self.loss_weights = self.model.loss_weights
     self.metrics = self.model.metrics
     self.metrics_tensors = self.model.metrics_tensors
     self.metrics_names = self.model.metrics_names
     self.sample_weight_mode = self.model.sample_weight_mode
+    self.sample_weights = self.model.sample_weights
+    self.targets = self.model.targets
 
   def fit(self,
           x,
@@ -966,10 +982,10 @@ class Sequential(Model):
     """
     preds = self.predict(x, batch_size, verbose)
     if preds.min() < 0. or preds.max() > 1.:
-      warnings.warn('Network returning invalid probability values. '
-                    'The last layer might not normalize predictions '
-                    'into probabilities '
-                    '(like softmax or sigmoid would).')
+      logging.warning('Network returning invalid probability values. '
+                      'The last layer might not normalize predictions '
+                      'into probabilities '
+                      '(like softmax or sigmoid would).')
     return preds
 
   def predict_classes(self, x, batch_size=32, verbose=1):
@@ -1018,8 +1034,8 @@ class Sequential(Model):
             - a tuple (inputs, targets, sample_weights).
             All arrays should contain the same number of samples.
             The generator is expected to loop over its data
-            indefinitely. An epoch finishes when `samples_per_epoch`
-            samples have been seen by the model.
+            indefinitely. An epoch finishes when `steps_per_epoch`
+            batches have been seen by the model.
         steps_per_epoch: Total number of steps (batches of samples)
             to yield from `generator` before declaring one epoch
             finished and starting the next epoch. It should typically
@@ -1072,7 +1088,7 @@ class Sequential(Model):
                     f.close()
 
         model.fit_generator(generate_arrays_from_file('/my_file.txt'),
-                            samples_per_epoch=10000, epochs=10)
+                            steps_per_epoch=1000, epochs=10)
     ```
     """
     if self.model is None:
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/image.py b/tensorflow/contrib/keras/python/keras/preprocessing/image.py
index 8cceb441df..0d69396e8b 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/image.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/image.py
@@ -24,12 +24,12 @@ from __future__ import print_function
 import os
 import re
 import threading
-import warnings
 
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.python.platform import tf_logging as logging
 
 
 # pylint: disable=g-import-not-at-top
@@ -368,9 +368,9 @@ def load_img(path, grayscale=False, target_size=None):
     if img.mode != 'RGB':
       img = img.convert('RGB')
   if target_size:
-    wh_tuple = (target_size[1], target_size[0])
-    if img.size != wh_tuple:
-      img = img.resize(wh_tuple)
+    hw_tuple = (target_size[1], target_size[0])
+    if img.size != hw_tuple:
+      img = img.resize(hw_tuple)
   return img
 
 
@@ -391,6 +391,7 @@ class ImageDataGenerator(object):
       featurewise_std_normalization: divide inputs by std of the dataset.
       samplewise_std_normalization: divide each input by its std.
       zca_whitening: apply ZCA whitening.
+      zca_epsilon: epsilon for ZCA whitening. Default is 1e-6.
       rotation_range: degrees (0 to 180).
       width_shift_range: fraction of total width.
       height_shift_range: fraction of total height.
@@ -428,6 +429,7 @@ class ImageDataGenerator(object):
                featurewise_std_normalization=False,
                samplewise_std_normalization=False,
                zca_whitening=False,
+               zca_epsilon=1e-6,
                rotation_range=0.,
                width_shift_range=0.,
                height_shift_range=0.,
@@ -448,6 +450,7 @@ class ImageDataGenerator(object):
     self.featurewise_std_normalization = featurewise_std_normalization
     self.samplewise_std_normalization = samplewise_std_normalization
     self.zca_whitening = zca_whitening
+    self.zca_epsilon = zca_epsilon
     self.rotation_range = rotation_range
     self.width_shift_range = width_shift_range
     self.height_shift_range = height_shift_range
@@ -497,7 +500,7 @@ class ImageDataGenerator(object):
            seed=None,
            save_to_dir=None,
            save_prefix='',
-           save_format='jpeg'):
+           save_format='png'):
     return NumpyArrayIterator(
         x,
         y,
@@ -521,7 +524,7 @@ class ImageDataGenerator(object):
                           seed=None,
                           save_to_dir=None,
                           save_prefix='',
-                          save_format='jpeg',
+                          save_format='png',
                           follow_links=False):
     return DirectoryIterator(
         directory,
@@ -563,28 +566,28 @@ class ImageDataGenerator(object):
       if self.mean is not None:
         x -= self.mean
       else:
-        warnings.warn('This ImageDataGenerator specifies '
-                      '`featurewise_center`, but it hasn\'t'
-                      'been fit on any training data. Fit it '
-                      'first by calling `.fit(numpy_data)`.')
+        logging.warning('This ImageDataGenerator specifies '
+                        '`featurewise_center`, but it hasn\'t'
+                        'been fit on any training data. Fit it '
+                        'first by calling `.fit(numpy_data)`.')
     if self.featurewise_std_normalization:
       if self.std is not None:
         x /= (self.std + 1e-7)
       else:
-        warnings.warn('This ImageDataGenerator specifies '
-                      '`featurewise_std_normalization`, but it hasn\'t'
-                      'been fit on any training data. Fit it '
-                      'first by calling `.fit(numpy_data)`.')
+        logging.warning('This ImageDataGenerator specifies '
+                        '`featurewise_std_normalization`, but it hasn\'t'
+                        'been fit on any training data. Fit it '
+                        'first by calling `.fit(numpy_data)`.')
     if self.zca_whitening:
       if self.principal_components is not None:
         flatx = np.reshape(x, (x.size))
         whitex = np.dot(flatx, self.principal_components)
         x = np.reshape(whitex, (x.shape[0], x.shape[1], x.shape[2]))
       else:
-        warnings.warn('This ImageDataGenerator specifies '
-                      '`zca_whitening`, but it hasn\'t'
-                      'been fit on any training data. Fit it '
-                      'first by calling `.fit(numpy_data)`.')
+        logging.warning('This ImageDataGenerator specifies '
+                        '`zca_whitening`, but it hasn\'t'
+                        'been fit on any training data. Fit it '
+                        'first by calling `.fit(numpy_data)`.')
     return x
 
   def random_transform(self, x):
@@ -640,7 +643,8 @@ class ImageDataGenerator(object):
     transform_matrix = None
     if theta != 0:
       rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
-                                  [np.sin(theta), np.cos(theta), 0], [0, 0, 1]])
+                                  [np.sin(theta),
+                                   np.cos(theta), 0], [0, 0, 1]])
       transform_matrix = rotation_matrix
 
     if tx != 0 or ty != 0:
@@ -748,7 +752,7 @@ class ImageDataGenerator(object):
       sigma = np.dot(flat_x.T, flat_x) / flat_x.shape[0]
       u, s, _ = linalg.svd(sigma)
       self.principal_components = np.dot(
-          np.dot(u, np.diag(1. / np.sqrt(s + 10e-7))), u.T)
+          np.dot(u, np.diag(1. / np.sqrt(s + self.zca_epsilon))), u.T)
 
 
 class Iterator(object):
@@ -836,7 +840,7 @@ class NumpyArrayIterator(Iterator):
                data_format=None,
                save_to_dir=None,
                save_prefix='',
-               save_format='jpeg'):
+               save_format='png'):
     if y is not None and len(x) != len(y):
       raise ValueError('X (images tensor) and y (labels) '
                        'should have the same length. '
@@ -927,6 +931,8 @@ class DirectoryIterator(Iterator):
           `"binary"`: binary targets (if there are only two classes),
           `"categorical"`: categorical targets,
           `"sparse"`: integer targets,
+          `"input"`: targets are images identical to input images (mainly
+              used to work with autoencoders),
           `None`: no targets get yielded (only input images are yielded).
       batch_size: Integer, size of a batch.
       shuffle: Boolean, whether to shuffle the data between epochs.
@@ -955,7 +961,7 @@ class DirectoryIterator(Iterator):
                data_format=None,
                save_to_dir=None,
                save_prefix='',
-               save_format='jpeg',
+               save_format='png',
                follow_links=False):
     if data_format is None:
       data_format = K.image_data_format()
@@ -978,10 +984,11 @@ class DirectoryIterator(Iterator):
       else:
         self.image_shape = (1,) + self.target_size
     self.classes = classes
-    if class_mode not in {'categorical', 'binary', 'sparse', None}:
+    if class_mode not in {'categorical', 'binary', 'sparse', 'input', None}:
       raise ValueError('Invalid class_mode:', class_mode,
                        '; expected one of "categorical", '
-                       '"binary", "sparse", or None.')
+                       '"binary", "sparse", "input"'
+                       ' or None.')
     self.class_mode = class_mode
     self.save_to_dir = save_to_dir
     self.save_prefix = save_prefix
@@ -1076,7 +1083,9 @@ class DirectoryIterator(Iterator):
             format=self.save_format)
         img.save(os.path.join(self.save_to_dir, fname))
     # build batch of labels
-    if self.class_mode == 'sparse':
+    if self.class_mode == 'input':
+      batch_y = batch_x.copy()
+    elif self.class_mode == 'sparse':
       batch_y = self.classes[index_array]
     elif self.class_mode == 'binary':
       batch_y = self.classes[index_array].astype(K.floatx())
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py b/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py
index 692a359ead..382aa386d4 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py
@@ -205,7 +205,8 @@ def skipgrams(sequence,
     words = [c[0] for c in couples]
     random.shuffle(words)
 
-    couples += [[words[i % len(words)], random.randint(1, vocabulary_size - 1)]
+    couples += [[words[i % len(words)],
+                 random.randint(1, vocabulary_size - 1)]
                 for i in range(num_negative_samples)]
     if categorical:
       labels += [[1, 0]] * num_negative_samples
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/text.py b/tensorflow/contrib/keras/python/keras/preprocessing/text.py
index 5b89c8035c..93e629af17 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/text.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/text.py
@@ -20,15 +20,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from collections import OrderedDict
 import string
 import sys
-import warnings
 
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 from six.moves import zip  # pylint: disable=redefined-builtin
 
-
 if sys.version_info < (3,):
   maketrans = string.maketrans
 else:
@@ -39,7 +38,7 @@ def text_to_word_sequence(text,
                           filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                           lower=True,
                           split=' '):
-  """Converts a text to a sequence of word indices.
+  """Converts a text to a sequence of words (or tokens).
 
   Arguments:
       text: Input text (string).
@@ -48,7 +47,7 @@ def text_to_word_sequence(text,
       split: Sentence split marker (string).
 
   Returns:
-      A list of integer word indices.
+      A list of words (or tokens).
   """
   if lower:
     text = text.lower()
@@ -83,7 +82,7 @@ class Tokenizer(object):
           tabs and line breaks, minus the `'` character.
       lower: boolean. Whether to convert the texts to lowercase.
       split: character or string to use for token splitting.
-      char_level: if True, every character will be treated as a word.
+      char_level: if True, every character will be treated as a token.
 
   By default, all punctuation is removed, turning the texts into
   space-separated sequences of words
@@ -98,17 +97,8 @@ class Tokenizer(object):
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=' ',
-               char_level=False,
-               **kwargs):
-    # Legacy support
-    if 'nb_words' in kwargs:
-      warnings.warn('The `nb_words` argument in `Tokenizer` '
-                    'has been renamed `num_words`.')
-      num_words = kwargs.pop('nb_words')
-    if kwargs:
-      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
-
-    self.word_counts = {}
+               char_level=False):
+    self.word_counts = OrderedDict()
     self.word_docs = {}
     self.filters = filters
     self.split = split
diff --git a/tensorflow/contrib/keras/python/keras/utils/conv_utils.py b/tensorflow/contrib/keras/python/keras/utils/conv_utils.py
index 7d4fdda296..570a63b606 100644
--- a/tensorflow/contrib/keras/python/keras/utils/conv_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/conv_utils.py
@@ -89,7 +89,7 @@ def convert_kernel(kernel):
   Also works reciprocally, since the transformation is its own inverse.
 
   Arguments:
-      kernel: Numpy array (4D or 5D).
+      kernel: Numpy array (3D, 4D or 5D).
 
   Returns:
       The converted kernel.
@@ -97,7 +97,8 @@ def convert_kernel(kernel):
   Raises:
       ValueError: in case of invalid kernel shape or invalid data_format.
   """
-  if not 4 <= kernel.ndim <= 5:
+  kernel = np.asarray(kernel)
+  if not 3 <= kernel.ndim <= 5:
     raise ValueError('Invalid kernel shape:', kernel.shape)
   slices = [slice(None, None, -1) for _ in range(kernel.ndim)]
   no_flip = (slice(None, None), slice(None, None))
diff --git a/tensorflow/contrib/keras/python/keras/utils/data_utils.py b/tensorflow/contrib/keras/python/keras/utils/data_utils.py
index 5a42444308..61a11b95e8 100644
--- a/tensorflow/contrib/keras/python/keras/utils/data_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/data_utils.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
 import hashlib
 import os
 import shutil
@@ -54,8 +53,10 @@ if sys.version_info[0] == 2:
     """
 
     def chunk_read(response, chunk_size=8192, reporthook=None):
-      total_size = response.info().get('Content-Length').strip()
-      total_size = int(total_size)
+      content_type = response.info().get('Content-Length')
+      total_size = -1
+      if content_type is not None:
+        total_size = int(content_type.strip())
       count = 0
       while 1:
         chunk = response.read(chunk_size)
@@ -204,19 +205,24 @@ def get_file(fname,
 
   if download:
     print('Downloading data from', origin)
-    progbar = None
 
-    def dl_progress(count, block_size, total_size, progbar=None):
-      if progbar is None:
-        progbar = Progbar(total_size)
+    class ProgressTracker(object):
+      # Maintain progbar for the lifetime of download.
+      # This design was chosen for Python 2.7 compatibility.
+      progbar = None
+
+    def dl_progress(count, block_size, total_size):
+      if ProgressTracker.progbar is None:
+        if total_size == -1:  # -1 means the total size is unknown
+          total_size = None
+        ProgressTracker.progbar = Progbar(total_size)
       else:
-        progbar.update(count * block_size)
+        ProgressTracker.progbar.update(count * block_size)
 
     error_msg = 'URL fetch failure on {}: {} -- {}'
     try:
       try:
-        urlretrieve(origin, fpath,
-                    functools.partial(dl_progress, progbar=progbar))
+        urlretrieve(origin, fpath, dl_progress)
       except URLError as e:
         raise Exception(error_msg.format(origin, e.errno, e.reason))
       except HTTPError as e:
@@ -225,7 +231,7 @@ def get_file(fname,
       if os.path.exists(fpath):
         os.remove(fpath)
       raise
-    progbar = None
+    ProgressTracker.progbar = None
 
   if untar:
     if not os.path.exists(untar_fpath):
diff --git a/tensorflow/contrib/keras/python/keras/utils/generic_utils.py b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
index 27cc23f232..5cae694d54 100644
--- a/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
@@ -45,8 +45,8 @@ class CustomObjectScope(object):
   Consider a custom object `MyObject`
 
   ```python
-      with CustomObjectScope({"MyObject":MyObject}):
-          layer = Dense(..., W_regularizer="MyObject")
+      with CustomObjectScope({'MyObject':MyObject}):
+          layer = Dense(..., kernel_regularizer='MyObject')
           # save, load, etc. will recognize custom object by name
   ```
   """
@@ -81,8 +81,8 @@ def custom_object_scope(*args):
   Consider a custom object `MyObject`
 
   ```python
-      with custom_object_scope({"MyObject":MyObject}):
-          layer = Dense(..., W_regularizer="MyObject")
+      with custom_object_scope({'MyObject':MyObject}):
+          layer = Dense(..., kernel_regularizer='MyObject')
           # save, load, etc. will recognize custom object by name
   ```
 
@@ -107,7 +107,7 @@ def get_custom_objects():
 
   ```python
       get_custom_objects().clear()
-      get_custom_objects()["MyObject"] = MyObject
+      get_custom_objects()['MyObject'] = MyObject
   ```
 
   Returns:
@@ -152,19 +152,23 @@ def deserialize_keras_object(identifier,
         raise ValueError('Unknown ' + printable_module_name + ': ' + class_name)
     if hasattr(cls, 'from_config'):
       arg_spec = tf_inspect.getargspec(cls.from_config)
+      custom_objects = custom_objects or {}
+
       if 'custom_objects' in arg_spec.args:
-        custom_objects = custom_objects or {}
         return cls.from_config(
             config['config'],
             custom_objects=dict(
                 list(_GLOBAL_CUSTOM_OBJECTS.items()) +
                 list(custom_objects.items())))
-      return cls.from_config(config['config'])
+      with CustomObjectScope(custom_objects):
+        return cls.from_config(config['config'])
     else:
       # Then `cls` may be a function returning a class.
       # in this case by convention `config` holds
       # the kwargs of the function.
-      return cls(**config['config'])
+      custom_objects = custom_objects or {}
+      with CustomObjectScope(custom_objects):
+        return cls(**config['config'])
   elif isinstance(identifier, six.string_types):
     function_name = identifier
     if custom_objects and function_name in custom_objects:
@@ -174,18 +178,14 @@ def deserialize_keras_object(identifier,
     else:
       fn = module_objects.get(function_name)
       if fn is None:
-        raise ValueError('Unknown ' + printable_module_name,
-                         ':' + function_name)
+        raise ValueError('Unknown ' + printable_module_name + ':' +
+                         function_name)
     return fn
   else:
     raise ValueError('Could not interpret serialized ' + printable_module_name +
                      ': ' + identifier)
 
 
-def make_tuple(*args):
-  return args
-
-
 def func_dump(func):
   """Serializes a user defined function.
 
@@ -231,12 +231,14 @@ class Progbar(object):
   """Displays a progress bar.
 
   Arguments:
-      target: Total number of steps expected.
+      target: Total number of steps expected, None if unknown.
       interval: Minimum visual progress update interval (in seconds).
   """
 
   def __init__(self, target, width=30, verbose=1, interval=0.05):
     self.width = width
+    if target is None:
+      target = -1
     self.target = target
     self.sum_values = {}
     self.unique_values = []
@@ -277,21 +279,22 @@ class Progbar(object):
       sys.stdout.write('\b' * prev_total_width)
       sys.stdout.write('\r')
 
-      numdigits = int(np.floor(np.log10(self.target))) + 1
-      barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
-      bar = barstr % (current, self.target)
-      prog = float(current) / self.target
-      prog_width = int(self.width * prog)
-      if prog_width > 0:
-        bar += ('=' * (prog_width - 1))
-        if current < self.target:
-          bar += '>'
-        else:
-          bar += '='
-      bar += ('.' * (self.width - prog_width))
-      bar += ']'
-      sys.stdout.write(bar)
-      self.total_width = len(bar)
+      if self.target is not -1:
+        numdigits = int(np.floor(np.log10(self.target))) + 1
+        barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
+        bar = barstr % (current, self.target)
+        prog = float(current) / self.target
+        prog_width = int(self.width * prog)
+        if prog_width > 0:
+          bar += ('=' * (prog_width - 1))
+          if current < self.target:
+            bar += '>'
+          else:
+            bar += '='
+        bar += ('.' * (self.width - prog_width))
+        bar += ']'
+        sys.stdout.write(bar)
+        self.total_width = len(bar)
 
       if current:
         time_per_unit = (now - self.start) / current
@@ -299,7 +302,7 @@ class Progbar(object):
         time_per_unit = 0
       eta = time_per_unit * (self.target - current)
       info = ''
-      if current < self.target:
+      if current < self.target and self.target is not -1:
         info += ' - ETA: %ds' % eta
       else:
         info += ' - %ds' % (now - self.start)
diff --git a/tensorflow/contrib/keras/python/keras/utils/io_utils.py b/tensorflow/contrib/keras/python/keras/utils/io_utils.py
index 7cef39b03f..55c135b5eb 100644
--- a/tensorflow/contrib/keras/python/keras/utils/io_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/io_utils.py
@@ -80,8 +80,13 @@ class HDF5Matrix(object):
 
   def __getitem__(self, key):
     if isinstance(key, slice):
-      if key.stop + self.start <= self.end:
-        idx = slice(key.start + self.start, key.stop + self.start)
+      start, stop = key.start, key.stop
+      if start is None:
+        start = 0
+      if stop is None:
+        stop = self.data.shape[0]
+      if stop + self.start <= self.end:
+        idx = slice(start + self.start, stop + self.start)
       else:
         raise IndexError
     elif isinstance(key, int):
diff --git a/tensorflow/contrib/keras/python/keras/utils/layer_utils.py b/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
index 26878fdd57..154070fb93 100644
--- a/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
@@ -38,8 +38,11 @@ def print_summary(model, line_length=None, positions=None):
   else:
     sequential_like = True
     for v in model.nodes_by_depth.values():
-      if len(v) > 1:
+      if (len(v) > 1) or (len(v) == 1 and len(v[0].inbound_layers) > 1):
+        # If a depth level holds several nodes, or its single node has
+        # multiple inbound_layers, the model is no longer sequential.
         sequential_like = False
+        break
 
   if sequential_like:
     line_length = line_length or 65
@@ -94,12 +97,10 @@ def print_summary(model, line_length=None, positions=None):
     except AttributeError:
       output_shape = 'multiple'
     connections = []
-    for node_index, node in enumerate(layer.inbound_nodes):
-      if relevant_nodes:
-        node_key = layer.name + '_ib-' + str(node_index)
-        if node_key not in relevant_nodes:
-          # node is node part of the current network
-          continue
+    for node in layer.inbound_nodes:
+      if relevant_nodes and node not in relevant_nodes:
+        # node is not part of the current network
+        continue
       for i in range(len(node.inbound_layers)):
         inbound_layer = node.inbound_layers[i].name
         inbound_node_index = node.node_indices[i]
@@ -114,8 +115,8 @@ def print_summary(model, line_length=None, positions=None):
     else:
       first_connection = connections[0]
     fields = [
-        name + ' (' + cls_name + ')', output_shape, layer.count_params(),
-        first_connection
+        name + ' (' + cls_name + ')', output_shape,
+        layer.count_params(), first_connection
     ]
     print_row(fields, positions)
     if len(connections) > 1:
@@ -134,8 +135,10 @@ def print_summary(model, line_length=None, positions=None):
     else:
       print('_' * line_length)
 
-  trainable_count, non_trainable_count = count_total_params(
-      layers, layer_set=None)
+  trainable_count = int(
+      np.sum([K.count_params(p) for p in set(model.trainable_weights)]))
+  non_trainable_count = int(
+      np.sum([K.count_params(p) for p in set(model.non_trainable_weights)]))
 
   print('Total params: {:,}'.format(trainable_count + non_trainable_count))
   print('Trainable params: {:,}'.format(trainable_count))
@@ -143,37 +146,6 @@ def print_summary(model, line_length=None, positions=None):
   print('_' * line_length)
 
 
-def count_total_params(layers, layer_set=None):
-  """Counts the number of parameters in a list of layers.
-
-  Arguments:
-      layers: list of layers.
-      layer_set: set of layers already seen
-          (so that we don't count their weights twice).
-
-  Returns:
-      A tuple (count of trainable weights, count of non-trainable weights.)
-  """
-  if layer_set is None:
-    layer_set = set()
-  trainable_count = 0
-  non_trainable_count = 0
-  for layer in layers:
-    if layer in layer_set:
-      continue
-    layer_set.add(layer)
-    if hasattr(layer, 'layers'):
-      t, nt = count_total_params(layer.layers, layer_set)
-      trainable_count += t
-      non_trainable_count += nt
-    else:
-      trainable_count += np.sum(
-          [K.count_params(p) for p in layer.trainable_weights])
-      non_trainable_count += np.sum(
-          [K.count_params(p) for p in layer.non_trainable_weights])
-  return int(trainable_count), int(non_trainable_count)
-
-
 def convert_all_kernels_in_model(model):
   """Converts all convolution kernels in a model from Theano to TensorFlow.
 
@@ -218,7 +190,7 @@ def convert_dense_weights_data_format(dense,
           came before the target `Dense` layer.
       target_data_format: One of "channels_last", "channels_first".
           Set it "channels_last"
-          if converting a "chnnels_first" model to "channels_last",
+          if converting a "channels_first" model to "channels_last",
           or reciprocally.
   """
   assert target_data_format in {'channels_last', 'channels_first'}
diff --git a/tensorflow/contrib/keras/python/keras/utils/vis_utils.py b/tensorflow/contrib/keras/python/keras/utils/vis_utils.py
index 9e2ee86424..949767299b 100644
--- a/tensorflow/contrib/keras/python/keras/utils/vis_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/vis_utils.py
@@ -39,18 +39,28 @@ except ImportError:
 
 
 def _check_pydot():
-  if not (pydot and pydot.find_graphviz()):
+  try:
+    # Attempt to create an image of a blank graph
+    # to check the pydot/graphviz installation.
+    pydot.Dot.create(pydot.Dot())
+  except Exception:
+    # pydot raises a generic Exception here,
+    # so no specific class can be caught.
     raise ImportError('Failed to import pydot. You must install pydot'
                       ' and graphviz for `pydotprint` to work.')
 
 
-def model_to_dot(model, show_shapes=False, show_layer_names=True):
-  """Converts a Keras model to dot format.
+def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
+  """Convert a Keras model to dot format.
 
   Arguments:
       model: A Keras model instance.
       show_shapes: whether to display shape information.
       show_layer_names: whether to display layer names.
+      rankdir: `rankdir` argument passed to PyDot,
+          a string specifying the format of the plot:
+          'TB' creates a vertical plot;
+          'LR' creates a horizontal plot.
 
   Returns:
       A `pydot.Dot` instance representing the Keras model.
@@ -60,7 +70,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True):
 
   _check_pydot()
   dot = pydot.Dot()
-  dot.set('rankdir', 'TB')
+  dot.set('rankdir', rankdir)
   dot.set('concentrate', True)
   dot.set_node_defaults(shape='record')
 
@@ -102,7 +112,6 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True):
         inputlabels = 'multiple'
       label = '%s\n|{input:|output:}|{{%s}|{%s}}' % (label, inputlabels,
                                                      outputlabels)
-
     node = pydot.Node(layer_id, label=label)
     dot.add_node(node)
 
@@ -122,8 +131,21 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True):
 def plot_model(model,
                to_file='model.png',
                show_shapes=False,
-               show_layer_names=True):
-  dot = model_to_dot(model, show_shapes, show_layer_names)
+               show_layer_names=True,
+               rankdir='TB'):
+  """Converts a Keras model to dot format and saves it to a file.
+
+  Arguments:
+      model: A Keras model instance
+      to_file: File name of the plot image.
+      show_shapes: whether to display shape information.
+      show_layer_names: whether to display layer names.
+      rankdir: `rankdir` argument passed to PyDot,
+          a string specifying the format of the plot:
+          'TB' creates a vertical plot;
+          'LR' creates a horizontal plot.
+  """
+  dot = model_to_dot(model, show_shapes, show_layer_names, rankdir)
   _, extension = os.path.splitext(to_file)
   if not extension:
     extension = 'png'
author	Francois Chollet <fchollet@google.com>	2017-05-31 10:33:22 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-05-31 10:38:01 -0700
commit	d21bf7d7502f447e5f967a479282b32b5845ba8b (patch)
tree	455de8814f480728c77d919a8d2a170e2a47b300 /tensorflow/contrib/keras
parent	43bfc138c9676fb54945fbede977b90a0c0aea79 (diff)