author     Francois Chollet <fchollet@google.com>  2017-07-06 19:29:26 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>  2017-07-06 19:33:27 -0700
commit     24101b35f3baebbfff3d8057ac223b325bc415ce (patch)
tree       f2a022c43038928119fc9476769a692726531a30 /tensorflow/contrib/keras
parent     154df32a959df74b3a1c377ff72f955d755b3d34 (diff)
Backport changes from external Keras into tf.contrib.keras.
PiperOrigin-RevId: 161161308
Diffstat (limited to 'tensorflow/contrib/keras')
-rw-r--r--  tensorflow/contrib/keras/BUILD | 46
-rw-r--r--  tensorflow/contrib/keras/python/keras/__init__.py | 2
-rw-r--r--  tensorflow/contrib/keras/python/keras/activations.py | 17
-rw-r--r--  tensorflow/contrib/keras/python/keras/activations_test.py | 18
-rw-r--r--  tensorflow/contrib/keras/python/keras/applications/__init__.py | 2
-rw-r--r--  tensorflow/contrib/keras/python/keras/applications/imagenet_utils_test.py | 139
-rw-r--r--  tensorflow/contrib/keras/python/keras/applications/mobilenet.py | 655
-rw-r--r--  tensorflow/contrib/keras/python/keras/applications/mobilenet_test.py | 42
-rw-r--r--  tensorflow/contrib/keras/python/keras/backend.py | 220
-rw-r--r--  tensorflow/contrib/keras/python/keras/callbacks.py | 36
-rw-r--r--  tensorflow/contrib/keras/python/keras/constraints.py | 6
-rw-r--r--  tensorflow/contrib/keras/python/keras/engine/topology.py | 58
-rw-r--r--  tensorflow/contrib/keras/python/keras/engine/topology_test.py | 8
-rw-r--r--  tensorflow/contrib/keras/python/keras/engine/training.py | 407
-rw-r--r--  tensorflow/contrib/keras/python/keras/engine/training_test.py | 60
-rw-r--r--  tensorflow/contrib/keras/python/keras/initializers.py | 22
-rw-r--r--  tensorflow/contrib/keras/python/keras/initializers_test.py | 8
-rw-r--r--  tensorflow/contrib/keras/python/keras/integration_test.py | 7
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/advanced_activations.py | 2
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/convolutional.py | 161
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/convolutional_test.py | 60
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/core.py | 15
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/embeddings.py | 28
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/embeddings_test.py | 21
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/local.py | 154
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/lstm_test.py | 44
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/noise.py | 62
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/noise_test.py | 7
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/recurrent.py | 150
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/wrappers.py | 39
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/wrappers_test.py | 10
-rw-r--r--  tensorflow/contrib/keras/python/keras/metrics.py | 5
-rw-r--r--  tensorflow/contrib/keras/python/keras/metrics_test.py | 15
-rw-r--r--  tensorflow/contrib/keras/python/keras/models.py | 238
-rw-r--r--  tensorflow/contrib/keras/python/keras/models_test.py | 21
-rw-r--r--  tensorflow/contrib/keras/python/keras/optimizers.py | 33
-rw-r--r--  tensorflow/contrib/keras/python/keras/optimizers_test.py | 24
-rw-r--r--  tensorflow/contrib/keras/python/keras/preprocessing/image.py | 152
-rw-r--r--  tensorflow/contrib/keras/python/keras/preprocessing/image_test.py | 12
-rw-r--r--  tensorflow/contrib/keras/python/keras/preprocessing/text.py | 40
-rw-r--r--  tensorflow/contrib/keras/python/keras/preprocessing/text_test.py | 21
-rw-r--r--  tensorflow/contrib/keras/python/keras/utils/__init__.py | 3
-rw-r--r--  tensorflow/contrib/keras/python/keras/utils/data_utils.py | 385
-rw-r--r--  tensorflow/contrib/keras/python/keras/utils/data_utils_test.py | 172
-rw-r--r--  tensorflow/contrib/keras/python/keras/utils/generic_utils.py | 18
-rw-r--r--  tensorflow/contrib/keras/python/keras/utils/generic_utils_test.py | 75
-rw-r--r--  tensorflow/contrib/keras/python/keras/utils/io_utils.py | 32
-rw-r--r--  tensorflow/contrib/keras/python/keras/utils/layer_utils.py | 33
48 files changed, 3125 insertions, 660 deletions
diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD
index 619ebb7ce0..9e92b03399 100644
--- a/tensorflow/contrib/keras/BUILD
+++ b/tensorflow/contrib/keras/BUILD
@@ -52,6 +52,7 @@ py_library(
"python/keras/applications/__init__.py",
"python/keras/applications/imagenet_utils.py",
"python/keras/applications/inception_v3.py",
+ "python/keras/applications/mobilenet.py",
"python/keras/applications/resnet50.py",
"python/keras/applications/vgg16.py",
"python/keras/applications/vgg19.py",
@@ -240,6 +241,17 @@ py_test(
)
py_test(
+ name = "mobilenet_test",
+ size = "small",
+ srcs = ["python/keras/applications/mobilenet_test.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":keras",
+ "//tensorflow/python:client_testlib",
+ ],
+)
+
+py_test(
name = "resnet50_test",
size = "small",
srcs = ["python/keras/applications/resnet50_test.py"],
@@ -480,6 +492,40 @@ py_test(
)
py_test(
+ name = "data_utils_test",
+ size = "small",
+ srcs = ["python/keras/utils/data_utils_test.py"],
+ srcs_version = "PY2AND3",
+ tags = ["notsan"],
+ deps = [
+ ":keras",
+ "//tensorflow/python:client_testlib",
+ ],
+)
+
+py_test(
+ name = "generic_utils_test",
+ size = "small",
+ srcs = ["python/keras/utils/generic_utils_test.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":keras",
+ "//tensorflow/python:client_testlib",
+ ],
+)
+
+py_test(
+ name = "imagenet_utils_test",
+ size = "small",
+ srcs = ["python/keras/applications/imagenet_utils_test.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":keras",
+ "//tensorflow/python:client_testlib",
+ ],
+)
+
+py_test(
name = "image_test",
size = "medium",
srcs = ["python/keras/preprocessing/image_test.py"],
diff --git a/tensorflow/contrib/keras/python/keras/__init__.py b/tensorflow/contrib/keras/python/keras/__init__.py
index 1c1485c0cd..6e0e03d7f7 100644
--- a/tensorflow/contrib/keras/python/keras/__init__.py
+++ b/tensorflow/contrib/keras/python/keras/__init__.py
@@ -37,4 +37,4 @@ from tensorflow.contrib.keras.python.keras import utils
from tensorflow.contrib.keras.python.keras import wrappers
from tensorflow.contrib.keras.python.keras.layers import Input
-__version__ = '2.0.4-tf'
+__version__ = '2.0.5-tf'
diff --git a/tensorflow/contrib/keras/python/keras/activations.py b/tensorflow/contrib/keras/python/keras/activations.py
index 35d15e74c2..7f04234e01 100644
--- a/tensorflow/contrib/keras/python/keras/activations.py
+++ b/tensorflow/contrib/keras/python/keras/activations.py
@@ -54,6 +54,23 @@ def elu(x, alpha=1.0):
return K.elu(x, alpha)
+def selu(x):
+ """Scaled Exponential Linear Unit. (Klambauer et al., 2017).
+
+ Arguments:
+ x: A tensor or variable to compute the activation function for.
+
+ Returns:
+ Tensor with the same shape and dtype as `x`.
+
+ References:
+ - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+ """
+ alpha = 1.6732632423543772848170429916717
+ scale = 1.0507009873554804934193349852946
+ return scale * K.elu(x, alpha)
+
+
def softplus(x):
return K.softplus(x)
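Note on the new `selu` activation added above: it is implemented as a fixed scale applied to `K.elu(x, alpha)`, with the alpha/scale constants from the Self-Normalizing Neural Networks paper. A minimal NumPy sketch of the same relationship (illustrative only, not part of the commit):

import numpy as np

alpha = 1.6732632423543772848170429916717
scale = 1.0507009873554804934193349852946

def selu_ref(x):
  # scale * elu(x, alpha): identity for x > 0, scaled exponential below 0.
  return scale * np.where(x > 0, x, alpha * (np.exp(x) - 1.0))

print(selu_ref(np.array([-2.0, 0.0, 2.0])))  # approx. [-1.52, 0.0, 2.10]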
diff --git a/tensorflow/contrib/keras/python/keras/activations_test.py b/tensorflow/contrib/keras/python/keras/activations_test.py
index eec4d257f2..3d21610e49 100644
--- a/tensorflow/contrib/keras/python/keras/activations_test.py
+++ b/tensorflow/contrib/keras/python/keras/activations_test.py
@@ -35,7 +35,7 @@ class KerasActivationsTest(test.TestCase):
def test_serialization(self):
all_activations = ['softmax', 'relu', 'elu', 'tanh',
'sigmoid', 'hard_sigmoid', 'linear',
- 'softplus', 'softsign']
+ 'softplus', 'softsign', 'selu']
for name in all_activations:
fn = keras.activations.get(name)
ref_fn = getattr(keras.activations, name)
@@ -63,6 +63,22 @@ class KerasActivationsTest(test.TestCase):
expected = _ref_softmax(test_values[0, 0])
self.assertAllClose(result[0, 0], expected, rtol=1e-05)
+ def test_selu(self):
+ x = keras.backend.placeholder(ndim=2)
+ f = keras.backend.function([x], [keras.activations.selu(x)])
+ alpha = 1.6732632423543772848170429916717
+ scale = 1.0507009873554804934193349852946
+
+ with self.test_session():
+ positive_values = np.array([[1, 2]], dtype=keras.backend.floatx())
+ result = f([positive_values])[0]
+ self.assertAllClose(result, positive_values * scale, rtol=1e-05)
+
+ negative_values = np.array([[-1, -2]], dtype=keras.backend.floatx())
+ result = f([negative_values])[0]
+ true_result = (np.exp(negative_values) - 1) * scale * alpha
+ self.assertAllClose(result, true_result)
+
def test_softplus(self):
def softplus(x):
return np.log(np.ones_like(x) + np.exp(x))
diff --git a/tensorflow/contrib/keras/python/keras/applications/__init__.py b/tensorflow/contrib/keras/python/keras/applications/__init__.py
index c6af9ea9f1..9139df30a6 100644
--- a/tensorflow/contrib/keras/python/keras/applications/__init__.py
+++ b/tensorflow/contrib/keras/python/keras/applications/__init__.py
@@ -19,8 +19,8 @@ from __future__ import division
from __future__ import print_function
from tensorflow.contrib.keras.python.keras.applications.inception_v3 import InceptionV3
+from tensorflow.contrib.keras.python.keras.applications.mobilenet import MobileNet
from tensorflow.contrib.keras.python.keras.applications.resnet50 import ResNet50
from tensorflow.contrib.keras.python.keras.applications.vgg16 import VGG16
from tensorflow.contrib.keras.python.keras.applications.vgg19 import VGG19
from tensorflow.contrib.keras.python.keras.applications.xception import Xception
-
diff --git a/tensorflow/contrib/keras/python/keras/applications/imagenet_utils_test.py b/tensorflow/contrib/keras/python/keras/applications/imagenet_utils_test.py
new file mode 100644
index 0000000000..f3bcf93a95
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/imagenet_utils_test.py
@@ -0,0 +1,139 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Inception V3 application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class ImageNetUtilsTest(test.TestCase):
+
+ def test_preprocess_input(self):
+ x = np.random.uniform(0, 255, (2, 3, 2, 3))
+ self.assertEqual(
+ keras.applications.imagenet_utils.preprocess_input(x).shape, x.shape)
+
+ out1 = keras.applications.imagenet_utils.preprocess_input(
+ x, 'channels_last')
+ out2 = keras.applications.imagenet_utils.preprocess_input(
+ np.transpose(x, (0, 3, 1, 2)), 'channels_first')
+ self.assertAllClose(out1, out2.transpose(0, 2, 3, 1))
+
+ def test_decode_predictions(self):
+ x = np.zeros((2, 1000))
+ x[0, 372] = 1.0
+ x[1, 549] = 1.0
+ outs = keras.applications.imagenet_utils.decode_predictions(x, top=1)
+ scores = [out[0][2] for out in outs]
+ self.assertEqual(scores[0], scores[1])
+
+ # the numbers of columns and ImageNet classes are not identical.
+ with self.assertRaises(ValueError):
+ keras.applications.imagenet_utils.decode_predictions(np.ones((2, 100)))
+
+ def test_obtain_input_shape(self):
+ # input_shape and default_size are not identical.
+ with self.assertRaises(ValueError):
+ keras.applications.imagenet_utils._obtain_input_shape(
+ input_shape=(224, 224, 3),
+ default_size=299,
+ min_size=139,
+ data_format='channels_last',
+ include_top=True)
+
+ # Test invalid use cases
+ for data_format in ['channels_last', 'channels_first']:
+ # input_shape is smaller than min_size.
+ shape = (100, 100)
+ if data_format == 'channels_last':
+ input_shape = shape + (3,)
+ else:
+ input_shape = (3,) + shape
+ with self.assertRaises(ValueError):
+ keras.applications.imagenet_utils._obtain_input_shape(
+ input_shape=input_shape,
+ default_size=None,
+ min_size=139,
+ data_format=data_format,
+ include_top=False)
+
+ # shape is 1D.
+ shape = (100,)
+ if data_format == 'channels_last':
+ input_shape = shape + (3,)
+ else:
+ input_shape = (3,) + shape
+ with self.assertRaises(ValueError):
+ keras.applications.imagenet_utils._obtain_input_shape(
+ input_shape=input_shape,
+ default_size=None,
+ min_size=139,
+ data_format=data_format,
+ include_top=False)
+
+ # the number of channels is 5 not 3.
+ shape = (100, 100)
+ if data_format == 'channels_last':
+ input_shape = shape + (5,)
+ else:
+ input_shape = (5,) + shape
+ with self.assertRaises(ValueError):
+ keras.applications.imagenet_utils._obtain_input_shape(
+ input_shape=input_shape,
+ default_size=None,
+ min_size=139,
+ data_format=data_format,
+ include_top=False)
+
+ assert keras.applications.imagenet_utils._obtain_input_shape(
+ input_shape=None,
+ default_size=None,
+ min_size=139,
+ data_format='channels_last',
+ include_top=False) == (None, None, 3)
+
+ assert keras.applications.imagenet_utils._obtain_input_shape(
+ input_shape=None,
+ default_size=None,
+ min_size=139,
+ data_format='channels_first',
+ include_top=False) == (3, None, None)
+
+ assert keras.applications.imagenet_utils._obtain_input_shape(
+ input_shape=None,
+ default_size=None,
+ min_size=139,
+ data_format='channels_last',
+ include_top=False) == (None, None, 3)
+
+ assert keras.applications.imagenet_utils._obtain_input_shape(
+ input_shape=(150, 150, 3),
+ default_size=None,
+ min_size=139,
+ data_format='channels_last',
+ include_top=False) == (150, 150, 3)
+
+ assert keras.applications.imagenet_utils._obtain_input_shape(
+ input_shape=(3, None, None),
+ default_size=None,
+ min_size=139,
+ data_format='channels_first',
+ include_top=False) == (3, None, None)
diff --git a/tensorflow/contrib/keras/python/keras/applications/mobilenet.py b/tensorflow/contrib/keras/python/keras/applications/mobilenet.py
new file mode 100644
index 0000000000..37240234d3
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/mobilenet.py
@@ -0,0 +1,655 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""MobileNet v1 models for Keras.
+
+MobileNet is a general architecture and can be used for multiple use cases.
+Depending on the use case, it can use different input layer size and
+different width factors. This allows different width models to reduce
+the number of multiply-adds and thereby
+reduce inference cost on mobile devices.
+
+MobileNets support any input size greater than 32 x 32, with larger image sizes
+offering better performance.
+The number of parameters and number of multiply-adds
+can be modified by using the `alpha` parameter,
+which increases/decreases the number of filters in each layer.
+By altering the image size and `alpha` parameter,
+all 16 models from the paper can be built, with ImageNet weights provided.
+
+The paper demonstrates the performance of MobileNets using `alpha` values of
+1.0 (also called 100 % MobileNet), 0.75, 0.5 and 0.25.
+For each of these `alpha` values, weights for 4 different input image sizes
+are provided (224, 192, 160, 128).
+
+The following table describes the size and accuracy of the 100% MobileNet
+on size 224 x 224:
+----------------------------------------------------------------------------
+Width Multiplier (alpha) | ImageNet Acc | Multiply-Adds (M) | Params (M)
+----------------------------------------------------------------------------
+| 1.0 MobileNet-224 | 70.6 % | 529 | 4.2 |
+| 0.75 MobileNet-224 | 68.4 % | 325 | 2.6 |
+| 0.50 MobileNet-224 | 63.7 % | 149 | 1.3 |
+| 0.25 MobileNet-224 | 50.6 % | 41 | 0.5 |
+----------------------------------------------------------------------------
+
+The following table describes the performance of
+the 100 % MobileNet on various input sizes:
+------------------------------------------------------------------------
+ Resolution | ImageNet Acc | Multiply-Adds (M) | Params (M)
+------------------------------------------------------------------------
+| 1.0 MobileNet-224 | 70.6 % | 529 | 4.2 |
+| 1.0 MobileNet-192 | 69.1 % | 529 | 4.2 |
+| 1.0 MobileNet-160 | 67.2 % | 529 | 4.2 |
+| 1.0 MobileNet-128 | 64.4 % | 529 | 4.2 |
+------------------------------------------------------------------------
+
+The weights for all 16 models are obtained and translated
+from Tensorflow checkpoints found at
+https://github.com/tensorflow/models/blob/master/slim/nets/mobilenet_v1.md
+
+# Reference
+- [MobileNets: Efficient Convolutional Neural Networks for
+ Mobile Vision Applications](https://arxiv.org/pdf/1704.04861.pdf)
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import constraints
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras import regularizers
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import decode_predictions # pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine.topology import get_source_inputs
+from tensorflow.contrib.keras.python.keras.layers import Activation
+from tensorflow.contrib.keras.python.keras.layers import BatchNormalization
+from tensorflow.contrib.keras.python.keras.layers import Conv2D
+from tensorflow.contrib.keras.python.keras.layers import Dropout
+from tensorflow.contrib.keras.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers import Input
+from tensorflow.contrib.keras.python.keras.layers import Reshape
+from tensorflow.contrib.keras.python.keras.models import Model
+from tensorflow.contrib.keras.python.keras.utils import conv_utils
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+BASE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.6/'
+
+
+def relu6(x):
+ return K.relu(x, max_value=6)
+
+
+def preprocess_input(x):
+ x /= 255.
+ x -= 0.5
+ x *= 2.
+ return x
+
+
+class DepthwiseConv2D(Conv2D):
+ """Depthwise separable 2D convolution.
+
+ Depthwise separable convolutions consist of performing
+ just the first step in a depthwise spatial convolution
+ (which acts on each input channel separately).
+ The `depth_multiplier` argument controls how many
+ output channels are generated per input channel in the depthwise step.
+
+ Arguments:
+ kernel_size: An integer or tuple/list of 2 integers, specifying the
+ width and height of the 2D convolution window.
+ Can be a single integer to specify the same value for
+ all spatial dimensions.
+ strides: An integer or tuple/list of 2 integers,
+ specifying the strides of the convolution along the width and height.
+ Can be a single integer to specify the same value for
+ all spatial dimensions.
+ Specifying any stride value != 1 is incompatible with specifying
+ any `dilation_rate` value != 1.
+ padding: one of `"valid"` or `"same"` (case-insensitive).
+ depth_multiplier: The number of depthwise convolution output channels
+ for each input channel.
+ The total number of depthwise convolution output
+ channels will be equal to `filters_in * depth_multiplier`.
+ data_format: A string,
+ one of `channels_last` (default) or `channels_first`.
+ The ordering of the dimensions in the inputs.
+ `channels_last` corresponds to inputs with shape
+ `(batch, height, width, channels)` while `channels_first`
+ corresponds to inputs with shape
+ `(batch, channels, height, width)`.
+ It defaults to the `image_data_format` value found in your
+ Keras config file at `~/.keras/keras.json`.
+ If you never set it, then it will be "channels_last".
+ activation: Activation function to use
+ (see [activations](../activations.md)).
+ If you don't specify anything, no activation is applied
+ (ie. "linear" activation: `a(x) = x`).
+ use_bias: Boolean, whether the layer uses a bias vector.
+ depthwise_initializer: Initializer for the depthwise kernel matrix
+ (see [initializers](../initializers.md)).
+ bias_initializer: Initializer for the bias vector
+ (see [initializers](../initializers.md)).
+ depthwise_regularizer: Regularizer function applied to
+ the depthwise kernel matrix
+ (see [regularizer](../regularizers.md)).
+ bias_regularizer: Regularizer function applied to the bias vector
+ (see [regularizer](../regularizers.md)).
+ activity_regularizer: Regularizer function applied to
+ the output of the layer (its "activation").
+ (see [regularizer](../regularizers.md)).
+ depthwise_constraint: Constraint function applied to
+ the depthwise kernel matrix
+ (see [constraints](../constraints.md)).
+ bias_constraint: Constraint function applied to the bias vector
+ (see [constraints](../constraints.md)).
+
+ Input shape:
+ 4D tensor with shape:
+ `[batch, channels, rows, cols]` if data_format='channels_first'
+ or 4D tensor with shape:
+ `[batch, rows, cols, channels]` if data_format='channels_last'.
+
+ Output shape:
+ 4D tensor with shape:
+ `[batch, filters, new_rows, new_cols]` if data_format='channels_first'
+ or 4D tensor with shape:
+ `[batch, new_rows, new_cols, filters]` if data_format='channels_last'.
+ `rows` and `cols` values might have changed due to padding.
+ """
+
+ def __init__(self,
+ kernel_size,
+ strides=(1, 1),
+ padding='valid',
+ depth_multiplier=1,
+ data_format=None,
+ activation=None,
+ use_bias=True,
+ depthwise_initializer='glorot_uniform',
+ bias_initializer='zeros',
+ depthwise_regularizer=None,
+ bias_regularizer=None,
+ activity_regularizer=None,
+ depthwise_constraint=None,
+ bias_constraint=None,
+ **kwargs):
+ super(DepthwiseConv2D, self).__init__(
+ filters=None,
+ kernel_size=kernel_size,
+ strides=strides,
+ padding=padding,
+ data_format=data_format,
+ activation=activation,
+ use_bias=use_bias,
+ bias_regularizer=bias_regularizer,
+ activity_regularizer=activity_regularizer,
+ bias_constraint=bias_constraint,
+ **kwargs)
+ self.depth_multiplier = depth_multiplier
+ self.depthwise_initializer = initializers.get(depthwise_initializer)
+ self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+ self.depthwise_constraint = constraints.get(depthwise_constraint)
+ self.bias_initializer = initializers.get(bias_initializer)
+
+ def build(self, input_shape):
+ if len(input_shape) < 4:
+ raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
+ 'Received input shape:', str(input_shape))
+ if self.data_format == 'channels_first':
+ channel_axis = 1
+ else:
+ channel_axis = 3
+ if input_shape[channel_axis] is None:
+ raise ValueError('The channel dimension of the inputs to '
+ '`DepthwiseConv2D` '
+ 'should be defined. Found `None`.')
+ input_dim = int(input_shape[channel_axis])
+ depthwise_kernel_shape = (self.kernel_size[0], self.kernel_size[1],
+ input_dim, self.depth_multiplier)
+
+ self.depthwise_kernel = self.add_weight(
+ shape=depthwise_kernel_shape,
+ initializer=self.depthwise_initializer,
+ name='depthwise_kernel',
+ regularizer=self.depthwise_regularizer,
+ constraint=self.depthwise_constraint)
+
+ if self.use_bias:
+ self.bias = self.add_weight(
+ shape=(input_dim * self.depth_multiplier,),
+ initializer=self.bias_initializer,
+ name='bias',
+ regularizer=self.bias_regularizer,
+ constraint=self.bias_constraint)
+ else:
+ self.bias = None
+ # Set input spec.
+ self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+ self.built = True
+
+ def call(self, inputs, training=None):
+ outputs = K.depthwise_conv2d(
+ inputs,
+ self.depthwise_kernel,
+ strides=self.strides,
+ padding=self.padding,
+ dilation_rate=self.dilation_rate,
+ data_format=self.data_format)
+
+ if self.bias:
+ outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
+
+ if self.activation is not None:
+ return self.activation(outputs)
+
+ return outputs
+
+ def compute_output_shape(self, input_shape):
+ if self.data_format == 'channels_first':
+ rows = input_shape[2]
+ cols = input_shape[3]
+ out_filters = input_shape[1] * self.depth_multiplier
+ elif self.data_format == 'channels_last':
+ rows = input_shape[1]
+ cols = input_shape[2]
+ out_filters = input_shape[3] * self.depth_multiplier
+
+ rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
+ self.padding, self.strides[0])
+ cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
+ self.padding, self.strides[1])
+
+ if self.data_format == 'channels_first':
+ return (input_shape[0], out_filters, rows, cols)
+ elif self.data_format == 'channels_last':
+ return (input_shape[0], rows, cols, out_filters)
+
+ def get_config(self):
+ config = super(DepthwiseConv2D, self).get_config()
+ config.pop('filters')
+ config.pop('kernel_initializer')
+ config.pop('kernel_regularizer')
+ config.pop('kernel_constraint')
+ config['depth_multiplier'] = self.depth_multiplier
+ config['depthwise_initializer'] = initializers.serialize(
+ self.depthwise_initializer)
+ config['depthwise_regularizer'] = regularizers.serialize(
+ self.depthwise_regularizer)
+ config['depthwise_constraint'] = constraints.serialize(
+ self.depthwise_constraint)
+ return config
+
+
+def MobileNet(input_shape=None, # pylint: disable=invalid-name
+ alpha=1.0,
+ depth_multiplier=1,
+ dropout=1e-3,
+ include_top=True,
+ weights='imagenet',
+ input_tensor=None,
+ pooling=None,
+ classes=1000):
+ """Instantiates the MobileNet architecture.
+
+ Note that only TensorFlow is supported for now,
+ therefore it only works with the data format
+ `image_data_format='channels_last'` in your Keras config
+ at `~/.keras/keras.json`.
+
+ To load a MobileNet model via `load_model`, import the custom
+ objects `relu6` and `DepthwiseConv2D` and pass them to the
+ `custom_objects` parameter.
+ E.g.
+ model = load_model('mobilenet.h5', custom_objects={
+ 'relu6': mobilenet.relu6,
+ 'DepthwiseConv2D': mobilenet.DepthwiseConv2D})
+
+ Arguments:
+ input_shape: optional shape tuple, only to be specified
+ if `include_top` is False (otherwise the input shape
+ has to be `(224, 224, 3)` (with `channels_last` data format)
+ or `(3, 224, 224)` (with `channels_first` data format)).
+ It should have exactly 3 input channels,
+ and width and height should be no smaller than 32.
+ E.g. `(200, 200, 3)` would be one valid value.
+ alpha: controls the width of the network.
+ - If `alpha` < 1.0, proportionally decreases the number
+ of filters in each layer.
+ - If `alpha` > 1.0, proportionally increases the number
+ of filters in each layer.
+ - If `alpha` = 1, default number of filters from the paper
+ are used at each layer.
+ depth_multiplier: depth multiplier for depthwise convolution
+ (also called the resolution multiplier)
+ dropout: dropout rate
+ include_top: whether to include the fully-connected
+ layer at the top of the network.
+ weights: `None` (random initialization) or
+ `imagenet` (ImageNet weights)
+ input_tensor: optional Keras tensor (i.e. output of
+ `layers.Input()`)
+ to use as image input for the model.
+ pooling: Optional pooling mode for feature extraction
+ when `include_top` is `False`.
+ - `None` means that the output of the model
+ will be the 4D tensor output of the
+ last convolutional layer.
+ - `avg` means that global average pooling
+ will be applied to the output of the
+ last convolutional layer, and thus
+ the output of the model will be a
+ 2D tensor.
+ - `max` means that global max pooling will
+ be applied.
+ classes: optional number of classes to classify images
+ into, only to be specified if `include_top` is True, and
+ if no `weights` argument is specified.
+
+ Returns:
+ A Keras model instance.
+
+ Raises:
+ ValueError: in case of invalid argument for `weights`,
+ or invalid input shape.
+ RuntimeError: If attempting to run this model with a
+ backend that does not support separable convolutions.
+ """
+
+ if K.backend() != 'tensorflow':
+ raise RuntimeError('Only TensorFlow backend is currently supported, '
+ 'as other backends do not support '
+ 'depthwise convolution.')
+
+ if weights not in {'imagenet', None}:
+ raise ValueError('The `weights` argument should be either '
+ '`None` (random initialization) or `imagenet` '
+ '(pre-training on ImageNet).')
+
+ if weights == 'imagenet' and include_top and classes != 1000:
+ raise ValueError('If using `weights` as ImageNet with `include_top` '
+ 'as true, `classes` should be 1000')
+
+ # Determine proper input shape.
+ input_shape = _obtain_input_shape(
+ input_shape,
+ default_size=224,
+ min_size=32,
+ data_format=K.image_data_format(),
+ include_top=include_top or weights)
+ if K.image_data_format() == 'channels_last':
+ row_axis, col_axis = (0, 1)
+ else:
+ row_axis, col_axis = (1, 2)
+ rows = input_shape[row_axis]
+ cols = input_shape[col_axis]
+
+ if weights == 'imagenet':
+ if depth_multiplier != 1:
+ raise ValueError('If imagenet weights are being loaded, '
+ 'depth multiplier must be 1')
+
+ if alpha not in [0.25, 0.50, 0.75, 1.0]:
+ raise ValueError('If imagenet weights are being loaded, '
+ 'alpha can be one of'
+ '`0.25`, `0.50`, `0.75` or `1.0` only.')
+
+ if rows != cols or rows not in [128, 160, 192, 224]:
+ raise ValueError('If imagenet weights are being loaded, '
+ 'input must have a static square shape (one of '
+ '(128,128), (160,160), (192,192), or (224, 224)).'
+ ' Input shape provided = %s' % (input_shape,))
+
+ if K.image_data_format() != 'channels_last':
+ warnings.warn('The MobileNet family of models is only available '
+ 'for the input data format "channels_last" '
+ '(width, height, channels). '
+ 'However your settings specify the default '
+ 'data format "channels_first" (channels, width, height).'
+ ' You should set `image_data_format="channels_last"` '
+ 'in your Keras config located at ~/.keras/keras.json. '
+ 'The model being returned right now will expect inputs '
+ 'to follow the "channels_last" data format.')
+ K.set_image_data_format('channels_last')
+ old_data_format = 'channels_first'
+ else:
+ old_data_format = None
+
+ if input_tensor is None:
+ img_input = Input(shape=input_shape)
+ else:
+ if not K.is_keras_tensor(input_tensor):
+ img_input = Input(tensor=input_tensor, shape=input_shape)
+ else:
+ img_input = input_tensor
+
+ x = _conv_block(img_input, 32, alpha, strides=(2, 2))
+ x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id=1)
+
+ x = _depthwise_conv_block(
+ x, 128, alpha, depth_multiplier, strides=(2, 2), block_id=2)
+ x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id=3)
+
+ x = _depthwise_conv_block(
+ x, 256, alpha, depth_multiplier, strides=(2, 2), block_id=4)
+ x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id=5)
+
+ x = _depthwise_conv_block(
+ x, 512, alpha, depth_multiplier, strides=(2, 2), block_id=6)
+ x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=7)
+ x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=8)
+ x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=9)
+ x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=10)
+ x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=11)
+
+ x = _depthwise_conv_block(
+ x, 1024, alpha, depth_multiplier, strides=(2, 2), block_id=12)
+ x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id=13)
+
+ if include_top:
+ if K.image_data_format() == 'channels_first':
+ shape = (int(1024 * alpha), 1, 1)
+ else:
+ shape = (1, 1, int(1024 * alpha))
+
+ x = GlobalAveragePooling2D()(x)
+ x = Reshape(shape, name='reshape_1')(x)
+ x = Dropout(dropout, name='dropout')(x)
+ x = Conv2D(classes, (1, 1), padding='same', name='conv_preds')(x)
+ x = Activation('softmax', name='act_softmax')(x)
+ x = Reshape((classes,), name='reshape_2')(x)
+ else:
+ if pooling == 'avg':
+ x = GlobalAveragePooling2D()(x)
+ elif pooling == 'max':
+ x = GlobalMaxPooling2D()(x)
+
+ # Ensure that the model takes into account
+ # any potential predecessors of `input_tensor`.
+ if input_tensor is not None:
+ inputs = get_source_inputs(input_tensor)
+ else:
+ inputs = img_input
+
+ # Create model.
+ model = Model(inputs, x, name='mobilenet_%0.2f_%s' % (alpha, rows))
+
+ # load weights
+ if weights == 'imagenet':
+ if K.image_data_format() == 'channels_first':
+ raise ValueError('Weights for "channels_last" format '
+ 'are not available.')
+ if alpha == 1.0:
+ alpha_text = '1_0'
+ elif alpha == 0.75:
+ alpha_text = '7_5'
+ elif alpha == 0.50:
+ alpha_text = '5_0'
+ else:
+ alpha_text = '2_5'
+
+ if include_top:
+ model_name = 'mobilenet_%s_%d_tf.h5' % (alpha_text, rows)
+ weigh_path = BASE_WEIGHT_PATH + model_name
+ weights_path = get_file(model_name, weigh_path, cache_subdir='models')
+ else:
+ model_name = 'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, rows)
+ weigh_path = BASE_WEIGHT_PATH + model_name
+ weights_path = get_file(model_name, weigh_path, cache_subdir='models')
+ model.load_weights(weights_path)
+
+ if old_data_format:
+ K.set_image_data_format(old_data_format)
+ return model
+
+
+def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
+ """Adds an initial convolution layer (with batch normalization and relu6).
+
+ Arguments:
+ inputs: Input tensor of shape `(rows, cols, 3)`
+ (with `channels_last` data format) or
+ (3, rows, cols) (with `channels_first` data format).
+ It should have exactly 3 input channels,
+ and width and height should be no smaller than 32.
+ E.g. `(224, 224, 3)` would be one valid value.
+ filters: Integer, the dimensionality of the output space
+ (i.e. the number of output filters in the convolution).
+ alpha: controls the width of the network.
+ - If `alpha` < 1.0, proportionally decreases the number
+ of filters in each layer.
+ - If `alpha` > 1.0, proportionally increases the number
+ of filters in each layer.
+ - If `alpha` = 1, default number of filters from the paper
+ are used at each layer.
+ kernel: An integer or tuple/list of 2 integers, specifying the
+ width and height of the 2D convolution window.
+ Can be a single integer to specify the same value for
+ all spatial dimensions.
+ strides: An integer or tuple/list of 2 integers,
+ specifying the strides of the convolution along the width and height.
+ Can be a single integer to specify the same value for
+ all spatial dimensions.
+ Specifying any stride value != 1 is incompatible with specifying
+ any `dilation_rate` value != 1.
+
+ Input shape:
+ 4D tensor with shape:
+ `(samples, channels, rows, cols)` if data_format='channels_first'
+ or 4D tensor with shape:
+ `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+ Output shape:
+ 4D tensor with shape:
+ `(samples, filters, new_rows, new_cols)` if data_format='channels_first'
+ or 4D tensor with shape:
+ `(samples, new_rows, new_cols, filters)` if data_format='channels_last'.
+ `rows` and `cols` values might have changed due to stride.
+
+ Returns:
+ Output tensor of block.
+ """
+ channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+ filters = int(filters * alpha)
+ x = Conv2D(
+ filters,
+ kernel,
+ padding='same',
+ use_bias=False,
+ strides=strides,
+ name='conv1')(inputs)
+ x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
+ return Activation(relu6, name='conv1_relu')(x)
+
+
+def _depthwise_conv_block(inputs,
+ pointwise_conv_filters,
+ alpha,
+ depth_multiplier=1,
+ strides=(1, 1),
+ block_id=1):
+ """Adds a depthwise convolution block.
+
+ A depthwise convolution block consists of a depthwise conv,
+ batch normalization, relu6, pointwise convolution,
+ batch normalization and relu6 activation.
+
+ Arguments:
+ inputs: Input tensor of shape `(rows, cols, channels)`
+ (with `channels_last` data format) or
+ (channels, rows, cols) (with `channels_first` data format).
+ pointwise_conv_filters: Integer, the dimensionality of the output space
+ (i.e. the number of output filters in the pointwise convolution).
+ alpha: controls the width of the network.
+ - If `alpha` < 1.0, proportionally decreases the number
+ of filters in each layer.
+ - If `alpha` > 1.0, proportionally increases the number
+ of filters in each layer.
+ - If `alpha` = 1, default number of filters from the paper
+ are used at each layer.
+ depth_multiplier: The number of depthwise convolution output channels
+ for each input channel.
+ The total number of depthwise convolution output
+ channels will be equal to `filters_in * depth_multiplier`.
+ strides: An integer or tuple/list of 2 integers,
+ specifying the strides of the convolution along the width and height.
+ Can be a single integer to specify the same value for
+ all spatial dimensions.
+ Specifying any stride value != 1 is incompatible with specifying
+ any `dilation_rate` value != 1.
+ block_id: Integer, a unique identification designating the block number.
+
+ Input shape:
+ 4D tensor with shape:
+ `(batch, channels, rows, cols)` if data_format='channels_first'
+ or 4D tensor with shape:
+ `(batch, rows, cols, channels)` if data_format='channels_last'.
+
+ Output shape:
+ 4D tensor with shape:
+ `(batch, filters, new_rows, new_cols)` if data_format='channels_first'
+ or 4D tensor with shape:
+ `(batch, new_rows, new_cols, filters)` if data_format='channels_last'.
+ `rows` and `cols` values might have changed due to stride.
+
+ Returns:
+ Output tensor of block.
+ """
+ channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+ pointwise_conv_filters = int(pointwise_conv_filters * alpha)
+
+ x = DepthwiseConv2D( # pylint: disable=not-callable
+ (3, 3),
+ padding='same',
+ depth_multiplier=depth_multiplier,
+ strides=strides,
+ use_bias=False,
+ name='conv_dw_%d' % block_id)(inputs)
+ x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
+ x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)
+
+ x = Conv2D(
+ pointwise_conv_filters, (1, 1),
+ padding='same',
+ use_bias=False,
+ strides=(1, 1),
+ name='conv_pw_%d' % block_id)(x)
+ x = BatchNormalization(axis=channel_axis, name='conv_pw_%d_bn' % block_id)(x)
+ return Activation(relu6, name='conv_pw_%d_relu' % block_id)(x)
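Usage sketch for the new MobileNet application (illustrative only, not part of the commit; the save path 'my_mobilenet.h5' is hypothetical). As the docstring above notes, reloading a saved MobileNet through `load_model` requires passing `relu6` and `DepthwiseConv2D` as custom objects:

from tensorflow.contrib.keras.python.keras.applications import mobilenet
from tensorflow.contrib.keras.python.keras.models import load_model

# Randomly initialized MobileNet with the default 224x224 input and alpha=1.0.
model = mobilenet.MobileNet(weights=None)
model.save('my_mobilenet.h5')

# The custom activation and layer defined in mobilenet.py must be supplied on reload.
restored = load_model('my_mobilenet.h5',
                      custom_objects={'relu6': mobilenet.relu6,
                                      'DepthwiseConv2D': mobilenet.DepthwiseConv2D})
print(restored.output_shape)  # (None, 1000)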
diff --git a/tensorflow/contrib/keras/python/keras/applications/mobilenet_test.py b/tensorflow/contrib/keras/python/keras/applications/mobilenet_test.py
new file mode 100644
index 0000000000..6aa786f9b1
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/mobilenet_test.py
@@ -0,0 +1,42 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MobileNet application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class MobileNetTest(test.TestCase):
+
+ def test_with_top(self):
+ model = keras.applications.MobileNet(weights=None)
+ self.assertEqual(model.output_shape, (None, 1000))
+
+ def test_no_top(self):
+ model = keras.applications.MobileNet(weights=None, include_top=False)
+ self.assertEqual(model.output_shape, (None, None, None, 1024))
+
+ def test_with_pooling(self):
+ model = keras.applications.MobileNet(weights=None,
+ include_top=False,
+ pooling='avg')
+ self.assertEqual(model.output_shape, (None, 1024))
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py
index 324f510301..5175bd0040 100644
--- a/tensorflow/contrib/keras/python/keras/backend.py
+++ b/tensorflow/contrib/keras/python/keras/backend.py
@@ -708,7 +708,7 @@ def dtype(x):
'float32_ref'
```
"""
- return x.dtype.name
+ return x.dtype.base_dtype.name
def eval(x):
@@ -1309,7 +1309,7 @@ def max(x, axis=None, keepdims=False):
A tensor with maximum values of `x`.
"""
axis = _normalize_axis(axis, ndim(x))
- return math_ops.reduce_max(x, reduction_indices=axis, keep_dims=keepdims)
+ return math_ops.reduce_max(x, axis=axis, keep_dims=keepdims)
def min(x, axis=None, keepdims=False):
@@ -1327,7 +1327,7 @@ def min(x, axis=None, keepdims=False):
A tensor with miminum values of `x`.
"""
axis = _normalize_axis(axis, ndim(x))
- return math_ops.reduce_min(x, reduction_indices=axis, keep_dims=keepdims)
+ return math_ops.reduce_min(x, axis=axis, keep_dims=keepdims)
def sum(x, axis=None, keepdims=False):
@@ -1345,7 +1345,7 @@ def sum(x, axis=None, keepdims=False):
A tensor with sum of `x`.
"""
axis = _normalize_axis(axis, ndim(x))
- return math_ops.reduce_sum(x, reduction_indices=axis, keep_dims=keepdims)
+ return math_ops.reduce_sum(x, axis=axis, keep_dims=keepdims)
def prod(x, axis=None, keepdims=False):
@@ -1363,7 +1363,7 @@ def prod(x, axis=None, keepdims=False):
A tensor with the product of elements of `x`.
"""
axis = _normalize_axis(axis, ndim(x))
- return math_ops.reduce_prod(x, reduction_indices=axis, keep_dims=keepdims)
+ return math_ops.reduce_prod(x, axis=axis, keep_dims=keepdims)
def cumsum(x, axis=0):
@@ -1411,10 +1411,10 @@ def var(x, axis=None, keepdims=False):
axis = _normalize_axis(axis, ndim(x))
if x.dtype.base_dtype == dtypes_module.bool:
x = math_ops.cast(x, floatx())
- m = math_ops.reduce_mean(x, reduction_indices=axis, keep_dims=True)
+ m = math_ops.reduce_mean(x, axis=axis, keep_dims=True)
devs_squared = math_ops.square(x - m)
return math_ops.reduce_mean(
- devs_squared, reduction_indices=axis, keep_dims=keepdims)
+ devs_squared, axis=axis, keep_dims=keepdims)
def std(x, axis=None, keepdims=False):
@@ -1451,7 +1451,7 @@ def mean(x, axis=None, keepdims=False):
axis = _normalize_axis(axis, ndim(x))
if x.dtype.base_dtype == dtypes_module.bool:
x = math_ops.cast(x, floatx())
- return math_ops.reduce_mean(x, reduction_indices=axis, keep_dims=keepdims)
+ return math_ops.reduce_mean(x, axis=axis, keep_dims=keepdims)
def any(x, axis=None, keepdims=False):
@@ -1467,7 +1467,7 @@ def any(x, axis=None, keepdims=False):
"""
axis = _normalize_axis(axis, ndim(x))
x = math_ops.cast(x, dtypes_module.bool)
- return math_ops.reduce_any(x, reduction_indices=axis, keep_dims=keepdims)
+ return math_ops.reduce_any(x, axis=axis, keep_dims=keepdims)
def all(x, axis=None, keepdims=False):
@@ -1483,7 +1483,7 @@ def all(x, axis=None, keepdims=False):
"""
axis = _normalize_axis(axis, ndim(x))
x = math_ops.cast(x, dtypes_module.bool)
- return math_ops.reduce_all(x, reduction_indices=axis, keep_dims=keepdims)
+ return math_ops.reduce_all(x, axis=axis, keep_dims=keepdims)
def argmax(x, axis=-1):
@@ -2889,13 +2889,13 @@ def categorical_crossentropy(output, target, from_logits=False):
if not from_logits:
# scale preds so that the class probas of each sample sum to 1
output /= math_ops.reduce_sum(
- output, reduction_indices=len(output.get_shape()) - 1, keep_dims=True)
+ output, axis=len(output.get_shape()) - 1, keep_dims=True)
# manual computation of crossentropy
epsilon = _to_tensor(_EPSILON, output.dtype.base_dtype)
output = clip_ops.clip_by_value(output, epsilon, 1. - epsilon)
return -math_ops.reduce_sum(
target * math_ops.log(output),
- reduction_indices=len(output.get_shape()) - 1)
+ axis=len(output.get_shape()) - 1)
else:
return nn.softmax_cross_entropy_with_logits(labels=target, logits=output)
@@ -2914,7 +2914,7 @@ def sparse_categorical_crossentropy(output, target, from_logits=False):
Returns:
Output tensor.
"""
- # Note: nn.softmax_cross_entropy_with_logits
+ # Note: nn.sparse_softmax_cross_entropy_with_logits
# expects logits, Keras expects probabilities.
if not from_logits:
epsilon = _to_tensor(_EPSILON, output.dtype.base_dtype)
@@ -3018,7 +3018,7 @@ def dropout(x, level, noise_shape=None, seed=None):
if seed is None:
seed = np.random.randint(10e6)
# the dummy 1. works around a TF bug
- # (float32_ref vs. float32 incomptability)
+ # (float32_ref vs. float32 incompatibility)
return nn.dropout(x * 1., retain_prob, noise_shape, seed=seed)
@@ -3380,6 +3380,42 @@ def separable_conv2d(x,
return _postprocess_conv2d_output(x, data_format)
+def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid',
+ data_format=None, dilation_rate=(1, 1)):
+ """Depthwise 2D convolution.
+
+ Arguments:
+ x: input tensor
+ depthwise_kernel: convolution kernel for the depthwise convolution.
+ strides: strides tuple (length 2).
+ padding: string, `"same"` or `"valid"`.
+ data_format: string, `"channels_last"` or `"channels_first"`.
+ dilation_rate: tuple of integers,
+ dilation rates for the separable convolution.
+
+ Returns:
+ Output tensor.
+
+ Raises:
+ ValueError: if `data_format` is neither `channels_last`
+ nor `channels_first`.
+ """
+ if data_format is None:
+ data_format = image_data_format()
+ if data_format not in {'channels_first', 'channels_last'}:
+ raise ValueError('Unknown data_format ' + str(data_format))
+
+ x = _preprocess_conv2d_input(x, data_format)
+ padding = _preprocess_padding(padding)
+ strides = (1,) + strides + (1,)
+
+ x = nn.depthwise_conv2d(x, depthwise_kernel,
+ strides=strides,
+ padding=padding,
+ rate=dilation_rate)
+ return _postprocess_conv2d_output(x, data_format)
+
+
def conv3d(x,
kernel,
strides=(1, 1, 1),
@@ -3515,41 +3551,177 @@ def pool3d(x,
return _postprocess_conv3d_output(x, data_format)
+def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None):
+ """Apply 1D conv with un-shared weights.
+
+ Arguments:
+ inputs: 3D tensor with shape: (batch_size, steps, input_dim)
+ kernel: the unshared weight for convolution,
+ with shape (output_length, feature_dim, filters)
+ kernel_size: a tuple of a single integer,
+ specifying the length of the 1D convolution window
+ strides: a tuple of a single integer,
+ specifying the stride length of the convolution
+ data_format: the data format, channels_first or channels_last
+
+ Returns:
+ the tensor after 1d conv with un-shared weights, with shape (batch_size,
+ output_length, filters)
+
+ Raises:
+ ValueError: if `data_format` is neither `channels_last` nor
+ `channels_first`.
+ """
+ if data_format is None:
+ data_format = image_data_format()
+ if data_format not in {'channels_first', 'channels_last'}:
+ raise ValueError('Unknown data_format ' + str(data_format))
+
+ stride = strides[0]
+ kernel_shape = int_shape(kernel)
+ output_length = kernel_shape[0]
+ feature_dim = kernel_shape[1]
+
+ xs = []
+ for i in range(output_length):
+ slice_length = slice(i * stride, i * stride + kernel_size[0])
+ xs.append(reshape(inputs[:, slice_length, :], (1, -1, feature_dim)))
+ x_aggregate = concatenate(xs, axis=0)
+ # Shape: `(output_length, batch_size, filters)`.
+ output = batch_dot(x_aggregate, kernel)
+ return permute_dimensions(output, (1, 0, 2))
+
+
+def local_conv2d(inputs,
+ kernel,
+ kernel_size,
+ strides,
+ output_shape,
+ data_format=None):
+ """Apply 2D conv with un-shared weights.
+
+ Arguments:
+ inputs: 4D tensor with shape:
+ (batch_size, filters, new_rows, new_cols)
+ if data_format='channels_first'
+ or 4D tensor with shape:
+ (batch_size, new_rows, new_cols, filters)
+ if data_format='channels_last'.
+ kernel: the unshared weight for convolution,
+ with shape (output_items, feature_dim, filters)
+ kernel_size: a tuple of 2 integers, specifying the
+ width and height of the 2D convolution window.
+ strides: a tuple of 2 integers, specifying the strides
+ of the convolution along the width and height.
+ output_shape: a tuple with (output_row, output_col)
+ data_format: the data format, channels_first or channels_last
+
+ Returns:
+ A 4d tensor with shape:
+ (batch_size, filters, new_rows, new_cols)
+ if data_format='channels_first'
+ or 4D tensor with shape:
+ (batch_size, new_rows, new_cols, filters)
+ if data_format='channels_last'.
+
+ Raises:
+ ValueError: if `data_format` is neither
+ `channels_last` nor `channels_first`.
+ """
+ if data_format is None:
+ data_format = image_data_format()
+ if data_format not in {'channels_first', 'channels_last'}:
+ raise ValueError('Unknown data_format ' + str(data_format))
+
+ stride_row, stride_col = strides
+ output_row, output_col = output_shape
+ kernel_shape = int_shape(kernel)
+ feature_dim = kernel_shape[1]
+ filters = kernel_shape[2]
+
+ xs = []
+ for i in range(output_row):
+ for j in range(output_col):
+ slice_row = slice(i * stride_row, i * stride_row + kernel_size[0])
+ slice_col = slice(j * stride_col, j * stride_col + kernel_size[1])
+ if data_format == 'channels_first':
+ xs.append(
+ reshape(inputs[:, :, slice_row, slice_col], (1, -1, feature_dim)))
+ else:
+ xs.append(
+ reshape(inputs[:, slice_row, slice_col, :], (1, -1, feature_dim)))
+
+ x_aggregate = concatenate(xs, axis=0)
+ output = batch_dot(x_aggregate, kernel)
+ output = reshape(output, (output_row, output_col, -1, filters))
+
+ if data_format == 'channels_first':
+ output = permute_dimensions(output, (2, 3, 0, 1))
+ else:
+ output = permute_dimensions(output, (2, 0, 1, 3))
+ return output
+
+
def bias_add(x, bias, data_format=None):
"""Adds a bias vector to a tensor.
Arguments:
x: Tensor or variable.
bias: Bias tensor to add.
- data_format: Data format for 3D, 4D or 5D tensors:
- one of "channels_first", "channels_last".
+ data_format: string, `"channels_last"` or `"channels_first"`.
Returns:
Output tensor.
Raises:
- ValueError: In case of invalid `data_format` argument.
+ ValueError: In one of the two cases below:
+ 1. invalid `data_format` argument.
+ 2. invalid bias shape.
+ the bias should be either a vector or
+ a tensor with ndim(x) - 1 dimensions
"""
if data_format is None:
data_format = image_data_format()
if data_format not in {'channels_first', 'channels_last'}:
raise ValueError('Unknown data_format ' + str(data_format))
+ bias_shape = int_shape(bias)
+ if len(bias_shape) != 1 and len(bias_shape) != ndim(x) - 1:
+ raise ValueError(
+ 'Unexpected bias dimensions %d, expected to be 1 or %d dimensions' %
+ (len(bias_shape), ndim(x) - 1))
if ndim(x) == 5:
if data_format == 'channels_first':
- x += reshape(bias, (1, int_shape(bias)[0], 1, 1, 1))
+ if len(bias_shape) == 1:
+ x += reshape(bias, (1, bias_shape[0], 1, 1, 1))
+ else:
+ x += reshape(bias, (1, bias_shape[3]) + bias_shape[:3])
elif data_format == 'channels_last':
- x += reshape(bias, (1, 1, 1, 1, int_shape(bias)[0]))
+ if len(bias_shape) == 1:
+ x += reshape(bias, (1, 1, 1, bias_shape[0]))
+ else:
+ x += reshape(bias, (1,) + bias_shape)
elif ndim(x) == 4:
if data_format == 'channels_first':
- # No support yet for NCHW in bias_add.
- x += reshape(bias, (1, int_shape(bias)[0], 1, 1))
+ if len(bias_shape) == 1:
+ x += reshape(bias, (1, bias_shape[0], 1, 1))
+ else:
+ x += reshape(bias, (1, bias_shape[2]) + bias_shape[:2])
elif data_format == 'channels_last':
- x = nn.bias_add(x, bias, data_format='NHWC')
+ if len(bias_shape) == 1:
+ x = nn.bias_add(x, bias, data_format='NHWC')
+ else:
+ x += reshape(bias, (1,) + bias_shape)
elif ndim(x) == 3:
if data_format == 'channels_first':
- x += reshape(bias, (1, int_shape(bias)[0], 1))
+ if len(bias_shape) == 1:
+ x += reshape(bias, (1, bias_shape[0], 1))
+ else:
+ x += reshape(bias, (1, bias_shape[1], bias_shape[0]))
elif data_format == 'channels_last':
- x += reshape(bias, (1, 1, int_shape(bias)[0]))
+ if len(bias_shape) == 1:
+ x += reshape(bias, (1, 1, bias_shape[0]))
+ else:
+ x += reshape(bias, (1,) + bias_shape)
else:
x = nn.bias_add(x, bias)
return x
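Sketch of the extended `bias_add` behavior introduced above (illustrative only, not part of the commit): in addition to a per-channel bias vector, a bias of rank `ndim(x) - 1` is now accepted and broadcast over the batch dimension.

import numpy as np
from tensorflow.contrib.keras.python.keras import backend as K

x = K.variable(np.zeros((2, 5, 5, 3)))         # batch of 5x5 feature maps, channels_last
vector_bias = K.variable(np.ones((3,)))        # one bias value per channel
spatial_bias = K.variable(np.ones((5, 5, 3)))  # one bias value per position and channel

y1 = K.bias_add(x, vector_bias, data_format='channels_last')
y2 = K.bias_add(x, spatial_bias, data_format='channels_last')
print(K.int_shape(y1), K.int_shape(y2))  # (2, 5, 5, 3) (2, 5, 5, 3)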
diff --git a/tensorflow/contrib/keras/python/keras/callbacks.py b/tensorflow/contrib/keras/python/keras/callbacks.py
index d0587a549b..6df6662081 100644
--- a/tensorflow/contrib/keras/python/keras/callbacks.py
+++ b/tensorflow/contrib/keras/python/keras/callbacks.py
@@ -513,7 +513,9 @@ class EarlyStopping(Callback):
def on_epoch_end(self, epoch, logs=None):
current = logs.get(self.monitor)
if current is None:
- logging.warning('Early stopping requires %s available!' % (self.monitor))
+ logging.warning('Early stopping conditioned on metric `%s` '
+ 'which is not available. Available metrics are: %s' %
+ (self.monitor, ','.join(list(logs.keys()))))
if self.monitor_op(current - self.min_delta, self.best):
self.best = current
@@ -680,10 +682,11 @@ class TensorBoard(Callback):
if self.histogram_freq and self.merged is None:
for layer in self.model.layers:
for weight in layer.weights:
- tf_summary.histogram(weight.name, weight)
+ mapped_weight_name = weight.name.replace(':', '_')
+ tf_summary.histogram(mapped_weight_name, weight)
if self.write_grads:
grads = model.optimizer.get_gradients(model.total_loss, weight)
- tf_summary.histogram('{}_grad'.format(weight.name), grads)
+ tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
if self.write_images:
w_img = array_ops.squeeze(weight)
shape = K.int_shape(w_img)
@@ -708,7 +711,7 @@ class TensorBoard(Callback):
shape = K.int_shape(w_img)
assert len(shape) == 4 and shape[-1] in [1, 3, 4]
- tf_summary.image(weight.name, w_img)
+ tf_summary.image(mapped_weight_name, w_img)
if hasattr(layer, 'output'):
tf_summary.histogram('{}_out'.format(layer.name), layer.output)
@@ -896,8 +899,9 @@ class ReduceLROnPlateau(Callback):
logs['lr'] = K.get_value(self.model.optimizer.lr)
current = logs.get(self.monitor)
if current is None:
- logging.warning('Learning Rate Plateau Reducing requires %s available!' %
- self.monitor)
+ logging.warning('Reduce LR on plateau conditioned on metric `%s` '
+ 'which is not available. Available metrics are: %s' %
+ (self.monitor, ','.join(list(logs.keys()))))
else:
if self.in_cooldown():
self.cooldown_counter -= 1
@@ -998,7 +1002,7 @@ class CSVLogger(Callback):
class LambdaCallback(Callback):
- """Callback for creating simple, custom callbacks on-the-fly.
+ r"""Callback for creating simple, custom callbacks on-the-fly.
This callback is constructed with anonymous functions that will be called
at the appropriate time. Note that the callbacks expects positional
@@ -1020,17 +1024,21 @@ class LambdaCallback(Callback):
on_train_end: called at the end of model training.
Example:
+
```python
# Print the batch number at the beginning of every batch.
batch_print_callback = LambdaCallback(
on_batch_begin=lambda batch,logs: print(batch))
- # Plot the loss after every epoch.
- import numpy as np
- import matplotlib.pyplot as plt
- plot_loss_callback = LambdaCallback(
- on_epoch_end=lambda epoch, logs: plt.plot(np.arange(epoch),
- logs['loss']))
+ # Stream the epoch loss to a file in JSON format. The file content
+ # is not well-formed JSON but rather has a JSON object per line.
+ import json
+ json_log = open('loss_log.json', mode='wt', buffering=1)
+ json_logging_callback = LambdaCallback(
+ on_epoch_end=lambda epoch, logs: json_log.write(
+ json.dumps({'epoch': epoch, 'loss': logs['loss']}) + '\n'),
+ on_train_end=lambda logs: json_log.close()
+ )
# Terminate some processes after having finished model training.
processes = ...
@@ -1040,7 +1048,7 @@ class LambdaCallback(Callback):
model.fit(...,
callbacks=[batch_print_callback,
- plot_loss_callback,
+ json_logging_callback,
cleanup_callback])
```
"""
diff --git a/tensorflow/contrib/keras/python/keras/constraints.py b/tensorflow/contrib/keras/python/keras/constraints.py
index 91d6153862..0a59dd92c1 100644
--- a/tensorflow/contrib/keras/python/keras/constraints.py
+++ b/tensorflow/contrib/keras/python/keras/constraints.py
@@ -47,7 +47,7 @@ class MaxNorm(Constraint):
has shape `(input_dim, output_dim)`,
set `axis` to `0` to constrain each weight vector
of length `(input_dim,)`.
- In a `Convolution2D` layer with `data_format="channels_last"`,
+ In a `Conv2D` layer with `data_format="channels_last"`,
the weight tensor has shape
`(rows, cols, input_depth, output_depth)`,
set `axis` to `[0, 1, 2]`
@@ -92,7 +92,7 @@ class UnitNorm(Constraint):
has shape `(input_dim, output_dim)`,
set `axis` to `0` to constrain each weight vector
of length `(input_dim,)`.
- In a `Convolution2D` layer with `data_format="channels_last"`,
+ In a `Conv2D` layer with `data_format="channels_last"`,
the weight tensor has shape
`(rows, cols, input_depth, output_depth)`,
set `axis` to `[0, 1, 2]`
@@ -132,7 +132,7 @@ class MinMaxNorm(Constraint):
has shape `(input_dim, output_dim)`,
set `axis` to `0` to constrain each weight vector
of length `(input_dim,)`.
- In a `Convolution2D` layer with `dim_ordering="tf"`,
+ In a `Conv2D` layer with `data_format="channels_last"`,
the weight tensor has shape
`(rows, cols, input_depth, output_depth)`,
set `axis` to `[0, 1, 2]`
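The three docstring fixes above describe the same pattern; a minimal sketch of constraining a `Conv2D` kernel over axes `[0, 1, 2]`, assuming `keras` is importable as in this backport's test files:

```python
from tensorflow.contrib.keras.python import keras

# With data_format='channels_last' the kernel has shape
# (rows, cols, input_depth, output_depth); reducing over [0, 1, 2]
# constrains the norm of each output filter.
constraint = keras.constraints.MaxNorm(max_value=2., axis=[0, 1, 2])

model = keras.models.Sequential()
model.add(keras.layers.Conv2D(8, (3, 3),
                              data_format='channels_last',
                              kernel_constraint=constraint,
                              input_shape=(32, 32, 3)))
```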
diff --git a/tensorflow/contrib/keras/python/keras/engine/topology.py b/tensorflow/contrib/keras/python/keras/engine/topology.py
index 637d0c5a48..c8c746e8af 100644
--- a/tensorflow/contrib/keras/python/keras/engine/topology.py
+++ b/tensorflow/contrib/keras/python/keras/engine/topology.py
@@ -29,13 +29,13 @@ from six.moves import zip # pylint: disable=redefined-builtin
from tensorflow.contrib.keras.python.keras import backend as K
from tensorflow.contrib.keras.python.keras.utils import conv_utils
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import has_arg
from tensorflow.contrib.keras.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
from tensorflow.contrib.keras.python.keras.utils.layer_utils import print_summary as print_layer_summary
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import base as tf_base_layers
from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import tf_inspect
# pylint: disable=g-import-not-at-top
@@ -386,7 +386,7 @@ class Layer(tf_base_layers.Layer):
user_kwargs = copy.copy(kwargs)
if not _is_all_none(previous_mask):
# The previous layer generated a mask.
- if 'mask' in tf_inspect.getargspec(self.call).args:
+ if has_arg(self.call, 'mask'):
if 'mask' not in kwargs:
# If mask is explicitly passed to __call__,
# we should override the default mask.
@@ -1916,7 +1916,7 @@ class Container(Layer):
kwargs = {}
if len(computed_data) == 1:
computed_tensor, computed_mask = computed_data[0]
- if 'mask' in tf_inspect.getargspec(layer.call).args:
+ if has_arg(layer.call, 'mask'):
if 'mask' not in kwargs:
kwargs['mask'] = computed_mask
output_tensors = _to_list(layer.call(computed_tensor, **kwargs))
@@ -1927,7 +1927,7 @@ class Container(Layer):
else:
computed_tensors = [x[0] for x in computed_data]
computed_masks = [x[1] for x in computed_data]
- if 'mask' in tf_inspect.getargspec(layer.call).args:
+ if has_arg(layer.call, 'mask'):
if 'mask' not in kwargs:
kwargs['mask'] = computed_masks
output_tensors = _to_list(layer.call(computed_tensors, **kwargs))
@@ -2352,8 +2352,25 @@ class Container(Layer):
raise ImportError('Requires yaml module installed.')
return yaml.dump(self._updated_config(), **kwargs)
- def summary(self, line_length=None, positions=None):
- print_layer_summary(self, line_length=line_length, positions=positions)
+ def summary(self, line_length=None, positions=None, print_fn=None):
+ """Prints a string summary of the network.
+
+ Arguments:
+ line_length: Total length of printed lines
+ (e.g. set this to adapt the display to different
+ terminal window sizes).
+ positions: Relative or absolute positions of log elements
+ in each line. If not provided,
+ defaults to `[.33, .55, .67, 1.]`.
+ print_fn: Print function to use. Defaults to `print`.
+ It will be called on each line of the summary.
+ You can set it to a custom function
+ in order to capture the string summary.
+ """
+ print_layer_summary(self,
+ line_length=line_length,
+ positions=positions,
+ print_fn=print_fn)
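A minimal sketch of the new `print_fn` argument, assuming `keras` is importable as in this backport's test files: since `print_fn` is called once per summary line, any callable that takes a string can capture the summary.

```python
from tensorflow.contrib.keras.python import keras

inputs = keras.layers.Input(shape=(3,))
outputs = keras.layers.Dense(2)(inputs)
model = keras.models.Model(inputs, outputs)

# Collect the summary lines instead of printing them.
summary_lines = []
model.summary(print_fn=summary_lines.append)
summary_text = '\n'.join(summary_lines)
```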
def get_source_inputs(tensor, layer=None, node_index=None):
@@ -2610,6 +2627,35 @@ def preprocess_weights_for_loading(layer,
recurrent_kernel = np.transpose(recurrent_kernel, (2, 3, 1, 0))
weights = [kernel, recurrent_kernel, bias]
+ if layer.__class__.__name__ in ['Model', 'Sequential']:
+ new_weights = []
+ # trainable weights
+ for sublayer in layer.layers:
+ num_weights = len(sublayer.trainable_weights)
+ if num_weights > 0:
+ new_weights.extend(
+ preprocess_weights_for_loading(
+ layer=sublayer,
+ weights=weights[:num_weights],
+ original_keras_version=original_keras_version,
+ original_backend=original_backend))
+ weights = weights[num_weights:]
+
+ # non-trainable weights
+ for sublayer in layer.layers:
+ num_weights = len([
+ l for l in sublayer.weights if l not in sublayer.trainable_weights
+ ])
+ if num_weights > 0:
+ new_weights.extend(
+ preprocess_weights_for_loading(
+ layer=sublayer,
+ weights=weights[:num_weights],
+ original_keras_version=original_keras_version,
+ original_backend=original_backend))
+ weights = weights[num_weights:]
+ weights = new_weights
+
conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
if layer.__class__.__name__ in conv_layers:
if original_backend and K.backend() != original_backend:
diff --git a/tensorflow/contrib/keras/python/keras/engine/topology_test.py b/tensorflow/contrib/keras/python/keras/engine/topology_test.py
index 531ed4be3e..ec4fa2eed8 100644
--- a/tensorflow/contrib/keras/python/keras/engine/topology_test.py
+++ b/tensorflow/contrib/keras/python/keras/engine/topology_test.py
@@ -195,13 +195,13 @@ class TopologyConstructionTest(test.TestCase):
self.assertEqual(test_layer.output_shape, (None, 16))
# pylint: disable=pointless-statement
- with self.assertRaises(Exception):
+ with self.assertRaises(AttributeError):
dense.input
- with self.assertRaises(Exception):
+ with self.assertRaises(AttributeError):
dense.output
- with self.assertRaises(Exception):
+ with self.assertRaises(AttributeError):
dense.input_mask
- with self.assertRaises(Exception):
+ with self.assertRaises(AttributeError):
dense.output_mask
# pylint: enable=pointless-statement
diff --git a/tensorflow/contrib/keras/python/keras/engine/training.py b/tensorflow/contrib/keras/python/keras/engine/training.py
index 09459fd713..1563cf8c41 100644
--- a/tensorflow/contrib/keras/python/keras/engine/training.py
+++ b/tensorflow/contrib/keras/python/keras/engine/training.py
@@ -20,9 +20,6 @@ from __future__ import division
from __future__ import print_function
import copy
-import multiprocessing
-import threading
-import time
import numpy as np
import six
@@ -33,18 +30,13 @@ from tensorflow.contrib.keras.python.keras import losses
from tensorflow.contrib.keras.python.keras import metrics as metrics_module
from tensorflow.contrib.keras.python.keras import optimizers
from tensorflow.contrib.keras.python.keras.engine.topology import Container
+from tensorflow.contrib.keras.python.keras.utils.data_utils import GeneratorEnqueuer
+from tensorflow.contrib.keras.python.keras.utils.data_utils import OrderedEnqueuer
+from tensorflow.contrib.keras.python.keras.utils.data_utils import Sequence
from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
from tensorflow.python.platform import tf_logging as logging
-# pylint: disable=g-import-not-at-top
-try:
- import queue
-except ImportError:
- import Queue as queue
-# pylint: enable=g-import-not-at-top
-
-
def _standardize_input_data(data,
names,
shapes=None,
@@ -115,8 +107,9 @@ def _standardize_input_data(data,
if len(names) > 1:
# Case: model expects multiple inputs but only received
# a single Numpy array.
- raise ValueError('The model expects ' + str(len(names)) + exception_prefix
- + ' arrays, but only received one array. '
+ raise ValueError('The model expects ' + str(len(names)) + ' ' +
+ exception_prefix +
+ ' arrays, but only received one array. '
'Found: array with shape ' + str(data.shape))
arrays = [data]
@@ -205,7 +198,7 @@ def _standardize_sample_weights(sample_weight, output_names):
'sample_weight')
-def _check_array_lengths(inputs, targets, weights):
+def _check_array_lengths(inputs, targets, weights=None):
"""Does user input validation for numpy arrays.
Arguments:
@@ -216,29 +209,35 @@ def _check_array_lengths(inputs, targets, weights):
Raises:
ValueError: in case of incorrectly formatted data.
"""
- x_lengths = [x.shape[0] for x in inputs]
- y_lengths = [y.shape[0] for y in targets]
- w_lengths = [w.shape[0] for w in weights]
- set_x = set(x_lengths)
+
+ def set_of_lengths(x):
+    # Returns a set with the first-dimension lengths of the arrays in x,
+    # with None entries mapped to 0.
+ if x is None:
+ return {0}
+ else:
+ return set([0 if y is None else y.shape[0] for y in x])
+
+ set_x = set_of_lengths(inputs)
+ set_y = set_of_lengths(targets)
+ set_w = set_of_lengths(weights)
if len(set_x) > 1:
raise ValueError('All input arrays (x) should have '
'the same number of samples. Got array shapes: ' + str(
[x.shape for x in inputs]))
- set_y = set(y_lengths)
if len(set_y) > 1:
raise ValueError('All target arrays (y) should have '
'the same number of samples. Got array shapes: ' + str(
[y.shape for y in targets]))
- set_w = set(w_lengths)
- if len(set_w) > 1:
- raise ValueError('All sample_weight arrays should have '
- 'the same number of samples. Got array shapes: ' + str(
- [w.shape for w in weights]))
if set_x and set_y and list(set_x)[0] != list(set_y)[0]:
raise ValueError('Input arrays should have '
'the same number of samples as target arrays. '
'Found ' + str(list(set_x)[0]) + ' input samples '
'and ' + str(list(set_y)[0]) + ' target samples.')
+ if len(set_w) > 1:
+ raise ValueError('All sample_weight arrays should have '
+ 'the same number of samples. Got array shapes: ' + str(
+ [w.shape for w in weights]))
if set_y and set_w and list(set_y)[0] != list(set_w)[0]:
raise ValueError('Sample_weight arrays should have '
'the same number of samples as target arrays. Got ' +
@@ -261,7 +260,7 @@ def _check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
is incompatible with an output.
"""
key_losses = {
- 'mean_square_error', 'binary_crossentropy', 'categorical_crossentropy'
+ 'mean_squared_error', 'binary_crossentropy', 'categorical_crossentropy'
}
for y, loss, shape in zip(targets, loss_fns, output_shapes):
if loss is None:
@@ -389,21 +388,25 @@ def _slice_arrays(arrays, start=None, stop=None):
Returns:
A slice of the array(s).
"""
- if isinstance(arrays, list):
+ if arrays is None:
+ return [None]
+ elif isinstance(arrays, list):
if hasattr(start, '__len__'):
# hdf5 datasets only support list objects as indices
if hasattr(start, 'shape'):
start = start.tolist()
- return [x[start] for x in arrays]
+ return [None if x is None else x[start] for x in arrays]
else:
- return [x[start:stop] for x in arrays]
+ return [None if x is None else x[start:stop] for x in arrays]
else:
if hasattr(start, '__len__'):
if hasattr(start, 'shape'):
start = start.tolist()
return arrays[start]
- else:
+ elif hasattr(start, '__getitem__'):
return arrays[start:stop]
+ else:
+ return [None]
def _weighted_masked_objective(fn):
@@ -445,13 +448,12 @@ def _weighted_masked_objective(fn):
# to the number of unmasked samples.
score_array /= K.mean(mask)
- # reduce score_array to same ndim as weight array
- ndim = K.ndim(score_array)
- weight_ndim = K.ndim(weights)
- score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
-
# apply sample weighting
if weights is not None:
+ # reduce score_array to same ndim as weight array
+ ndim = K.ndim(score_array)
+ weight_ndim = K.ndim(weights)
+ score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
score_array *= weights
score_array /= K.mean(K.cast(K.not_equal(weights, 0), K.floatx()))
return K.mean(score_array)
@@ -567,7 +569,7 @@ def _standardize_weights(y,
return sample_weight
elif isinstance(class_weight, dict):
if len(y.shape) > 2:
- raise ValueError('class_weight not supported for '
+ raise ValueError('`class_weight` not supported for '
'3+ dimensional targets.')
if y.shape[1] > 1:
y_classes = y.argmax(axis=1)
@@ -575,7 +577,18 @@ def _standardize_weights(y,
y_classes = np.reshape(y, y.shape[0])
else:
y_classes = y
- weights = np.asarray([class_weight[cls] for cls in y_classes])
+
+ weights = np.asarray(
+ [class_weight[cls] for cls in y_classes if cls in class_weight])
+
+ if len(weights) != len(y_classes):
+ # subtract the sets to pick all missing classes
+ existing_classes = set(y_classes)
+ existing_class_weight = set(class_weight.keys())
+ raise ValueError('`class_weight` must contain all classes in the data.'
+ ' The classes %s exist in the data but not in '
+ '`class_weight`.' %
+ (existing_classes - existing_class_weight))
return weights
else:
if sample_weight_mode is None:
@@ -584,97 +597,6 @@ def _standardize_weights(y,
return np.ones((y.shape[0], y.shape[1]), dtype=K.floatx())
-class GeneratorEnqueuer(object):
- """Builds a queue out of a data generator.
-
- Used in `fit_generator`, `evaluate_generator`, `predict_generator`.
-
- Arguments:
- generator: a generator function which endlessly yields data
- pickle_safe: use multiprocessing if True, otherwise threading
- """
-
- def __init__(self, generator, pickle_safe=False):
- self._generator = generator
- self._pickle_safe = pickle_safe
- self._threads = []
- self._stop_event = None
- self.queue = None
-
- def start(self, workers=1, max_q_size=10, wait_time=0.05):
- """Kicks off threads which add data from the generator into the queue.
-
- Arguments:
- workers: number of worker threads
- max_q_size: queue size (when full, threads could block on put())
- wait_time: time to sleep in-between calls to put()
- """
-
- def data_generator_task():
- while not self._stop_event.is_set():
- try:
- if self._pickle_safe or self.queue.qsize() < max_q_size:
- generator_output = next(self._generator)
- self.queue.put(generator_output)
- else:
- time.sleep(wait_time)
- except Exception:
- self._stop_event.set()
- raise
-
- try:
- if self._pickle_safe:
- self.queue = multiprocessing.Queue(maxsize=max_q_size)
- self._stop_event = multiprocessing.Event()
- else:
- self.queue = queue.Queue()
- self._stop_event = threading.Event()
-
- for _ in range(workers):
- if self._pickle_safe:
- # Reset random seed else all children processes
- # share the same seed
- np.random.seed()
- thread = multiprocessing.Process(target=data_generator_task)
- thread.daemon = True
- else:
- thread = threading.Thread(target=data_generator_task)
- self._threads.append(thread)
- thread.start()
- except:
- self.stop()
- raise
-
- def is_running(self):
- return self._stop_event is not None and not self._stop_event.is_set()
-
- def stop(self, timeout=None):
- """Stop running threads and wait for them to exit, if necessary.
-
- Should be called by the same thread which called start().
-
- Arguments:
- timeout: maximum time to wait on thread.join()
- """
- if self.is_running():
- self._stop_event.set()
-
- for thread in self._threads:
- if thread.is_alive():
- if self._pickle_safe:
- thread.terminate()
- else:
- thread.join(timeout)
-
- if self._pickle_safe:
- if self.queue is not None:
- self.queue.close()
-
- self._threads = []
- self._stop_event = None
- self.queue = None
-
-
class Model(Container):
"""The `Model` class adds training & evaluation routines to a `Container`.
"""
@@ -723,7 +645,7 @@ class Model(Container):
Raises:
ValueError: In case of invalid arguments for
`optimizer`, `loss`, `metrics` or `sample_weight_mode`.
- RuntimeError: If the model has no loss to optimize.
+ RuntimeError: In case of ill-formulated optimization problem.
"""
loss = loss or {}
self.optimizer = optimizers.get(optimizer)
@@ -981,24 +903,22 @@ class Model(Container):
# Functions for train, test and predict will
# be compiled lazily when required.
# This saves time when the user is not using all functions.
+ self._function_kwargs = kwargs
+
self.train_function = None
self.test_function = None
self.predict_function = None
- self._function_kwargs = kwargs
- # Collected trainable weights and sort them deterministically.
+ # Collected trainable weights, sorted in topological order.
trainable_weights = self.trainable_weights
- # Sort weights by name.
- if trainable_weights:
- trainable_weights.sort(key=lambda x: x.name)
self._collected_trainable_weights = trainable_weights
def _make_train_function(self):
if not hasattr(self, 'train_function'):
raise RuntimeError('You must compile your model before using it.')
if self.train_function is None:
- inputs = (
- self._feed_inputs + self._feed_targets + self._feed_sample_weights)
+ inputs = (self._feed_inputs +
+ self._feed_targets + self._feed_sample_weights)
if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
inputs += [K.learning_phase()]
@@ -1016,8 +936,8 @@ class Model(Container):
if not hasattr(self, 'test_function'):
raise RuntimeError('You must compile your model before using it.')
if self.test_function is None:
- inputs = (
- self._feed_inputs + self._feed_targets + self._feed_sample_weights)
+ inputs = (self._feed_inputs +
+ self._feed_targets + self._feed_sample_weights)
if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
inputs += [K.learning_phase()]
# Return loss and metrics, no gradient updates.
@@ -1031,7 +951,6 @@ class Model(Container):
def _make_predict_function(self):
if not hasattr(self, 'predict_function'):
self.predict_function = None
- self._function_kwargs = {}
if self.predict_function is None:
if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
inputs = self._feed_inputs + [K.learning_phase()]
@@ -1039,12 +958,13 @@ class Model(Container):
inputs = self._feed_inputs
# Gets network outputs. Does not update weights.
# Does update the network states.
+ kwargs = getattr(self, '_function_kwargs', {})
self.predict_function = K.function(
inputs,
self.outputs,
updates=self.state_updates,
name='predict_function',
- **self._function_kwargs)
+ **kwargs)
def _fit_loop(self,
f,
@@ -1430,6 +1350,7 @@ class Model(Container):
ValueError: In case of mismatch between the provided input data
and what the model expects.
"""
+
# Validate user data.
x, y, sample_weights = self._standardize_user_data(
x,
@@ -1468,7 +1389,10 @@ class Model(Container):
elif validation_split and 0. < validation_split < 1.:
do_validation = True
- split_at = int(len(x[0]) * (1. - validation_split))
+ if hasattr(x[0], 'shape'):
+ split_at = int(x[0].shape[0] * (1. - validation_split))
+ else:
+ split_at = int(len(x[0]) * (1. - validation_split))
x, val_x = (_slice_arrays(x, 0, split_at), _slice_arrays(x, split_at))
y, val_y = (_slice_arrays(y, 0, split_at), _slice_arrays(y, split_at))
sample_weights, val_sample_weights = (_slice_arrays(
@@ -1725,18 +1649,25 @@ class Model(Container):
validation_data=None,
validation_steps=None,
class_weight=None,
- max_q_size=10,
+ max_queue_size=10,
workers=1,
- pickle_safe=False,
- initial_epoch=0):
+ use_multiprocessing=False,
+ initial_epoch=0,
+ **kwargs):
"""Fits the model on data yielded batch-by-batch by a Python generator.
The generator is run in parallel to the model, for efficiency.
For instance, this allows you to do real-time data augmentation
on images on CPU in parallel to training your model on GPU.
+    The use of `keras.utils.Sequence` guarantees the ordering
+    and guarantees that every input is used exactly once per epoch
+    when `use_multiprocessing=True`.
+
Arguments:
- generator: a generator.
+      generator: a generator or an instance of `keras.utils.Sequence`,
+          used in order to avoid duplicate data
+          when using multiprocessing.
The output of the generator must be either
- a tuple (inputs, targets)
- a tuple (inputs, targets, sample_weights).
@@ -1761,10 +1692,10 @@ class Model(Container):
to yield from `generator` before stopping.
class_weight: dictionary mapping class indices to a weight
for the class.
- max_q_size: maximum size for the generator queue
+ max_queue_size: maximum size for the generator queue
workers: maximum number of processes to spin up
when using process based threading
- pickle_safe: if True, use process based threading.
+ use_multiprocessing: if True, use process based threading.
Note that because
this implementation relies on multiprocessing,
you should not pass
@@ -1773,6 +1704,7 @@ class Model(Container):
easily to children processes.
initial_epoch: epoch at which to start training
(useful for resuming a previous training run)
+ **kwargs: support for legacy arguments.
Returns:
A `History` object.
@@ -1798,6 +1730,19 @@ class Model(Container):
ValueError: In case the generator yields
data in an invalid format.
"""
+ # Legacy support
+ if 'max_q_size' in kwargs:
+ max_queue_size = kwargs.pop('max_q_size')
+ logging.warning('The argument `max_q_size` has been renamed '
+ '`max_queue_size`. Update your method calls accordingly.')
+ if 'pickle_safe' in kwargs:
+ use_multiprocessing = kwargs.pop('pickle_safe')
+ logging.warning('The argument `pickle_safe` has been renamed '
+ '`use_multiprocessing`. '
+ 'Update your method calls accordingly.')
+ if kwargs:
+ raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
wait_time = 0.01 # in seconds
epoch = initial_epoch
@@ -1809,7 +1754,8 @@ class Model(Container):
# python 2 has 'next', 3 has '__next__'
# avoid any explicit version checks
val_gen = (hasattr(validation_data, 'next') or
- hasattr(validation_data, '__next__'))
+ hasattr(validation_data, '__next__') or
+ isinstance(validation_data, Sequence))
if val_gen and not validation_steps:
raise ValueError('When using a generator for validation data, '
'you must specify a value for '
@@ -1848,7 +1794,7 @@ class Model(Container):
elif len(validation_data) == 3:
val_x, val_y, val_sample_weight = validation_data # pylint: disable=unpacking-non-sequence
else:
- raise ValueError('validation_data should be a tuple '
+ raise ValueError('`validation_data` should be a tuple '
'`(val_x, val_y, val_sample_weight)` '
'or `(val_x, val_y)`. Found: ' + str(validation_data))
val_x, val_y, val_sample_weights = self._standardize_user_data(
@@ -1858,11 +1804,25 @@ class Model(Container):
val_data += [0.]
for cbk in callbacks:
cbk.validation_data = val_data
+ is_sequence = isinstance(generator, Sequence)
+ if not is_sequence and use_multiprocessing and workers > 1:
+ logging.warning(
+ 'Using a generator with `use_multiprocessing=True`'
+          ' may duplicate your data. Please consider using '
+ 'the `keras.utils.Sequence` class.')
enqueuer = None
try:
- enqueuer = GeneratorEnqueuer(generator, pickle_safe=pickle_safe)
- enqueuer.start(max_q_size=max_q_size, workers=workers)
+ if is_sequence:
+ enqueuer = OrderedEnqueuer(
+ generator, use_multiprocessing=use_multiprocessing)
+ else:
+ enqueuer = GeneratorEnqueuer(
+ generator,
+ use_multiprocessing=use_multiprocessing,
+ wait_time=wait_time)
+ enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+ output_generator = enqueuer.get()
callback_model.stop_training = False
while epoch < epochs:
@@ -1870,25 +1830,19 @@ class Model(Container):
steps_done = 0
batch_index = 0
while steps_done < steps_per_epoch:
- generator_output = None
- while enqueuer.is_running():
- if not enqueuer.queue.empty():
- generator_output = enqueuer.queue.get()
- break
- else:
- time.sleep(wait_time)
+ generator_output = next(output_generator)
if not hasattr(generator_output, '__len__'):
- raise ValueError('output of generator should be '
+ raise ValueError('Output of generator should be '
'a tuple `(x, y, sample_weight)` '
'or `(x, y)`. Found: ' + str(generator_output))
if len(generator_output) == 2:
- x, y = generator_output # pylint: disable=unpacking-non-sequence
+ x, y = generator_output
sample_weight = None
elif len(generator_output) == 3:
- x, y, sample_weight = generator_output # pylint: disable=unpacking-non-sequence
+ x, y, sample_weight = generator_output
else:
- raise ValueError('output of generator should be '
+ raise ValueError('Output of generator should be '
'a tuple `(x, y, sample_weight)` '
'or `(x, y)`. Found: ' + str(generator_output))
# build batch logs
@@ -1924,9 +1878,9 @@ class Model(Container):
val_outs = self.evaluate_generator(
validation_data,
validation_steps,
- max_q_size=max_q_size,
+ max_queue_size=max_queue_size,
workers=workers,
- pickle_safe=pickle_safe)
+ use_multiprocessing=use_multiprocessing)
else:
# No need for try/except because
# data has already been validated.
@@ -1957,9 +1911,10 @@ class Model(Container):
def evaluate_generator(self,
generator,
steps,
- max_q_size=10,
+ max_queue_size=10,
workers=1,
- pickle_safe=False):
+ use_multiprocessing=False,
+ **kwargs):
"""Evaluates the model on a data generator.
The generator should return the same kind of data
@@ -1968,18 +1923,22 @@ class Model(Container):
Arguments:
generator: Generator yielding tuples (inputs, targets)
or (inputs, targets, sample_weights)
+          or an instance of `keras.utils.Sequence`,
+          used in order to avoid duplicate data
+          when using multiprocessing.
steps: Total number of steps (batches of samples)
to yield from `generator` before stopping.
- max_q_size: maximum size for the generator queue
+ max_queue_size: maximum size for the generator queue
workers: maximum number of processes to spin up
when using process based threading
- pickle_safe: if True, use process based threading.
+ use_multiprocessing: if True, use process based threading.
Note that because
this implementation relies on multiprocessing,
you should not pass
non picklable arguments to the generator
as they can't be passed
easily to children processes.
+ **kwargs: support for legacy arguments.
Returns:
Scalar test loss (if the model has a single output and no metrics)
@@ -1991,38 +1950,58 @@ class Model(Container):
ValueError: In case the generator yields
data in an invalid format.
"""
+ # Legacy support
+ if 'max_q_size' in kwargs:
+ max_queue_size = kwargs.pop('max_q_size')
+ logging.warning('The argument `max_q_size` has been renamed '
+ '`max_queue_size`. Update your method calls accordingly.')
+ if 'pickle_safe' in kwargs:
+ use_multiprocessing = kwargs.pop('pickle_safe')
+ logging.warning('The argument `pickle_safe` has been renamed '
+ '`use_multiprocessing`. '
+ 'Update your method calls accordingly.')
+ if kwargs:
+ raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
self._make_test_function()
steps_done = 0
wait_time = 0.01
all_outs = []
batch_sizes = []
+ is_sequence = isinstance(generator, Sequence)
+ if not is_sequence and use_multiprocessing and workers > 1:
+ logging.warning(
+ 'Using a generator with `use_multiprocessing=True`'
+          ' may duplicate your data. Please consider using '
+ 'the `keras.utils.Sequence` class.')
enqueuer = None
try:
- enqueuer = GeneratorEnqueuer(generator, pickle_safe=pickle_safe)
- enqueuer.start(workers=workers, max_q_size=max_q_size)
+ if is_sequence:
+ enqueuer = OrderedEnqueuer(
+ generator, use_multiprocessing=use_multiprocessing)
+ else:
+ enqueuer = GeneratorEnqueuer(
+ generator,
+ use_multiprocessing=use_multiprocessing,
+ wait_time=wait_time)
+ enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+ output_generator = enqueuer.get()
while steps_done < steps:
- generator_output = None
- while enqueuer.is_running():
- if not enqueuer.queue.empty():
- generator_output = enqueuer.queue.get()
- break
- else:
- time.sleep(wait_time)
-
+ generator_output = next(output_generator)
if not hasattr(generator_output, '__len__'):
- raise ValueError('output of generator should be a tuple '
+ raise ValueError('Output of generator should be a tuple '
'(x, y, sample_weight) '
'or (x, y). Found: ' + str(generator_output))
if len(generator_output) == 2:
- x, y = generator_output # pylint: disable=unpacking-non-sequence
+ x, y = generator_output
sample_weight = None
elif len(generator_output) == 3:
- x, y, sample_weight = generator_output # pylint: disable=unpacking-non-sequence
+ x, y, sample_weight = generator_output
else:
- raise ValueError('output of generator should be a tuple '
+ raise ValueError('Output of generator should be a tuple '
'(x, y, sample_weight) '
'or (x, y). Found: ' + str(generator_output))
outs = self.test_on_batch(x, y, sample_weight=sample_weight)
@@ -2033,6 +2012,9 @@ class Model(Container):
batch_size = len(list(x.values())[0])
else:
batch_size = len(x)
+ if batch_size == 0:
+ raise ValueError('Received an empty batch. '
+                             'Batches should contain at least one item.')
all_outs.append(outs)
steps_done += 1
@@ -2054,23 +2036,27 @@ class Model(Container):
def predict_generator(self,
generator,
steps,
- max_q_size=10,
+ max_queue_size=10,
workers=1,
- pickle_safe=False,
- verbose=0):
+ use_multiprocessing=False,
+ verbose=0,
+ **kwargs):
"""Generates predictions for the input samples from a data generator.
The generator should return the same kind of data as accepted by
`predict_on_batch`.
Arguments:
- generator: Generator yielding batches of input samples.
+ generator: Generator yielding batches of input samples
+          or an instance of `keras.utils.Sequence`,
+          used in order to avoid duplicate data
+          when using multiprocessing.
steps: Total number of steps (batches of samples)
to yield from `generator` before stopping.
- max_q_size: Maximum size for the generator queue.
+ max_queue_size: Maximum size for the generator queue.
workers: Maximum number of processes to spin up
when using process based threading
- pickle_safe: If `True`, use process based threading.
+ use_multiprocessing: If `True`, use process based threading.
Note that because
this implementation relies on multiprocessing,
you should not pass
@@ -2078,6 +2064,7 @@ class Model(Container):
as they can't be passed
easily to children processes.
verbose: verbosity mode, 0 or 1.
+ **kwargs: support for legacy arguments.
Returns:
Numpy array(s) of predictions.
@@ -2086,38 +2073,58 @@ class Model(Container):
ValueError: In case the generator yields
data in an invalid format.
"""
+ # Legacy support
+ if 'max_q_size' in kwargs:
+ max_queue_size = kwargs.pop('max_q_size')
+ logging.warning('The argument `max_q_size` has been renamed '
+ '`max_queue_size`. Update your method calls accordingly.')
+ if 'pickle_safe' in kwargs:
+ use_multiprocessing = kwargs.pop('pickle_safe')
+ logging.warning('The argument `pickle_safe` has been renamed '
+ '`use_multiprocessing`. '
+ 'Update your method calls accordingly.')
+ if kwargs:
+ raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
self._make_predict_function()
steps_done = 0
wait_time = 0.01
all_outs = []
+ is_sequence = isinstance(generator, Sequence)
+ if not is_sequence and use_multiprocessing and workers > 1:
+ logging.warning(
+ 'Using a generator with `use_multiprocessing=True`'
+          ' may duplicate your data. Please consider using '
+ 'the `keras.utils.Sequence` class.')
enqueuer = None
try:
- enqueuer = GeneratorEnqueuer(generator, pickle_safe=pickle_safe)
- enqueuer.start(workers=workers, max_q_size=max_q_size)
+ if is_sequence:
+ enqueuer = OrderedEnqueuer(
+ generator, use_multiprocessing=use_multiprocessing)
+ else:
+ enqueuer = GeneratorEnqueuer(
+ generator,
+ use_multiprocessing=use_multiprocessing,
+ wait_time=wait_time)
+ enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+ output_generator = enqueuer.get()
if verbose == 1:
progbar = Progbar(target=steps)
while steps_done < steps:
- generator_output = None
- while enqueuer.is_running():
- if not enqueuer.queue.empty():
- generator_output = enqueuer.queue.get()
- break
- else:
- time.sleep(wait_time)
-
+ generator_output = next(output_generator)
if isinstance(generator_output, tuple):
# Compatibility with the generators
# used for training.
if len(generator_output) == 2:
- x, _ = generator_output # pylint: disable=unpacking-non-sequence
+ x, _ = generator_output
elif len(generator_output) == 3:
- x, _, _ = generator_output # pylint: disable=unpacking-non-sequence
+ x, _, _ = generator_output
else:
- raise ValueError('output of generator should be '
+ raise ValueError('Output of generator should be '
'a tuple `(x, y, sample_weight)` '
'or `(x, y)`. Found: ' + str(generator_output))
else:
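A minimal sketch of feeding `fit_generator` through the `keras.utils.Sequence` path added above. The `RandomBatches` class and sizes are illustrative; the sketch assumes the usual `Sequence` interface (`__len__` returning the number of batches, `__getitem__` returning one `(x, y)` batch) and the import path used by this backport's test files.

```python
import numpy as np
from tensorflow.contrib.keras.python import keras


class RandomBatches(keras.utils.Sequence):
  """Yields a fixed number of random (x, y) batches."""

  def __init__(self, num_batches=5, batch_size=8):
    self.num_batches = num_batches
    self.batch_size = batch_size

  def __len__(self):
    return self.num_batches

  def __getitem__(self, index):
    x = np.random.random((self.batch_size, 4))
    y = np.random.random((self.batch_size, 1))
    return x, y


model = keras.models.Sequential()
model.add(keras.layers.Dense(1, input_shape=(4,)))
model.compile(optimizer='sgd', loss='mse')

# With a Sequence, an OrderedEnqueuer is used, which preserves batch order
# and uses every batch exactly once per epoch, even with multiprocessing.
model.fit_generator(RandomBatches(),
                    steps_per_epoch=5,
                    epochs=1,
                    max_queue_size=10,
                    workers=2,
                    use_multiprocessing=False,
                    verbose=0)
```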
diff --git a/tensorflow/contrib/keras/python/keras/engine/training_test.py b/tensorflow/contrib/keras/python/keras/engine/training_test.py
index a23838f7b4..d2aac54c94 100644
--- a/tensorflow/contrib/keras/python/keras/engine/training_test.py
+++ b/tensorflow/contrib/keras/python/keras/engine/training_test.py
@@ -463,6 +463,38 @@ class LossWeightingTest(test.TestCase):
temporal_x_test[test_ids], temporal_y_test[test_ids], verbose=0)
self.assertLess(score, ref_score)
+ def test_class_weight_wrong_classes(self):
+ num_classes = 5
+ train_samples = 1000
+ test_samples = 1000
+ input_dim = 5
+ timesteps = 3
+
+ with self.test_session():
+ model = keras.models.Sequential()
+ model.add(
+ keras.layers.TimeDistributed(
+ keras.layers.Dense(num_classes),
+ input_shape=(timesteps, input_dim)))
+ model.add(keras.layers.Activation('softmax'))
+ model.compile(
+ loss='binary_crossentropy',
+ optimizer='rmsprop')
+
+ (x_train, y_train), _ = testing_utils.get_test_data(
+ train_samples=train_samples,
+ test_samples=test_samples,
+ input_shape=(input_dim,),
+ num_classes=num_classes)
+ # convert class vectors to binary class matrices
+ y_train = keras.utils.to_categorical(y_train, num_classes)
+ class_weight = dict([(i, 1.) for i in range(num_classes)])
+
+ del class_weight[1]
+ with self.assertRaises(ValueError):
+ model.fit(x_train, y_train,
+ epochs=0, verbose=0, class_weight=class_weight)
+
class LossMaskingTest(test.TestCase):
@@ -654,41 +686,41 @@ class TestGeneratorMethods(test.TestCase):
steps_per_epoch=5,
epochs=1,
verbose=1,
- max_q_size=10,
+ max_queue_size=10,
workers=4,
- pickle_safe=True)
+ use_multiprocessing=True)
model.fit_generator(custom_generator(),
steps_per_epoch=5,
epochs=1,
verbose=1,
- max_q_size=10,
- pickle_safe=False)
+ max_queue_size=10,
+ use_multiprocessing=False)
model.fit_generator(custom_generator(),
steps_per_epoch=5,
epochs=1,
verbose=1,
- max_q_size=10,
- pickle_safe=False,
+ max_queue_size=10,
+ use_multiprocessing=False,
validation_data=custom_generator(),
validation_steps=10)
model.predict_generator(custom_generator(),
steps=5,
- max_q_size=10,
+ max_queue_size=10,
workers=2,
- pickle_safe=True)
+ use_multiprocessing=True)
model.predict_generator(custom_generator(),
steps=5,
- max_q_size=10,
- pickle_safe=False)
+ max_queue_size=10,
+ use_multiprocessing=False)
model.evaluate_generator(custom_generator(),
steps=5,
- max_q_size=10,
+ max_queue_size=10,
workers=2,
- pickle_safe=True)
+ use_multiprocessing=True)
model.evaluate_generator(custom_generator(),
steps=5,
- max_q_size=10,
- pickle_safe=False)
+ max_queue_size=10,
+ use_multiprocessing=False)
if __name__ == '__main__':
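For completeness, a sketch of the legacy-argument handling exercised above: the old names are still accepted and remapped with a deprecation warning, while any other unknown keyword argument now raises a `ValueError`. The toy model and generator below are illustrative only.

```python
import numpy as np
from tensorflow.contrib.keras.python import keras

model = keras.models.Sequential()
model.add(keras.layers.Dense(1, input_shape=(4,)))
model.compile(optimizer='sgd', loss='mse')

def custom_generator():
  while True:
    yield np.random.random((8, 4)), np.random.random((8, 1))

# Old names still work but emit warnings pointing at the new names.
model.fit_generator(custom_generator(),
                    steps_per_epoch=5,
                    epochs=1,
                    verbose=0,
                    max_q_size=10,      # renamed to `max_queue_size`
                    pickle_safe=False)  # renamed to `use_multiprocessing`
```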
diff --git a/tensorflow/contrib/keras/python/keras/initializers.py b/tensorflow/contrib/keras/python/keras/initializers.py
index b0b71e7cb4..af1dc914bb 100644
--- a/tensorflow/contrib/keras/python/keras/initializers.py
+++ b/tensorflow/contrib/keras/python/keras/initializers.py
@@ -57,6 +57,28 @@ class Identity(Initializer):
return {'gain': self.gain}
+def lecun_normal(seed=None):
+ """LeCun normal initializer.
+
+ It draws samples from a truncated normal distribution centered on 0
+ with `stddev = sqrt(1 / fan_in)`
+ where `fan_in` is the number of input units in the weight tensor.
+
+ Arguments:
+ seed: A Python integer. Used to seed the random generator.
+
+ Returns:
+ An initializer.
+
+ References:
+ - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+ - [Efficient
+ Backprop](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
+ """
+ return VarianceScaling(
+ scale=1., mode='fan_in', distribution='normal', seed=seed)
+
+
def lecun_uniform(seed=None):
"""LeCun uniform initializer.
diff --git a/tensorflow/contrib/keras/python/keras/initializers_test.py b/tensorflow/contrib/keras/python/keras/initializers_test.py
index 0a07eddd89..f39d2bfd52 100644
--- a/tensorflow/contrib/keras/python/keras/initializers_test.py
+++ b/tensorflow/contrib/keras/python/keras/initializers_test.py
@@ -103,6 +103,14 @@ class KerasInitializersTest(test.TestCase):
self._runner(keras.initializers.he_uniform(seed=123), tensor_shape,
target_mean=0., target_max=scale, target_min=-scale)
+ def test_lecun_normal(self):
+ tensor_shape = (5, 6, 4, 2)
+ with self.test_session():
+ fan_in, _ = init_ops._compute_fans(tensor_shape)
+ scale = np.sqrt(1. / fan_in)
+ self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape,
+ target_mean=0., target_std=None, target_max=2 * scale)
+
def test_glorot_normal(self):
tensor_shape = (5, 6, 4, 2)
with self.test_session():
diff --git a/tensorflow/contrib/keras/python/keras/integration_test.py b/tensorflow/contrib/keras/python/keras/integration_test.py
index 0f6db097d1..32b0a95fe3 100644
--- a/tensorflow/contrib/keras/python/keras/integration_test.py
+++ b/tensorflow/contrib/keras/python/keras/integration_test.py
@@ -228,6 +228,13 @@ class KerasIntegrationTest(test.TestCase):
verbose=2)
self.assertGreater(history.history['val_acc'][-1], 0.85)
+ def test_embedding_with_clipnorm(self):
+ with self.test_session():
+ model = keras.models.Sequential()
+ model.add(keras.layers.Embedding(input_dim=1, output_dim=1))
+ model.compile(optimizer=keras.optimizers.SGD(clipnorm=0.1), loss='mse')
+ model.fit(np.array([[0]]), np.array([[[0.5]]]), epochs=1)
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py b/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
index 2c957ece44..55f17ac4e2 100644
--- a/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
+++ b/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
@@ -57,7 +57,7 @@ class LeakyReLU(Layer):
return K.relu(inputs, alpha=self.alpha)
def get_config(self):
- config = {'alpha': self.alpha}
+ config = {'alpha': float(self.alpha)}
base_config = super(LeakyReLU, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional.py b/tensorflow/contrib/keras/python/keras/layers/convolutional.py
index 9ee5aa2121..24ff0baf84 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional.py
@@ -324,7 +324,7 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
filters: Integer, the dimensionality of the output space
      (i.e. the number of output filters in the convolution).
kernel_size: An integer or tuple/list of 3 integers, specifying the
- width and height of the 3D convolution window.
+ depth, height and width of the 3D convolution window.
Can be a single integer to specify the same value for
all spatial dimensions.
strides: An integer or tuple/list of 3 integers,
@@ -599,6 +599,162 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
return dict(list(base_config.items()) + list(config.items()))
+class Conv3DTranspose(tf_convolutional_layers.Conv3D, Layer):
+ """Transposed convolution layer (sometimes called Deconvolution).
+
+ The need for transposed convolutions generally arises
+ from the desire to use a transformation going in the opposite direction
+ of a normal convolution, i.e., from something that has the shape of the
+ output of some convolution to something that has the shape of its input
+ while maintaining a connectivity pattern that is compatible with
+ said convolution.
+
+ When using this layer as the first layer in a model,
+ provide the keyword argument `input_shape`
+ (tuple of integers, does not include the sample axis),
+ e.g. `input_shape=(128, 128, 128, 3)` for a 128x128x128 volume with 3 channels
+ if `data_format="channels_last"`.
+
+ Arguments:
+ filters: Integer, the dimensionality of the output space
+ (i.e. the number of output filters in the convolution).
+ kernel_size: An integer or tuple/list of 3 integers, specifying the
+        depth, height and width of the 3D convolution window.
+ Can be a single integer to specify the same value for
+ all spatial dimensions.
+ strides: An integer or tuple/list of 3 integers,
+        specifying the strides of the convolution along the depth, height and width.
+ Can be a single integer to specify the same value for
+ all spatial dimensions.
+ Specifying any stride value != 1 is incompatible with specifying
+ any `dilation_rate` value != 1.
+ padding: one of `"valid"` or `"same"` (case-insensitive).
+ data_format: A string,
+ one of `channels_last` (default) or `channels_first`.
+ The ordering of the dimensions in the inputs.
+ `channels_last` corresponds to inputs with shape
+ `(batch, depth, height, width, channels)` while `channels_first`
+ corresponds to inputs with shape
+ `(batch, channels, depth, height, width)`.
+ It defaults to the `image_data_format` value found in your
+ Keras config file at `~/.keras/keras.json`.
+ If you never set it, then it will be "channels_last".
+ dilation_rate: an integer or tuple/list of 3 integers, specifying
+ the dilation rate to use for dilated convolution.
+ Can be a single integer to specify the same value for
+ all spatial dimensions.
+ Currently, specifying any `dilation_rate` value != 1 is
+ incompatible with specifying any stride value != 1.
+ activation: Activation function to use
+ (see [activations](../activations.md)).
+ If you don't specify anything, no activation is applied
+ (ie. "linear" activation: `a(x) = x`).
+ use_bias: Boolean, whether the layer uses a bias vector.
+ kernel_initializer: Initializer for the `kernel` weights matrix
+ (see [initializers](../initializers.md)).
+ bias_initializer: Initializer for the bias vector
+ (see [initializers](../initializers.md)).
+ kernel_regularizer: Regularizer function applied to
+ the `kernel` weights matrix
+ (see [regularizer](../regularizers.md)).
+ bias_regularizer: Regularizer function applied to the bias vector
+ (see [regularizer](../regularizers.md)).
+ activity_regularizer: Regularizer function applied to
+ the output of the layer (its "activation").
+ (see [regularizer](../regularizers.md)).
+ kernel_constraint: Constraint function applied to the kernel matrix
+ (see [constraints](../constraints.md)).
+ bias_constraint: Constraint function applied to the bias vector
+ (see [constraints](../constraints.md)).
+
+ Input shape:
+ 5D tensor with shape:
+ `(batch, channels, depth, rows, cols)` if data_format='channels_first'
+ or 5D tensor with shape:
+ `(batch, depth, rows, cols, channels)` if data_format='channels_last'.
+
+ Output shape:
+ 5D tensor with shape:
+ `(batch, filters, new_depth, new_rows, new_cols)` if
+ data_format='channels_first'
+ or 5D tensor with shape:
+ `(batch, new_depth, new_rows, new_cols, filters)` if
+ data_format='channels_last'.
+      `depth`, `rows` and `cols` values might have changed due to padding.
+
+ References:
+ - [A guide to convolution arithmetic for deep
+ learning](https://arxiv.org/abs/1603.07285v1)
+ - [Deconvolutional
+ Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf)
+ """
+
+ def __init__(self,
+ filters,
+ kernel_size,
+ strides=(1, 1, 1),
+ padding='valid',
+ data_format=None,
+ activation=None,
+ use_bias=True,
+ kernel_initializer='glorot_uniform',
+ bias_initializer='zeros',
+ kernel_regularizer=None,
+ bias_regularizer=None,
+ activity_regularizer=None,
+ kernel_constraint=None,
+ bias_constraint=None,
+ **kwargs):
+ if data_format is None:
+ data_format = K.image_data_format()
+ super(Conv3DTranspose, self).__init__(
+ filters=filters,
+ kernel_size=kernel_size,
+ strides=strides,
+ padding=padding,
+ data_format=data_format,
+ activation=activations.get(activation),
+ use_bias=use_bias,
+ kernel_initializer=initializers.get(kernel_initializer),
+ bias_initializer=initializers.get(bias_initializer),
+ kernel_regularizer=regularizers.get(kernel_regularizer),
+ bias_regularizer=regularizers.get(bias_regularizer),
+ activity_regularizer=regularizers.get(activity_regularizer),
+ **kwargs)
+ # TODO(fchollet): move weight constraint support to core layers.
+ self.kernel_constraint = constraints.get(kernel_constraint)
+ self.bias_constraint = constraints.get(bias_constraint)
+
+ def build(self, input_shape):
+ super(Conv3DTranspose, self).build(input_shape)
+ # TODO(fchollet): move weight constraint support to core layers.
+ if self.kernel_constraint:
+ self.constraints[self.kernel] = self.kernel_constraint
+ if self.use_bias and self.bias_constraint:
+ self.constraints[self.bias] = self.bias_constraint
+
+ def get_config(self):
+ config = {
+ 'filters': self.filters,
+ 'kernel_size': self.kernel_size,
+ 'strides': self.strides,
+ 'padding': self.padding,
+ 'data_format': self.data_format,
+ 'activation': activations.serialize(self.activation),
+ 'use_bias': self.use_bias,
+ 'kernel_initializer': initializers.serialize(self.kernel_initializer),
+ 'bias_initializer': initializers.serialize(self.bias_initializer),
+ 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+ 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+ 'activity_regularizer':
+ regularizers.serialize(self.activity_regularizer),
+ 'kernel_constraint': constraints.serialize(self.kernel_constraint),
+ 'bias_constraint': constraints.serialize(self.bias_constraint)
+ }
+ base_config = super(Conv3DTranspose, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
+
+
class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
"""Depthwise separable 2D convolution.
@@ -976,7 +1132,7 @@ class ZeroPadding1D(Layer):
class ZeroPadding2D(Layer):
"""Zero-padding layer for 2D input (e.g. picture).
- This layer can add rows and columns or zeros
+ This layer can add rows and columns of zeros
at the top, bottom, left and right side of an image tensor.
Arguments:
@@ -1551,3 +1707,4 @@ Convolution3D = Conv3D
SeparableConvolution2D = SeparableConv2D
Convolution2DTranspose = Conv2DTranspose
Deconvolution2D = Deconv2D = Conv2DTranspose
+Deconvolution3D = Deconv3D = Conv3DTranspose
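A minimal sketch of the new layer, assuming `keras` is importable as in this backport's test files; filter count, kernel size and input volume are illustrative.

```python
from tensorflow.contrib.keras.python import keras

model = keras.models.Sequential()
model.add(keras.layers.Conv3DTranspose(filters=2,
                                       kernel_size=3,
                                       strides=(2, 2, 2),
                                       padding='valid',
                                       data_format='channels_last',
                                       input_shape=(4, 5, 6, 3)))

# Per the docstring, the output is a 5D tensor of shape
# (batch, new_depth, new_rows, new_cols, filters) with channels_last.
print(model.output_shape)
```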
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
index 7e567d3fb0..2a05ac55db 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
@@ -239,6 +239,66 @@ class Conv2DTransposeTest(test.TestCase):
self.assertEqual(len(layer.constraints), 2)
+class Conv3DTransposeTest(test.TestCase):
+
+ def test_conv3d_transpose(self):
+ num_samples = 2
+ filters = 2
+ stack_size = 3
+ num_row = 5
+ num_col = 6
+ depth = 4
+
+ for padding in ['valid', 'same']:
+ for strides in [(1, 1, 1), (2, 2, 2)]:
+ if padding == 'same' and strides != (1, 1, 1):
+ continue
+
+ with self.test_session(use_gpu=True):
+ testing_utils.layer_test(
+ keras.layers.Conv3DTranspose,
+ kwargs={
+ 'filters': filters,
+ 'kernel_size': 3,
+ 'padding': padding,
+ 'strides': strides,
+ 'data_format': 'channels_last'
+ },
+ input_shape=(num_samples, depth, num_row, num_col, stack_size))
+
+ def test_conv3dtranspose_regularization(self):
+ # regularizers
+ kwargs = {
+ 'filters': 3,
+ 'kernel_size': 3,
+ 'padding': 'valid',
+ 'kernel_regularizer': 'l2',
+ 'bias_regularizer': 'l2',
+ 'activity_regularizer': 'l2',
+ 'strides': 1
+ }
+ with self.test_session(use_gpu=True):
+ layer = keras.layers.Conv3DTranspose(**kwargs)
+ layer.build((None, 5, 5, 5, 2))
+ self.assertEqual(len(layer.losses), 2)
+ layer(keras.backend.variable(np.ones((1, 5, 5, 5, 2))))
+ self.assertEqual(len(layer.losses), 3)
+
+ # constraints
+ kwargs = {
+ 'filters': 3,
+ 'kernel_size': 3,
+ 'padding': 'valid',
+ 'kernel_constraint': 'max_norm',
+ 'bias_constraint': 'max_norm',
+ 'strides': 1
+ }
+ with self.test_session(use_gpu=True):
+ layer = keras.layers.Conv3DTranspose(**kwargs)
+ layer.build((None, 5, 5, 5, 2))
+ self.assertEqual(len(layer.constraints), 2)
+
+
class SeparableConv2DTest(test.TestCase):
def test_separable_conv_2d(self):
diff --git a/tensorflow/contrib/keras/python/keras/layers/core.py b/tensorflow/contrib/keras/python/keras/layers/core.py
index 34548c83c5..1f9ee384c2 100644
--- a/tensorflow/contrib/keras/python/keras/layers/core.py
+++ b/tensorflow/contrib/keras/python/keras/layers/core.py
@@ -33,9 +33,9 @@ from tensorflow.contrib.keras.python.keras.engine import Layer
from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
from tensorflow.contrib.keras.python.keras.utils.generic_utils import func_dump
from tensorflow.contrib.keras.python.keras.utils.generic_utils import func_load
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import has_arg
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import core as tf_core_layers
-from tensorflow.python.util import tf_inspect
class Masking(Layer):
@@ -590,8 +590,7 @@ class Lambda(Layer):
def call(self, inputs, mask=None):
arguments = self.arguments
- arg_spec = tf_inspect.getargspec(self.function)
- if 'mask' in arg_spec.args:
+ if has_arg(self.function, 'mask'):
arguments['mask'] = mask
return self.function(inputs, **arguments)
@@ -634,6 +633,16 @@ class Lambda(Layer):
else:
raise TypeError('Unknown function type:', function_type)
+ # If arguments were numpy array, they have been saved as
+ # list. We need to recover the ndarray
+ if 'arguments' in config:
+ for key in config['arguments']:
+ if isinstance(config['arguments'][key], dict):
+ arg_dict = config['arguments'][key]
+ if 'type' in arg_dict and arg_dict['type'] == 'ndarray':
+ # Overwrite the argument with its numpy translation
+ config['arguments'][key] = np.array(arg_dict['value'])
+
config['function'] = function
return cls(**config)
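A minimal sketch of the `has_arg`-based dispatch in `Lambda.call`: `mask` is forwarded only to functions whose signature accepts it, while user-supplied values travel through the layer's `arguments` dict. The function name below is illustrative.

```python
from tensorflow.contrib.keras.python import keras

def scale_fn(x, scale=1., mask=None):
  # `scale` comes from the layer's `arguments` dict; `mask` is forwarded
  # only because the function signature accepts a `mask` argument.
  return x * scale

inputs = keras.layers.Input(shape=(4,))
outputs = keras.layers.Lambda(scale_fn, arguments={'scale': 2.})(inputs)
model = keras.models.Model(inputs, outputs)
```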
diff --git a/tensorflow/contrib/keras/python/keras/layers/embeddings.py b/tensorflow/contrib/keras/python/keras/layers/embeddings.py
index bc0bae67d0..9f617fd3e4 100644
--- a/tensorflow/contrib/keras/python/keras/layers/embeddings.py
+++ b/tensorflow/contrib/keras/python/keras/layers/embeddings.py
@@ -96,7 +96,6 @@ class Embedding(Layer):
mask_zero=False,
input_length=None,
**kwargs):
- kwargs['dtype'] = 'int32'
if 'input_shape' not in kwargs:
if input_length:
kwargs['input_shape'] = (input_length,)
@@ -120,7 +119,8 @@ class Embedding(Layer):
initializer=self.embeddings_initializer,
name='embeddings',
regularizer=self.embeddings_regularizer,
- constraint=self.embeddings_constraint)
+ constraint=self.embeddings_constraint,
+ dtype=self.dtype)
self.built = True
def compute_mask(self, inputs, mask=None):
@@ -131,12 +131,26 @@ class Embedding(Layer):
def _compute_output_shape(self, input_shape):
input_shape = tensor_shape.TensorShape(input_shape).as_list()
- if not self.input_length:
- input_length = input_shape[1]
+ if self.input_length is None:
+ return tensor_shape.TensorShape(input_shape + [self.output_dim])
else:
- input_length = self.input_length
- return tensor_shape.TensorShape(
- [input_shape[0], input_length, self.output_dim])
+ # input_length can be tuple if input is 3D or higher
+ if isinstance(self.input_length, (list, tuple)):
+ in_lens = list(self.input_length)
+ else:
+ in_lens = [self.input_length]
+ if len(in_lens) != len(input_shape) - 1:
+ ValueError('"input_length" is %s, but received input has shape %s' %
+ (str(self.input_length), str(input_shape)))
+ else:
+ for i, (s1, s2) in enumerate(zip(in_lens, input_shape[1:])):
+ if s1 is not None and s2 is not None and s1 != s2:
+ ValueError('"input_length" is %s, but received input has shape %s' %
+ (str(self.input_length), str(input_shape)))
+ elif s1 is None:
+ in_lens[i] = s2
+ return tensor_shape.TensorShape(
+ (input_shape[0],) + tuple(in_lens) + (self.output_dim,))
def call(self, inputs):
if K.dtype(inputs) != 'int32':
diff --git a/tensorflow/contrib/keras/python/keras/layers/embeddings_test.py b/tensorflow/contrib/keras/python/keras/layers/embeddings_test.py
index ca7ca3efd8..5d6d386862 100644
--- a/tensorflow/contrib/keras/python/keras/layers/embeddings_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/embeddings_test.py
@@ -46,6 +46,27 @@ class EmbeddingTest(test.TestCase):
input_dtype='int32',
expected_output_dtype='float32')
+ with self.test_session():
+ testing_utils.layer_test(
+ keras.layers.Embedding,
+ kwargs={'output_dim': 4,
+ 'input_dim': 10,
+ 'mask_zero': True},
+ input_shape=(3, 4, 2),
+ input_dtype='int32',
+ expected_output_dtype='float32')
+
+ with self.test_session():
+ testing_utils.layer_test(
+ keras.layers.Embedding,
+ kwargs={'output_dim': 4,
+ 'input_dim': 10,
+ 'mask_zero': True,
+ 'input_length': (None, 2)},
+ input_shape=(3, 4, 2),
+ input_dtype='int32',
+ expected_output_dtype='float32')
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/local.py b/tensorflow/contrib/keras/python/keras/layers/local.py
index 863674c1cb..31a29cdaf4 100644
--- a/tensorflow/contrib/keras/python/keras/layers/local.py
+++ b/tensorflow/contrib/keras/python/keras/layers/local.py
@@ -154,52 +154,30 @@ class LocallyConnected1D(Layer):
return tensor_shape.TensorShape([input_shape[0], length, self.filters])
def call(self, inputs):
- stride = self.strides[0]
- output_length, feature_dim, filters = self.kernel_shape
-
- xs = []
- for i in range(output_length):
- slice_length = slice(i * stride, i * stride + self.kernel_size[0])
- xs.append(K.reshape(inputs[:, slice_length, :], (1, -1, feature_dim)))
- x_aggregate = K.concatenate(xs, axis=0)
- # Shape: `(output_length, batch_size, filters)`.
- output = K.batch_dot(x_aggregate, self.kernel)
- output = K.permute_dimensions(output, (1, 0, 2))
+ output = K.local_conv1d(inputs, self.kernel, self.kernel_size, self.strides)
if self.use_bias:
- output += K.reshape(self.bias, (1, output_length, filters))
+ output = K.bias_add(output, self.bias)
if self.activation is not None:
output = self.activation(output)
return output
def get_config(self):
config = {
- 'filters':
- self.filters,
- 'kernel_size':
- self.kernel_size,
- 'strides':
- self.strides,
- 'padding':
- self.padding,
- 'activation':
- activations.serialize(self.activation),
- 'use_bias':
- self.use_bias,
- 'kernel_initializer':
- initializers.serialize(self.kernel_initializer),
- 'bias_initializer':
- initializers.serialize(self.bias_initializer),
- 'kernel_regularizer':
- regularizers.serialize(self.kernel_regularizer),
- 'bias_regularizer':
- regularizers.serialize(self.bias_regularizer),
+ 'filters': self.filters,
+ 'kernel_size': self.kernel_size,
+ 'strides': self.strides,
+ 'padding': self.padding,
+ 'activation': activations.serialize(self.activation),
+ 'use_bias': self.use_bias,
+ 'kernel_initializer': initializers.serialize(self.kernel_initializer),
+ 'bias_initializer': initializers.serialize(self.bias_initializer),
+ 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+ 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer':
regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint':
- constraints.serialize(self.kernel_constraint),
- 'bias_constraint':
- constraints.serialize(self.bias_constraint)
+ 'kernel_constraint': constraints.serialize(self.kernel_constraint),
+ 'bias_constraint': constraints.serialize(self.bias_constraint)
}
base_config = super(LocallyConnected1D, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@@ -381,97 +359,35 @@ class LocallyConnected2D(Layer):
[input_shape[0], rows, cols, self.filters])
def call(self, inputs):
- stride_row, stride_col = self.strides
- _, feature_dim, filters = self.kernel_shape
-
- if self.data_format == 'channels_first':
- if K.backend() == 'theano':
- output = []
- for i in range(self.output_row):
- for j in range(self.output_col):
- slice_row = slice(i * stride_row,
- i * stride_row + self.kernel_size[0])
- slice_col = slice(j * stride_col,
- j * stride_col + self.kernel_size[1])
- x_flatten = K.reshape(inputs[:, :, slice_row, slice_col],
- (1, -1, feature_dim))
- output.append(
- K.dot(x_flatten, self.kernel[i * self.output_col + j, :, :]))
- output = K.concatenate(output, axis=0)
- else:
- xs = []
- for i in range(self.output_row):
- for j in range(self.output_col):
- slice_row = slice(i * stride_row,
- i * stride_row + self.kernel_size[0])
- slice_col = slice(j * stride_col,
- j * stride_col + self.kernel_size[1])
- xs.append(
- K.reshape(inputs[:, :, slice_row, slice_col], (1, -1,
- feature_dim)))
- x_aggregate = K.concatenate(xs, axis=0)
- output = K.batch_dot(x_aggregate, self.kernel)
- output = K.reshape(output, (self.output_row, self.output_col, -1,
- filters))
- output = K.permute_dimensions(output, (2, 3, 0, 1))
-
- elif self.data_format == 'channels_last':
- xs = []
- for i in range(self.output_row):
- for j in range(self.output_col):
- slice_row = slice(i * stride_row,
- i * stride_row + self.kernel_size[0])
- slice_col = slice(j * stride_col,
- j * stride_col + self.kernel_size[1])
- xs.append(
- K.reshape(inputs[:, slice_row, slice_col, :], (1, -1, feature_dim
- )))
- x_aggregate = K.concatenate(xs, axis=0)
- output = K.batch_dot(x_aggregate, self.kernel)
- output = K.reshape(output, (self.output_row, self.output_col, -1,
- filters))
- output = K.permute_dimensions(output, (2, 0, 1, 3))
-
+ output = K.local_conv2d(inputs,
+ self.kernel,
+ self.kernel_size,
+ self.strides,
+ (self.output_row, self.output_col),
+ self.data_format)
if self.use_bias:
- if self.data_format == 'channels_first':
- output += K.reshape(self.bias, (1, filters, self.output_row,
- self.output_col))
- elif self.data_format == 'channels_last':
- output += K.reshape(self.bias, (1, self.output_row, self.output_col,
- filters))
+ output = K.bias_add(output, self.bias, data_format=self.data_format)
+
output = self.activation(output)
return output
def get_config(self):
config = {
- 'filters':
- self.filters,
- 'kernel_size':
- self.kernel_size,
- 'strides':
- self.strides,
- 'padding':
- self.padding,
- 'data_format':
- self.data_format,
- 'activation':
- activations.serialize(self.activation),
- 'use_bias':
- self.use_bias,
- 'kernel_initializer':
- initializers.serialize(self.kernel_initializer),
- 'bias_initializer':
- initializers.serialize(self.bias_initializer),
- 'kernel_regularizer':
- regularizers.serialize(self.kernel_regularizer),
- 'bias_regularizer':
- regularizers.serialize(self.bias_regularizer),
+ 'filters': self.filters,
+ 'kernel_size': self.kernel_size,
+ 'strides': self.strides,
+ 'padding': self.padding,
+ 'data_format': self.data_format,
+ 'activation': activations.serialize(self.activation),
+ 'use_bias': self.use_bias,
+ 'kernel_initializer': initializers.serialize(self.kernel_initializer),
+ 'bias_initializer': initializers.serialize(self.bias_initializer),
+ 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+ 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer':
regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint':
- constraints.serialize(self.kernel_constraint),
- 'bias_constraint':
- constraints.serialize(self.bias_constraint)
+ 'kernel_constraint': constraints.serialize(self.kernel_constraint),
+ 'bias_constraint': constraints.serialize(self.bias_constraint)
}
base_config = super(LocallyConnected2D, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
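
The two `call` bodies above now delegate the per-position dot products to the backend primitives `K.local_conv1d` / `K.local_conv2d` instead of slicing and concatenating by hand; the layer-level behaviour is unchanged. A minimal usage sketch of the 1D layer (all shapes and sizes are illustrative only):

```python
import numpy as np
from tensorflow.contrib.keras.python import keras

# LocallyConnected1D applies a *different* (unshared) kernel at every window
# position, unlike Conv1D, which shares one kernel across positions.
model = keras.models.Sequential()
model.add(keras.layers.LocallyConnected1D(
    filters=4, kernel_size=3, input_shape=(10, 8)))  # output: (None, 8, 4)
model.compile(optimizer='sgd', loss='mse')

x = np.random.random((2, 10, 8))
y = np.random.random((2, 8, 4))
model.train_on_batch(x, y)
```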
diff --git a/tensorflow/contrib/keras/python/keras/layers/lstm_test.py b/tensorflow/contrib/keras/python/keras/layers/lstm_test.py
index 90bf95a781..04a04838e7 100644
--- a/tensorflow/contrib/keras/python/keras/layers/lstm_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/lstm_test.py
@@ -118,7 +118,7 @@ class LSTMLayerTest(test.TestCase):
# check that container-level reset_states() works
model.reset_states()
out4 = model.predict(np.ones((num_samples, timesteps)))
- np.testing.assert_allclose(out3, out4, atol=1e-5)
+ self.assertAllClose(out3, out4, atol=1e-5)
# check that the call to `predict` updated the states
out5 = model.predict(np.ones((num_samples, timesteps)))
@@ -139,7 +139,7 @@ class LSTMLayerTest(test.TestCase):
right_padded_input[1, -2:] = 0
out7 = model.predict(right_padded_input)
- np.testing.assert_allclose(out7, out6, atol=1e-5)
+ self.assertAllClose(out7, out6, atol=1e-5)
def test_regularization_LSTM(self):
embedding_dim = 4
@@ -252,7 +252,7 @@ class LSTMLayerTest(test.TestCase):
layer.reset_states()
assert len(layer.states) == num_states
assert layer.states[0] is not None
- np.testing.assert_allclose(
+ self.assertAllClose(
keras.backend.eval(layer.states[0]),
np.zeros(keras.backend.int_shape(layer.states[0])),
atol=1e-4)
@@ -261,7 +261,7 @@ class LSTMLayerTest(test.TestCase):
if len(values) == 1:
values = values[0]
layer.reset_states(values)
- np.testing.assert_allclose(
+ self.assertAllClose(
keras.backend.eval(layer.states[0]),
np.ones(keras.backend.int_shape(layer.states[0])),
atol=1e-4)
@@ -292,6 +292,42 @@ class LSTMLayerTest(test.TestCase):
targets = np.random.random((num_samples, units))
model.train_on_batch([inputs] + initial_state, targets)
+ def test_return_state(self):
+ num_states = 2
+ timesteps = 3
+ embedding_dim = 4
+ units = 3
+ num_samples = 2
+
+ with self.test_session():
+ inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+ layer = keras.layers.LSTM(units, return_state=True, stateful=True)
+ outputs = layer(inputs)
+ state = outputs[1:]
+ assert len(state) == num_states
+ model = keras.models.Model(inputs, state[0])
+
+ inputs = np.random.random((num_samples, timesteps, embedding_dim))
+ state = model.predict(inputs)
+ self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4)
+
+ def test_state_reuse(self):
+ timesteps = 3
+ embedding_dim = 4
+ units = 3
+ num_samples = 2
+
+ with self.test_session():
+ inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+ layer = keras.layers.LSTM(units, return_state=True, return_sequences=True)
+ outputs = layer(inputs)
+ output, state = outputs[0], outputs[1:]
+ output = keras.layers.LSTM(units)(output, initial_state=state)
+ model = keras.models.Model(inputs, output)
+
+ inputs = np.random.random((num_samples, timesteps, embedding_dim))
+ outputs = model.predict(inputs)
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/noise.py b/tensorflow/contrib/keras/python/keras/layers/noise.py
index adc88a4fce..e3cfa1f711 100644
--- a/tensorflow/contrib/keras/python/keras/layers/noise.py
+++ b/tensorflow/contrib/keras/python/keras/layers/noise.py
@@ -109,3 +109,65 @@ class GaussianDropout(Layer):
config = {'rate': self.rate}
base_config = super(GaussianDropout, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
+
+
+class AlphaDropout(Layer):
+ """Applies Alpha Dropout to the input.
+
+  Alpha Dropout is a `Dropout` variant that keeps the mean and variance of
+  its inputs at their original values, in order to preserve the
+  self-normalizing property even after dropout is applied.
+  Alpha Dropout pairs well with Scaled Exponential Linear Units (SELU)
+  because it randomly sets activations to the negative saturation value.
+
+ Arguments:
+ rate: float, drop probability (as with `Dropout`).
+ The multiplicative noise will have
+ standard deviation `sqrt(rate / (1 - rate))`.
+ seed: A Python integer to use as random seed.
+
+ Input shape:
+ Arbitrary. Use the keyword argument `input_shape`
+ (tuple of integers, does not include the samples axis)
+ when using this layer as the first layer in a model.
+
+ Output shape:
+ Same shape as input.
+
+ References:
+ - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+ """
+
+ def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
+ super(AlphaDropout, self).__init__(**kwargs)
+ self.rate = rate
+ self.noise_shape = noise_shape
+ self.seed = seed
+ self.supports_masking = True
+
+ def _get_noise_shape(self, inputs):
+ return self.noise_shape if self.noise_shape else K.shape(inputs)
+
+ def call(self, inputs, training=None):
+ if 0. < self.rate < 1.:
+ noise_shape = self._get_noise_shape(inputs)
+ alpha = 1.6732632423543772848170429916717
+ scale = 1.0507009873554804934193349852946
+
+ def dropped_inputs(inputs=inputs, rate=self.rate, seed=self.seed):
+ alpha_p = -alpha * scale
+ kept_idx = K.greater_equal(K.random_uniform(noise_shape, seed=seed),
+ rate)
+ kept_idx = K.cast(kept_idx, K.floatx())
+ a = ((1 - rate) * (1 + rate * alpha_p ** 2)) ** -0.5
+ b = -a * alpha_p * rate
+ x = inputs * kept_idx + alpha_p * (1 - kept_idx)
+ return a * x + b
+
+ return K.in_train_phase(dropped_inputs, inputs, training=training)
+ return inputs
+
+ def get_config(self):
+ config = {'rate': self.rate}
+ base_config = super(AlphaDropout, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
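
The new `AlphaDropout` layer is meant to be paired with the SELU activation: the affine correction `a * x + b` computed in `dropped_inputs` is what keeps the dropped activations in the self-normalizing regime. A minimal sketch of how it might be used, assuming the `selu` activation and `lecun_normal` initializer from the same backport are available (layer sizes are illustrative):

```python
import numpy as np
from tensorflow.contrib.keras.python import keras

model = keras.models.Sequential()
# Assumes 'selu' / 'lecun_normal' are registered; swap in other values if not.
model.add(keras.layers.Dense(32, activation='selu',
                             kernel_initializer='lecun_normal',
                             input_shape=(16,)))
model.add(keras.layers.AlphaDropout(0.1))  # preserves mean and variance
model.add(keras.layers.Dense(1))
model.compile(optimizer='sgd', loss='mse')

model.train_on_batch(np.random.random((8, 16)), np.random.random((8, 1)))
```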
diff --git a/tensorflow/contrib/keras/python/keras/layers/noise_test.py b/tensorflow/contrib/keras/python/keras/layers/noise_test.py
index b0257b167a..8fb1339c2e 100644
--- a/tensorflow/contrib/keras/python/keras/layers/noise_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/noise_test.py
@@ -39,6 +39,13 @@ class NoiseLayersTest(test.TestCase):
kwargs={'rate': 0.5},
input_shape=(3, 2, 3))
+ def test_AlphaDropout(self):
+ with self.test_session():
+ testing_utils.layer_test(
+ keras.layers.AlphaDropout,
+ kwargs={'rate': 0.2},
+ input_shape=(3, 2, 3))
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/recurrent.py b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
index cdef55f599..592e5f5e3a 100644
--- a/tensorflow/contrib/keras/python/keras/layers/recurrent.py
+++ b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
@@ -123,6 +123,8 @@ class Recurrent(Layer):
`[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`.
return_sequences: Boolean. Whether to return the last output
in the output sequence, or the full sequence.
+ return_state: Boolean. Whether to return the last state
+ in addition to the output.
go_backwards: Boolean (default False).
If True, process the input sequence backwards and return the
reversed sequence.
@@ -166,6 +168,9 @@ class Recurrent(Layer):
(Optional) 2D tensors with shape `(batch_size, output_dim)`.
Output shape:
+ - if `return_state`: a list of tensors. The first tensor is
+ the output. The remaining tensors are the last states,
+ each with shape `(batch_size, units)`.
- if `return_sequences`: 3D tensor with shape
`(batch_size, timesteps, units)`.
- else, 2D tensor with shape `(batch_size, units)`.
@@ -211,6 +216,7 @@ class Recurrent(Layer):
def __init__(self,
return_sequences=False,
+ return_state=False,
go_backwards=False,
stateful=False,
unroll=False,
@@ -218,6 +224,7 @@ class Recurrent(Layer):
**kwargs):
super(Recurrent, self).__init__(**kwargs)
self.return_sequences = return_sequences
+ self.return_state = return_state
self.go_backwards = go_backwards
self.stateful = stateful
self.unroll = unroll
@@ -233,18 +240,24 @@ class Recurrent(Layer):
input_shape = input_shape[0]
input_shape = tensor_shape.TensorShape(input_shape).as_list()
if self.return_sequences:
- return tensor_shape.TensorShape(
- [input_shape[0], input_shape[1], self.units])
+ output_shape = (input_shape[0], input_shape[1], self.units)
else:
- return tensor_shape.TensorShape([input_shape[0], self.units])
+ output_shape = (input_shape[0], self.units)
+
+ if self.return_state:
+ state_shape = [tensor_shape.TensorShape(
+ (input_shape[0], self.units)) for _ in self.states]
+ return [tensor_shape.TensorShape(output_shape)] + state_shape
+ return tensor_shape.TensorShape(output_shape)
def compute_mask(self, inputs, mask):
- if self.return_sequences:
- if isinstance(mask, list):
- return mask[0]
- return mask
- else:
- return None
+ if isinstance(mask, list):
+ mask = mask[0]
+ output_mask = mask if self.return_sequences else None
+ if self.return_state:
+ state_mask = [None for _ in self.states]
+ return [output_mask] + state_mask
+ return output_mask
def step(self, inputs, states):
raise NotImplementedError
@@ -361,10 +374,16 @@ class Recurrent(Layer):
last_output._uses_learning_phase = True
outputs._uses_learning_phase = True
- if self.return_sequences:
- return outputs
- else:
- return last_output
+ if not self.return_sequences:
+ outputs = last_output
+
+ if self.return_state:
+ if not isinstance(states, (list, tuple)):
+ states = [states]
+ else:
+ states = list(states)
+ return [outputs] + states
+ return outputs
def reset_states(self, states=None):
if not self.stateful:
@@ -406,6 +425,7 @@ class Recurrent(Layer):
def get_config(self):
config = {
'return_sequences': self.return_sequences,
+ 'return_state': self.return_state,
'go_backwards': self.go_backwards,
'stateful': self.stateful,
'unroll': self.unroll,
@@ -601,36 +621,25 @@ class SimpleRNN(Recurrent):
def get_config(self):
config = {
- 'units':
- self.units,
- 'activation':
- activations.serialize(self.activation),
- 'use_bias':
- self.use_bias,
- 'kernel_initializer':
- initializers.serialize(self.kernel_initializer),
+ 'units': self.units,
+ 'activation': activations.serialize(self.activation),
+ 'use_bias': self.use_bias,
+ 'kernel_initializer': initializers.serialize(self.kernel_initializer),
'recurrent_initializer':
initializers.serialize(self.recurrent_initializer),
- 'bias_initializer':
- initializers.serialize(self.bias_initializer),
- 'kernel_regularizer':
- regularizers.serialize(self.kernel_regularizer),
+ 'bias_initializer': initializers.serialize(self.bias_initializer),
+ 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'recurrent_regularizer':
regularizers.serialize(self.recurrent_regularizer),
- 'bias_regularizer':
- regularizers.serialize(self.bias_regularizer),
+ 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer':
regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint':
- constraints.serialize(self.kernel_constraint),
+ 'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint':
constraints.serialize(self.recurrent_constraint),
- 'bias_constraint':
- constraints.serialize(self.bias_constraint),
- 'dropout':
- self.dropout,
- 'recurrent_dropout':
- self.recurrent_dropout
+ 'bias_constraint': constraints.serialize(self.bias_constraint),
+ 'dropout': self.dropout,
+ 'recurrent_dropout': self.recurrent_dropout
}
base_config = super(SimpleRNN, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@@ -902,38 +911,27 @@ class GRU(Recurrent):
def get_config(self):
config = {
- 'units':
- self.units,
- 'activation':
- activations.serialize(self.activation),
+ 'units': self.units,
+ 'activation': activations.serialize(self.activation),
'recurrent_activation':
activations.serialize(self.recurrent_activation),
- 'use_bias':
- self.use_bias,
- 'kernel_initializer':
- initializers.serialize(self.kernel_initializer),
+ 'use_bias': self.use_bias,
+ 'kernel_initializer': initializers.serialize(self.kernel_initializer),
'recurrent_initializer':
initializers.serialize(self.recurrent_initializer),
- 'bias_initializer':
- initializers.serialize(self.bias_initializer),
- 'kernel_regularizer':
- regularizers.serialize(self.kernel_regularizer),
+ 'bias_initializer': initializers.serialize(self.bias_initializer),
+ 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'recurrent_regularizer':
regularizers.serialize(self.recurrent_regularizer),
- 'bias_regularizer':
- regularizers.serialize(self.bias_regularizer),
+ 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer':
regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint':
- constraints.serialize(self.kernel_constraint),
+ 'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint':
constraints.serialize(self.recurrent_constraint),
- 'bias_constraint':
- constraints.serialize(self.bias_constraint),
- 'dropout':
- self.dropout,
- 'recurrent_dropout':
- self.recurrent_dropout
+ 'bias_constraint': constraints.serialize(self.bias_constraint),
+ 'dropout': self.dropout,
+ 'recurrent_dropout': self.recurrent_dropout
}
base_config = super(GRU, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@@ -985,7 +983,7 @@ class LSTM(Recurrent):
References:
- [Long short-term
- memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
+      memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
(original 1997 paper)
- [Supervised sequence labeling with recurrent neural
networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
@@ -1239,40 +1237,28 @@ class LSTM(Recurrent):
def get_config(self):
config = {
- 'units':
- self.units,
- 'activation':
- activations.serialize(self.activation),
+ 'units': self.units,
+ 'activation': activations.serialize(self.activation),
'recurrent_activation':
activations.serialize(self.recurrent_activation),
- 'use_bias':
- self.use_bias,
- 'kernel_initializer':
- initializers.serialize(self.kernel_initializer),
+ 'use_bias': self.use_bias,
+ 'kernel_initializer': initializers.serialize(self.kernel_initializer),
'recurrent_initializer':
initializers.serialize(self.recurrent_initializer),
- 'bias_initializer':
- initializers.serialize(self.bias_initializer),
- 'unit_forget_bias':
- self.unit_forget_bias,
- 'kernel_regularizer':
- regularizers.serialize(self.kernel_regularizer),
+ 'bias_initializer': initializers.serialize(self.bias_initializer),
+ 'unit_forget_bias': self.unit_forget_bias,
+ 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'recurrent_regularizer':
regularizers.serialize(self.recurrent_regularizer),
- 'bias_regularizer':
- regularizers.serialize(self.bias_regularizer),
+ 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer':
regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint':
- constraints.serialize(self.kernel_constraint),
+ 'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint':
constraints.serialize(self.recurrent_constraint),
- 'bias_constraint':
- constraints.serialize(self.bias_constraint),
- 'dropout':
- self.dropout,
- 'recurrent_dropout':
- self.recurrent_dropout
+ 'bias_constraint': constraints.serialize(self.bias_constraint),
+ 'dropout': self.dropout,
+ 'recurrent_dropout': self.recurrent_dropout
}
base_config = super(LSTM, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
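
With `return_state=True` a recurrent layer now returns `[output] + states`; for `LSTM` that is the output followed by the hidden state and the cell state, which can seed another layer through `initial_state` (the same pattern exercised by `test_state_reuse` above). A functional-API sketch with illustrative dimensions:

```python
from tensorflow.contrib.keras.python import keras

encoder_in = keras.layers.Input(shape=(5, 8))
# LSTM carries two states: the hidden state h and the cell state c.
encoder_out, state_h, state_c = keras.layers.LSTM(
    16, return_state=True)(encoder_in)

decoder_in = keras.layers.Input(shape=(5, 8))
# The returned states initialize a second recurrent layer.
decoder_out = keras.layers.LSTM(16)(decoder_in,
                                    initial_state=[state_h, state_c])

model = keras.models.Model([encoder_in, decoder_in], decoder_out)
```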
diff --git a/tensorflow/contrib/keras/python/keras/layers/wrappers.py b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
index dbc79fb193..91614c288d 100644
--- a/tensorflow/contrib/keras/python/keras/layers/wrappers.py
+++ b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
@@ -24,8 +24,8 @@ import copy
from tensorflow.contrib.keras.python.keras import backend as K
from tensorflow.contrib.keras.python.keras.engine import InputSpec
from tensorflow.contrib.keras.python.keras.engine import Layer
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import has_arg
from tensorflow.python.framework import tensor_shape
-from tensorflow.python.util import tf_inspect
class Wrapper(Layer):
@@ -183,15 +183,29 @@ class TimeDistributed(Wrapper):
return tensor_shape.TensorShape([child_output_shape[0], timesteps] +
child_output_shape[1:])
- def call(self, inputs, mask=None):
+ def call(self, inputs, training=None, mask=None):
+ kwargs = {}
+ if has_arg(self.layer.call, 'training'):
+ kwargs['training'] = training
+ uses_learning_phase = False # pylint: disable=redefined-outer-name
+
input_shape = K.int_shape(inputs)
if input_shape[0]:
# batch size matters, use rnn-based implementation
def step(x, _):
- output = self.layer.call(x)
+ global uses_learning_phase # pylint: disable=global-variable-undefined
+ output = self.layer.call(x, **kwargs)
+ if hasattr(output, '_uses_learning_phase'):
+ uses_learning_phase = (output._uses_learning_phase or
+ uses_learning_phase)
return output, []
- _, outputs, _ = K.rnn(step, inputs, initial_states=[], unroll=False)
+ _, outputs, _ = K.rnn(
+ step,
+ inputs,
+ initial_states=[],
+ input_length=input_shape[1],
+ unroll=False)
y = outputs
else:
# No batch size specified, therefore the layer will be able
@@ -202,16 +216,22 @@ class TimeDistributed(Wrapper):
input_length = K.shape(inputs)[1]
# Shape: (num_samples * timesteps, ...)
inputs = K.reshape(inputs, (-1,) + input_shape[2:])
- y = self.layer.call(inputs) # (num_samples * timesteps, ...)
+ # (num_samples * timesteps, ...)
+ y = self.layer.call(inputs, **kwargs)
+ if hasattr(y, '_uses_learning_phase'):
+ uses_learning_phase = y._uses_learning_phase
# Shape: (num_samples, timesteps, ...)
- output_shape = self._compute_output_shape(input_shape).as_list() # pylint: disable=protected-access
- y = K.reshape(y, [-1, input_length] + output_shape[2:])
+ output_shape = self._compute_output_shape(input_shape).as_list()
+ y = K.reshape(y, (-1, input_length) + tuple(output_shape[2:]))
# Apply activity regularizer if any:
if (hasattr(self.layer, 'activity_regularizer') and
self.layer.activity_regularizer is not None):
regularization_loss = self.layer.activity_regularizer(y)
self.add_loss(regularization_loss, inputs)
+
+ if uses_learning_phase:
+ y._uses_learning_phase = True
return y
@@ -285,10 +305,9 @@ class Bidirectional(Wrapper):
def call(self, inputs, training=None, mask=None):
kwargs = {}
- func_args = tf_inspect.getargspec(self.layer.call).args
- if 'training' in func_args:
+ if has_arg(self.layer.call, 'training'):
kwargs['training'] = training
- if 'mask' in func_args:
+ if has_arg(self.layer.call, 'mask'):
kwargs['mask'] = mask
y = self.forward_layer.call(inputs, **kwargs)
diff --git a/tensorflow/contrib/keras/python/keras/layers/wrappers_test.py b/tensorflow/contrib/keras/python/keras/layers/wrappers_test.py
index b892681ada..d4cd1ccbb4 100644
--- a/tensorflow/contrib/keras/python/keras/layers/wrappers_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/wrappers_test.py
@@ -113,6 +113,16 @@ class TimeDistributedTest(test.TestCase):
model.compile(optimizer='rmsprop', loss='mse')
self.assertEqual(len(model.losses), 1)
+ def test_TimeDistributed_learning_phase(self):
+ # test layers that need learning_phase to be set
+ np.random.seed(1234)
+ x = keras.layers.Input(shape=(3, 2))
+ y = keras.layers.TimeDistributed(
+ keras.layers.Dropout(.999))(x, training=True)
+ model = keras.models.Model(x, y)
+ y = model.predict(np.random.random((10, 3, 2)))
+ self.assertAllClose(np.mean(y), 0., atol=1e-1, rtol=1e-1)
+
class BidirectionalTest(test.TestCase):
diff --git a/tensorflow/contrib/keras/python/keras/metrics.py b/tensorflow/contrib/keras/python/keras/metrics.py
index 93c8684f91..999e9cb9d4 100644
--- a/tensorflow/contrib/keras/python/keras/metrics.py
+++ b/tensorflow/contrib/keras/python/keras/metrics.py
@@ -59,6 +59,11 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5):
return K.mean(K.in_top_k(y_pred, K.argmax(y_true, axis=-1), k), axis=-1)
+def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
+ return K.mean(K.in_top_k(y_pred,
+ K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
+
+
# Aliases
mse = MSE = mean_squared_error
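
`sparse_top_k_categorical_accuracy` mirrors `top_k_categorical_accuracy` but takes integer class indices rather than one-hot targets, matching the sparse losses. A sketch of wiring it into `compile` (the model and data shapes are illustrative only):

```python
import numpy as np
from tensorflow.contrib.keras.python import keras

model = keras.models.Sequential()
model.add(keras.layers.Dense(10, activation='softmax', input_shape=(4,)))
model.compile(optimizer='sgd',
              loss='sparse_categorical_crossentropy',
              # Targets are integer class indices, not one-hot vectors.
              metrics=[keras.metrics.sparse_top_k_categorical_accuracy])

x = np.random.random((8, 4))
y = np.random.randint(0, 10, size=(8, 1))
model.train_on_batch(x, y)
```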
diff --git a/tensorflow/contrib/keras/python/keras/metrics_test.py b/tensorflow/contrib/keras/python/keras/metrics_test.py
index ac0a1372c6..84c6528174 100644
--- a/tensorflow/contrib/keras/python/keras/metrics_test.py
+++ b/tensorflow/contrib/keras/python/keras/metrics_test.py
@@ -42,6 +42,21 @@ class KerasMetricsTest(test.TestCase):
y_b = keras.backend.variable(np.random.random((6, 7)))
self.assertEqual(keras.backend.eval(metric(y_a, y_b)).shape, (6,))
+ def test_sparse_top_k_categorical_accuracy(self):
+ with self.test_session():
+ y_pred = keras.backend.variable(np.array([[0.3, 0.2, 0.1],
+ [0.1, 0.2, 0.7]]))
+ y_true = keras.backend.variable(np.array([[1], [0]]))
+ result = keras.backend.eval(
+ keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
+ self.assertEqual(result, 1)
+ result = keras.backend.eval(
+ keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
+ self.assertEqual(result, 0.5)
+ result = keras.backend.eval(
+ keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
+ self.assertEqual(result, 0.)
+
def test_top_k_categorical_accuracy(self):
with self.test_session():
y_pred = keras.backend.variable(np.array([[0.3, 0.2, 0.1],
diff --git a/tensorflow/contrib/keras/python/keras/models.py b/tensorflow/contrib/keras/python/keras/models.py
index 8786e0b97a..8864f5e69d 100644
--- a/tensorflow/contrib/keras/python/keras/models.py
+++ b/tensorflow/contrib/keras/python/keras/models.py
@@ -97,7 +97,10 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
# if obj is any numpy type
if type(obj).__module__ == np.__name__:
- return obj.item()
+ if isinstance(obj, np.ndarray):
+ return {'type': type(obj), 'value': obj.tolist()}
+ else:
+ return obj.item()
# misc functions (e.g. loss function)
if callable(obj):
@@ -232,89 +235,79 @@ def load_model(filepath, custom_objects=None, compile=True): # pylint: disable=
if isinstance(obj, list):
deserialized = []
for value in obj:
- if value in custom_objects:
- deserialized.append(custom_objects[value])
- else:
- deserialized.append(value)
+ deserialized.append(convert_custom_objects(value))
return deserialized
if isinstance(obj, dict):
deserialized = {}
for key, value in obj.items():
- deserialized[key] = []
- if isinstance(value, list):
- for element in value:
- if element in custom_objects:
- deserialized[key].append(custom_objects[element])
- else:
- deserialized[key].append(element)
- elif value in custom_objects:
- deserialized[key] = custom_objects[value]
- else:
- deserialized[key] = value
+ deserialized[key] = convert_custom_objects(value)
return deserialized
if obj in custom_objects:
return custom_objects[obj]
return obj
- f = h5py.File(filepath, mode='r')
-
- # instantiate model
- model_config = f.attrs.get('model_config')
- if model_config is None:
- raise ValueError('No model found in config file.')
- model_config = json.loads(model_config.decode('utf-8'))
- model = model_from_config(model_config, custom_objects=custom_objects)
-
- # set weights
- topology.load_weights_from_hdf5_group(f['model_weights'], model.layers)
-
- # Early return if compilation is not required.
- if not compile:
- f.close()
- return model
-
- # instantiate optimizer
- training_config = f.attrs.get('training_config')
- if training_config is None:
- logging.warning('No training configuration found in save file: '
- 'the model was *not* compiled. Compile it manually.')
- f.close()
- return model
- training_config = json.loads(training_config.decode('utf-8'))
- optimizer_config = training_config['optimizer_config']
- optimizer = optimizers.deserialize(
- optimizer_config, custom_objects=custom_objects)
-
- # Recover loss functions and metrics.
- loss = convert_custom_objects(training_config['loss'])
- metrics = convert_custom_objects(training_config['metrics'])
- sample_weight_mode = training_config['sample_weight_mode']
- loss_weights = training_config['loss_weights']
-
- # Compile model.
- model.compile(
- optimizer=optimizer,
- loss=loss,
- metrics=metrics,
- loss_weights=loss_weights,
- sample_weight_mode=sample_weight_mode)
-
- # Set optimizer weights.
- if 'optimizer_weights' in f:
- # Build train function (to get weight updates).
- if isinstance(model, Sequential):
- model.model._make_train_function()
- else:
- model._make_train_function()
- optimizer_weights_group = f['optimizer_weights']
- optimizer_weight_names = [
- n.decode('utf8') for n in optimizer_weights_group.attrs['weight_names']
- ]
- optimizer_weight_values = [
- optimizer_weights_group[n] for n in optimizer_weight_names
- ]
- model.optimizer.set_weights(optimizer_weight_values)
- f.close()
+ with h5py.File(filepath, mode='r') as f:
+ # instantiate model
+ model_config = f.attrs.get('model_config')
+ if model_config is None:
+ raise ValueError('No model found in config file.')
+ model_config = json.loads(model_config.decode('utf-8'))
+ model = model_from_config(model_config, custom_objects=custom_objects)
+
+ # set weights
+ topology.load_weights_from_hdf5_group(f['model_weights'], model.layers)
+
+ # Early return if compilation is not required.
+ if not compile:
+ return model
+
+ # instantiate optimizer
+ training_config = f.attrs.get('training_config')
+ if training_config is None:
+ logging.warning('No training configuration found in save file: '
+ 'the model was *not* compiled. Compile it manually.')
+ return model
+ training_config = json.loads(training_config.decode('utf-8'))
+ optimizer_config = training_config['optimizer_config']
+ optimizer = optimizers.deserialize(
+ optimizer_config, custom_objects=custom_objects)
+
+ # Recover loss functions and metrics.
+ loss = convert_custom_objects(training_config['loss'])
+ metrics = convert_custom_objects(training_config['metrics'])
+ sample_weight_mode = training_config['sample_weight_mode']
+ loss_weights = training_config['loss_weights']
+
+ # Compile model.
+ model.compile(
+ optimizer=optimizer,
+ loss=loss,
+ metrics=metrics,
+ loss_weights=loss_weights,
+ sample_weight_mode=sample_weight_mode)
+
+ # Set optimizer weights.
+ if 'optimizer_weights' in f:
+ # Build train function (to get weight updates).
+ if isinstance(model, Sequential):
+ model.model._make_train_function()
+ else:
+ model._make_train_function()
+ optimizer_weights_group = f['optimizer_weights']
+ optimizer_weight_names = [
+ n.decode('utf8')
+ for n in optimizer_weights_group.attrs['weight_names']
+ ]
+ optimizer_weight_values = [
+ optimizer_weights_group[n] for n in optimizer_weight_names
+ ]
+ try:
+ model.optimizer.set_weights(optimizer_weight_values)
+ except ValueError:
+ logging.warning('Error in loading the saved optimizer '
+ 'state. As a result, your model is '
+ 'starting with a freshly initialized '
+ 'optimizer.')
return model
@@ -331,7 +324,7 @@ def model_from_config(config, custom_objects=None):
A Keras model instance (uncompiled).
Raises:
- TypeError if `config` is not a dictionary
+ TypeError: if `config` is not a dictionary.
"""
if isinstance(config, list):
raise TypeError('`model_from_config` expects a dictionary, not a list. '
@@ -1018,10 +1011,11 @@ class Sequential(Model):
validation_data=None,
validation_steps=None,
class_weight=None,
- max_q_size=10,
+ max_queue_size=10,
workers=1,
- pickle_safe=False,
- initial_epoch=0):
+ use_multiprocessing=False,
+ initial_epoch=0,
+ **kwargs):
"""Fits the model on data generated batch-by-batch by a Python generator.
The generator is run in parallel to the model, for efficiency.
@@ -1057,9 +1051,9 @@ class Sequential(Model):
validation dataset divided by the batch size.
class_weight: Dictionary mapping class indices to a weight
for the class.
- max_q_size: Maximum size for the generator queue
+ max_queue_size: Maximum size for the generator queue
workers: Maximum number of processes to spin up
- pickle_safe: Ff True, use process based threading.
+ use_multiprocessing: If True, use process based threading.
Note that because
this implementation relies on multiprocessing,
you should not pass
@@ -1068,12 +1062,15 @@ class Sequential(Model):
easily to children processes.
initial_epoch: Epoch at which to start training
(useful for resuming a previous training run)
+ **kwargs: support for legacy arguments.
Returns:
A `History` object.
Raises:
RuntimeError: if the model was never compiled.
+ ValueError: In case the generator yields
+ data in an invalid format.
Example:
@@ -1092,6 +1089,19 @@ class Sequential(Model):
steps_per_epoch=1000, epochs=10)
```
"""
+ # Legacy support
+ if 'max_q_size' in kwargs:
+ max_queue_size = kwargs.pop('max_q_size')
+ logging.warning('The argument `max_q_size` has been renamed '
+ '`max_queue_size`. Update your method calls accordingly.')
+ if 'pickle_safe' in kwargs:
+ use_multiprocessing = kwargs.pop('pickle_safe')
+ logging.warning('The argument `pickle_safe` has been renamed '
+ '`use_multiprocessing`. '
+ 'Update your method calls accordingly.')
+ if kwargs:
+ raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
if self.model is None:
raise RuntimeError('The model needs to be compiled ' 'before being used.')
return self.model.fit_generator(
@@ -1103,17 +1113,18 @@ class Sequential(Model):
validation_data=validation_data,
validation_steps=validation_steps,
class_weight=class_weight,
- max_q_size=max_q_size,
+ max_queue_size=max_queue_size,
workers=workers,
- pickle_safe=pickle_safe,
+ use_multiprocessing=use_multiprocessing,
initial_epoch=initial_epoch)
def evaluate_generator(self,
generator,
steps,
- max_q_size=10,
+ max_queue_size=10,
workers=1,
- pickle_safe=False):
+ use_multiprocessing=False,
+ **kwargs):
"""Evaluates the model on a data generator.
The generator should return the same kind of data
@@ -1124,13 +1135,14 @@ class Sequential(Model):
or (inputs, targets, sample_weights)
steps: Total number of steps (batches of samples)
to yield from `generator` before stopping.
- max_q_size: maximum size for the generator queue
+ max_queue_size: maximum size for the generator queue
workers: maximum number of processes to spin up
- pickle_safe: if True, use process based threading.
+ use_multiprocessing: if True, use process based threading.
Note that because this implementation
relies on multiprocessing, you should not pass
non picklable arguments to the generator
as they can't be passed easily to children processes.
+ **kwargs: support for legacy arguments.
Returns:
Scalar test loss (if the model has no metrics)
@@ -1140,23 +1152,39 @@ class Sequential(Model):
Raises:
RuntimeError: if the model was never compiled.
+ ValueError: In case the generator yields
+ data in an invalid format.
"""
+ # Legacy support
+ if 'max_q_size' in kwargs:
+ max_queue_size = kwargs.pop('max_q_size')
+ logging.warning('The argument `max_q_size` has been renamed '
+ '`max_queue_size`. Update your method calls accordingly.')
+ if 'pickle_safe' in kwargs:
+ use_multiprocessing = kwargs.pop('pickle_safe')
+ logging.warning('The argument `pickle_safe` has been renamed '
+ '`use_multiprocessing`. '
+ 'Update your method calls accordingly.')
+ if kwargs:
+ raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
if self.model is None:
raise RuntimeError('The model needs to be compiled ' 'before being used.')
return self.model.evaluate_generator(
generator,
steps,
- max_q_size=max_q_size,
+ max_queue_size=max_queue_size,
workers=workers,
- pickle_safe=pickle_safe)
+ use_multiprocessing=use_multiprocessing)
def predict_generator(self,
generator,
steps,
- max_q_size=10,
+ max_queue_size=10,
workers=1,
- pickle_safe=False,
- verbose=0):
+ use_multiprocessing=False,
+ verbose=0,
+ **kwargs):
"""Generates predictions for the input samples from a data generator.
The generator should return the same kind of data as accepted by
@@ -1166,26 +1194,44 @@ class Sequential(Model):
generator: generator yielding batches of input samples.
steps: Total number of steps (batches of samples)
to yield from `generator` before stopping.
- max_q_size: maximum size for the generator queue
+ max_queue_size: maximum size for the generator queue
workers: maximum number of processes to spin up
- pickle_safe: if True, use process based threading.
+ use_multiprocessing: if True, use process based threading.
Note that because this implementation
relies on multiprocessing, you should not pass
non picklable arguments to the generator
as they can't be passed easily to children processes.
verbose: verbosity mode, 0 or 1.
+ **kwargs: support for legacy arguments.
Returns:
A Numpy array of predictions.
+
+ Raises:
+ ValueError: In case the generator yields
+ data in an invalid format.
"""
+ # Legacy support
+ if 'max_q_size' in kwargs:
+ max_queue_size = kwargs.pop('max_q_size')
+ logging.warning('The argument `max_q_size` has been renamed '
+ '`max_queue_size`. Update your method calls accordingly.')
+ if 'pickle_safe' in kwargs:
+ use_multiprocessing = kwargs.pop('pickle_safe')
+ logging.warning('The argument `pickle_safe` has been renamed '
+ '`use_multiprocessing`. '
+ 'Update your method calls accordingly.')
+ if kwargs:
+ raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
if self.model is None:
self.build()
return self.model.predict_generator(
generator,
steps,
- max_q_size=max_q_size,
+ max_queue_size=max_queue_size,
workers=workers,
- pickle_safe=pickle_safe,
+ use_multiprocessing=use_multiprocessing,
verbose=verbose)
def get_config(self):
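
After this rename the canonical keyword names for all three generator methods are `max_queue_size` and `use_multiprocessing`; the old `max_q_size` / `pickle_safe` spellings are still accepted but only trigger a deprecation warning. A call-site sketch (model and generator are placeholders):

```python
import numpy as np
from tensorflow.contrib.keras.python import keras

model = keras.models.Sequential()
model.add(keras.layers.Dense(1, input_shape=(10,)))
model.compile(optimizer='sgd', loss='mse')

def batch_generator():
  # Placeholder generator: yields (inputs, targets) tuples forever.
  while True:
    yield np.random.random((32, 10)), np.random.random((32, 1))

# New-style keyword names; passing max_q_size / pickle_safe instead would
# still work but logs a "has been renamed" warning.
model.fit_generator(batch_generator(),
                    steps_per_epoch=5,
                    epochs=1,
                    max_queue_size=10,
                    workers=1,
                    use_multiprocessing=False)
```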
diff --git a/tensorflow/contrib/keras/python/keras/models_test.py b/tensorflow/contrib/keras/python/keras/models_test.py
index 99fd6e1cbe..f7246097ee 100644
--- a/tensorflow/contrib/keras/python/keras/models_test.py
+++ b/tensorflow/contrib/keras/python/keras/models_test.py
@@ -163,6 +163,27 @@ class TestModelSaving(test.TestCase):
model = keras.models.load_model(fname)
os.remove(fname)
+ def test_saving_lambda_numpy_array_arguments(self):
+ if h5py is None:
+ return # Skip test if models cannot be saved.
+
+ mean = np.random.random((4, 2, 3))
+ std = np.abs(np.random.random((4, 2, 3))) + 1e-5
+ inputs = keras.layers.Input(shape=(4, 2, 3))
+ output = keras.layers.Lambda(lambda image, mu, std: (image - mu) / std,
+ arguments={'mu': mean, 'std': std})(inputs)
+ model = keras.models.Model(inputs, output)
+ model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
+
+ _, fname = tempfile.mkstemp('.h5')
+ keras.models.save_model(model, fname)
+
+ model = keras.models.load_model(fname)
+ os.remove(fname)
+
+ self.assertAllClose(mean, model.layers[1].arguments['mu'])
+ self.assertAllClose(std, model.layers[1].arguments['std'])
+
class TestSequential(test.TestCase):
"""Most Sequential model API tests are covered in `training_test.py`.
diff --git a/tensorflow/contrib/keras/python/keras/optimizers.py b/tensorflow/contrib/keras/python/keras/optimizers.py
index 75fce5c96f..a1bd3be026 100644
--- a/tensorflow/contrib/keras/python/keras/optimizers.py
+++ b/tensorflow/contrib/keras/python/keras/optimizers.py
@@ -18,18 +18,49 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import copy
+
import six
from six.moves import zip # pylint: disable=redefined-builtin
from tensorflow.contrib.keras.python.keras import backend as K
from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
from tensorflow.contrib.keras.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
from tensorflow.python.training import optimizer as tf_optimizer_module
def clip_norm(g, c, n):
+ """Clip a tensor by norm.
+
+ Arguments:
+ g: gradient tensor to clip.
+ c: clipping threshold.
+ n: norm of gradient tensor.
+
+ Returns:
+ Clipped gradient tensor.
+ """
if c > 0:
- g = K.switch(n >= c, g * c / n, g)
+ condition = n >= c
+ then_expression = lambda: math_ops.scalar_mul(c / n, g)
+ else_expression = lambda: g
+
+ # saving the shape to avoid converting sparse tensor to dense
+ if isinstance(g, ops.Tensor):
+ g_shape = copy.copy(g.get_shape())
+ elif isinstance(g, ops.IndexedSlices):
+ g_shape = copy.copy(g.dense_shape)
+ if condition.dtype != dtypes_module.bool:
+ condition = math_ops.cast(condition, 'bool')
+ g = control_flow_ops.cond(condition, then_expression, else_expression)
+ if isinstance(g, ops.Tensor):
+ g.set_shape(g_shape)
+ elif isinstance(g, ops.IndexedSlices):
+ g._dense_shape = g_shape # pylint: disable=protected-access
return g
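
`clip_norm` itself is internal; users reach it through the `clipnorm` (and, for element-wise clipping, `clipvalue`) arguments accepted by the built-in optimizers. A short sketch of the user-facing knobs:

```python
from tensorflow.contrib.keras.python import keras

# Each gradient tensor whose L2 norm exceeds 1.0 is rescaled to norm 1.0 via
# clip_norm(); clipvalue instead clips every gradient element to [-0.5, 0.5].
sgd = keras.optimizers.SGD(lr=0.01, momentum=0.9, clipnorm=1.0)
adam = keras.optimizers.Adam(clipvalue=0.5)
```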
diff --git a/tensorflow/contrib/keras/python/keras/optimizers_test.py b/tensorflow/contrib/keras/python/keras/optimizers_test.py
index af5e3c99b9..bb598f3037 100644
--- a/tensorflow/contrib/keras/python/keras/optimizers_test.py
+++ b/tensorflow/contrib/keras/python/keras/optimizers_test.py
@@ -23,6 +23,7 @@ import numpy as np
from tensorflow.contrib.keras.python import keras
from tensorflow.contrib.keras.python.keras import testing_utils
from tensorflow.python.platform import test
+from tensorflow.python.training.adam import AdamOptimizer
def _get_model(input_dim, num_hidden, output_dim):
@@ -103,5 +104,28 @@ class KerasOptimizersTest(test.TestCase):
momentum=0.9,
clipvalue=0.5))
+ def test_tfoptimizer(self):
+ optimizer = keras.optimizers.TFOptimizer(AdamOptimizer)
+ model = keras.models.Sequential()
+ model.add(keras.layers.Dense(
+ 2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1)))
+ # This is possible
+ model.compile(loss='mean_squared_error', optimizer=optimizer)
+ # TF optimizers do not support weights constraints
+ with self.assertRaises(ValueError):
+ model.fit(np.random.random((5, 3)),
+ np.random.random((5, 2)),
+ epochs=1,
+ batch_size=5,
+ verbose=0)
+ # not supported
+ with self.assertRaises(NotImplementedError):
+ _ = optimizer.weights
+ with self.assertRaises(NotImplementedError):
+ optimizer.get_config()
+ with self.assertRaises(NotImplementedError):
+ optimizer.from_config(None)
+
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/image.py b/tensorflow/contrib/keras/python/keras/preprocessing/image.py
index 0d69396e8b..4f2cff804e 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/image.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/image.py
@@ -21,6 +21,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from functools import partial
+import multiprocessing.pool
import os
import re
import threading
@@ -178,7 +180,7 @@ def random_zoom(x,
ValueError: if `zoom_range` isn't a tuple.
"""
if len(zoom_range) != 2:
- raise ValueError('zoom_range should be a tuple or list of two floats. '
+ raise ValueError('`zoom_range` should be a tuple or list of two floats. '
'Received arg: ', zoom_range)
if zoom_range[0] == 1 and zoom_range[1] == 1:
@@ -408,8 +410,9 @@ class ImageDataGenerator(object):
horizontal_flip: whether to randomly flip images horizontally.
vertical_flip: whether to randomly flip images vertically.
rescale: rescaling factor. If None or 0, no rescaling is applied,
- otherwise we multiply the data by the value provided
- (before applying any other transformation).
+ otherwise we multiply the data by the value provided. This is
+ applied after the `preprocessing_function` (if any provided)
+ but before any other transformation.
      preprocessing_function: function that will be applied on each input.
The function will run before any other modification on it.
The function should take one argument:
@@ -466,8 +469,8 @@ class ImageDataGenerator(object):
if data_format not in {'channels_last', 'channels_first'}:
raise ValueError(
- 'data_format should be "channels_last" (channel after row and '
- 'column) or "channels_first" (channel before row and column). '
+ '`data_format` should be `"channels_last"` (channel after row and '
+ 'column) or `"channels_first"` (channel before row and column). '
'Received arg: ', data_format)
self.data_format = data_format
if data_format == 'channels_first':
@@ -488,7 +491,7 @@ class ImageDataGenerator(object):
elif len(zoom_range) == 2:
self.zoom_range = [zoom_range[0], zoom_range[1]]
else:
- raise ValueError('zoom_range should be a float or '
+ raise ValueError('`zoom_range` should be a float or '
'a tuple or list of two floats. '
'Received arg: ', zoom_range)
@@ -590,11 +593,12 @@ class ImageDataGenerator(object):
'first by calling `.fit(numpy_data)`.')
return x
- def random_transform(self, x):
+ def random_transform(self, x, seed=None):
"""Randomly augment a single image tensor.
Arguments:
x: 3D tensor, single image.
+ seed: random seed.
Returns:
A randomly transformed version of the input (same shape).
@@ -610,6 +614,9 @@ class ImageDataGenerator(object):
img_col_axis = self.col_axis - 1
img_channel_axis = self.channel_axis - 1
+ if seed is not None:
+ np.random.seed(seed)
+
# use composition of homographies
# to generate final transform that needs to be applied
if self.rotation_range:
@@ -709,8 +716,8 @@ class ImageDataGenerator(object):
if x.ndim != 4:
raise ValueError('Input to `.fit()` should have rank 4. '
'Got array with shape: ' + str(x.shape))
- if x.shape[self.channel_axis] not in {1, 3, 4}:
- raise ValueError(
+ if x.shape[self.channel_axis] not in {3, 4}:
+ logging.warning(
'Expected input to be images (as Numpy array) '
'following the data format convention "' + self.data_format + '" '
'(channels on axis ' + str(self.channel_axis) + '), i.e. expected '
@@ -911,6 +918,81 @@ class NumpyArrayIterator(Iterator):
return batch_x, batch_y
+def _count_valid_files_in_directory(directory, white_list_formats,
+ follow_links):
+ """Count files with extension in `white_list_formats` in a directory.
+
+ Arguments:
+ directory: absolute path to the directory containing files to be counted
+ white_list_formats: set of strings containing allowed extensions for
+ the files to be counted.
+ follow_links: boolean.
+
+ Returns:
+ the count of files with extension in `white_list_formats` contained in
+ the directory.
+ """
+
+ def _recursive_list(subpath):
+ return sorted(
+ os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
+
+ samples = 0
+ for _, _, files in _recursive_list(directory):
+ for fname in files:
+ is_valid = False
+ for extension in white_list_formats:
+ if fname.lower().endswith('.' + extension):
+ is_valid = True
+ break
+ if is_valid:
+ samples += 1
+ return samples
+
+
+def _list_valid_filenames_in_directory(directory, white_list_formats,
+ class_indices, follow_links):
+  """List paths of files in `directory` with extensions in `white_list_formats`.
+
+ Arguments:
+ directory: absolute path to a directory containing the files to list.
+ The directory name is used as class label and must be a key of
+ `class_indices`.
+ white_list_formats: set of strings containing allowed extensions for
+ the files to be counted.
+ class_indices: dictionary mapping a class name to its index.
+ follow_links: boolean.
+
+ Returns:
+ classes: a list of class indices
+ filenames: the path of valid files in `directory`, relative from
+ `directory`'s parent (e.g., if `directory` is "dataset/class1",
+ the filenames will be ["class1/file1.jpg", "class1/file2.jpg", ...]).
+ """
+
+ def _recursive_list(subpath):
+ return sorted(
+ os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
+
+ classes = []
+ filenames = []
+ subdir = os.path.basename(directory)
+ basedir = os.path.dirname(directory)
+ for root, _, files in _recursive_list(directory):
+ for fname in files:
+ is_valid = False
+ for extension in white_list_formats:
+ if fname.lower().endswith('.' + extension):
+ is_valid = True
+ break
+ if is_valid:
+ classes.append(class_indices[subdir])
+ # add filename relative to directory
+ absolute_path = os.path.join(root, fname)
+ filenames.append(os.path.relpath(absolute_path, basedir))
+ return classes, filenames
+
+
class DirectoryIterator(Iterator):
"""Iterator capable of reading images from a directory on disk.
@@ -1007,43 +1089,35 @@ class DirectoryIterator(Iterator):
self.num_class = len(classes)
self.class_indices = dict(zip(classes, range(len(classes))))
- def _recursive_list(subpath):
- return sorted(
- os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
-
- for subdir in classes:
- subpath = os.path.join(directory, subdir)
- for root, _, files in _recursive_list(subpath):
- for fname in files:
- is_valid = False
- for extension in white_list_formats:
- if fname.lower().endswith('.' + extension):
- is_valid = True
- break
- if is_valid:
- self.samples += 1
+ pool = multiprocessing.pool.ThreadPool()
+ function_partial = partial(
+ _count_valid_files_in_directory,
+ white_list_formats=white_list_formats,
+ follow_links=follow_links)
+ self.samples = sum(
+ pool.map(function_partial, (os.path.join(directory, subdir)
+ for subdir in classes)))
+
print('Found %d images belonging to %d classes.' % (self.samples,
self.num_class))
# second, build an index of the images in the different class subfolders
+ results = []
+
self.filenames = []
self.classes = np.zeros((self.samples,), dtype='int32')
i = 0
- for subdir in classes:
- subpath = os.path.join(directory, subdir)
- for root, _, files in _recursive_list(subpath):
- for fname in files:
- is_valid = False
- for extension in white_list_formats:
- if fname.lower().endswith('.' + extension):
- is_valid = True
- break
- if is_valid:
- self.classes[i] = self.class_indices[subdir]
- i += 1
- # add filename relative to directory
- absolute_path = os.path.join(root, fname)
- self.filenames.append(os.path.relpath(absolute_path, directory))
+ for dirpath in (os.path.join(directory, subdir) for subdir in classes):
+ results.append(
+ pool.apply_async(_list_valid_filenames_in_directory, (
+ dirpath, white_list_formats, self.class_indices, follow_links)))
+ for res in results:
+ classes, filenames = res.get()
+ self.classes[i:i + len(classes)] = classes
+ self.filenames += filenames
+ i += len(classes)
+ pool.close()
+ pool.join()
super(DirectoryIterator, self).__init__(self.samples, batch_size, shuffle,
seed)
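
The new `seed` argument on `random_transform` lets two aligned arrays receive the same random augmentation, e.g. an image and its segmentation mask, by seeding both calls identically. A minimal sketch (array contents and shapes are illustrative):

```python
import numpy as np
from tensorflow.contrib.keras.python import keras

datagen = keras.preprocessing.image.ImageDataGenerator(
    rotation_range=20, horizontal_flip=True)

image = np.random.random((32, 32, 3))
mask = np.random.random((32, 32, 3))

# Seeding both calls identically keeps the image and mask geometrically aligned.
seed = 1
image_aug = datagen.random_transform(image, seed=seed)
mask_aug = datagen.random_transform(mask, seed=seed)
```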
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py b/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py
index 0dedf8f850..94768f5258 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py
@@ -99,22 +99,10 @@ class TestImage(test.TestCase):
with self.assertRaises(ValueError):
x = np.random.random((3, 10, 10))
generator.fit(x)
- with self.assertRaises(ValueError):
- x = np.random.random((32, 3, 10, 10))
- generator.fit(x)
- with self.assertRaises(ValueError):
- x = np.random.random((32, 10, 10, 5))
- generator.fit(x)
# Test flow with invalid data
with self.assertRaises(ValueError):
- x = np.random.random((32, 10, 10, 5))
- generator.flow(np.arange(x.shape[0]))
- with self.assertRaises(ValueError):
x = np.random.random((32, 10, 10))
generator.flow(np.arange(x.shape[0]))
- with self.assertRaises(ValueError):
- x = np.random.random((32, 3, 10, 10))
- generator.flow(np.arange(x.shape[0]))
def test_image_data_generator_fit(self):
generator = keras.preprocessing.image.ImageDataGenerator(
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/text.py b/tensorflow/contrib/keras/python/keras/preprocessing/text.py
index 93e629af17..ed00eef6ad 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/text.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/text.py
@@ -21,6 +21,7 @@ from __future__ import division
from __future__ import print_function
from collections import OrderedDict
+from hashlib import md5
import string
import sys
@@ -61,8 +62,45 @@ def one_hot(text,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=True,
split=' '):
+ return hashing_trick(
+ text, n, hash_function=hash, filters=filters, lower=lower, split=split)
+
+
+def hashing_trick(text,
+ n,
+ hash_function=None,
+ filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+ lower=True,
+ split=' '):
+ """Converts a text to a sequence of indexes in a fixed-size hashing space.
+
+ Arguments:
+ text: Input text (string).
+ n: Dimension of the hashing space.
+    hash_function: if `None`, uses the python `hash` function; can be 'md5'
+        or any function that takes a string as input and returns an int.
+ Note that `hash` is not a stable hashing function, so
+ it is not consistent across different runs, while 'md5'
+ is a stable hashing function.
+ filters: Sequence of characters to filter out.
+ lower: Whether to convert the input to lowercase.
+ split: Sentence split marker (string).
+
+ Returns:
+    A list of integer word indices (uniqueness not guaranteed).
+
+ `0` is a reserved index that won't be assigned to any word.
+
+ Two or more words may be assigned to the same index, due to possible
+ collisions by the hashing function.
+ """
+ if hash_function is None:
+ hash_function = hash
+ elif hash_function == 'md5':
+ hash_function = lambda w: int(md5(w.encode()).hexdigest(), 16)
+
seq = text_to_word_sequence(text, filters=filters, lower=lower, split=split)
- return [(abs(hash(w)) % (n - 1) + 1) for w in seq]
+ return [(hash_function(w) % (n - 1) + 1) for w in seq]
class Tokenizer(object):
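
`one_hot` is now a thin wrapper around `hashing_trick` using the built-in `hash`; passing `hash_function='md5'` instead yields indices that are stable across Python runs. A short usage sketch:

```python
from tensorflow.contrib.keras.python import keras

text = 'The quick brown fox jumped over the lazy dog.'

# Indices fall in [1, n - 1]; 0 is reserved, and collisions are possible.
unstable = keras.preprocessing.text.hashing_trick(text, 100)
stable = keras.preprocessing.text.hashing_trick(text, 100, hash_function='md5')
print(stable)
```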
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/text_test.py b/tensorflow/contrib/keras/python/keras/preprocessing/text_test.py
index e94b9019b2..7b26219e61 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/text_test.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/text_test.py
@@ -30,8 +30,8 @@ class TestText(test.TestCase):
text = 'The cat sat on the mat.'
encoded = keras.preprocessing.text.one_hot(text, 5)
self.assertEqual(len(encoded), 6)
- assert np.max(encoded) <= 4
- assert np.min(encoded) >= 0
+ self.assertLessEqual(np.max(encoded), 4)
+ self.assertGreaterEqual(np.min(encoded), 0)
def test_tokenizer(self):
texts = [
@@ -45,7 +45,7 @@ class TestText(test.TestCase):
sequences = []
for seq in tokenizer.texts_to_sequences_generator(texts):
sequences.append(seq)
- assert np.max(np.max(sequences)) < 10
+ self.assertLess(np.max(np.max(sequences)), 10)
self.assertEqual(np.min(np.min(sequences)), 1)
tokenizer.fit_on_sequences(sequences)
@@ -54,6 +54,21 @@ class TestText(test.TestCase):
matrix = tokenizer.texts_to_matrix(texts, mode)
self.assertEqual(matrix.shape, (3, 10))
+ def test_hashing_trick_hash(self):
+ text = 'The cat sat on the mat.'
+ encoded = keras.preprocessing.text.hashing_trick(text, 5)
+ self.assertEqual(len(encoded), 6)
+ self.assertLessEqual(np.max(encoded), 4)
+ self.assertGreaterEqual(np.min(encoded), 1)
+
+ def test_hashing_trick_md5(self):
+ text = 'The cat sat on the mat.'
+ encoded = keras.preprocessing.text.hashing_trick(
+ text, 5, hash_function='md5')
+ self.assertEqual(len(encoded), 6)
+ self.assertLessEqual(np.max(encoded), 4)
+ self.assertGreaterEqual(np.min(encoded), 1)
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/keras/python/keras/utils/__init__.py b/tensorflow/contrib/keras/python/keras/utils/__init__.py
index 68c28ab585..3b197653f3 100644
--- a/tensorflow/contrib/keras/python/keras/utils/__init__.py
+++ b/tensorflow/contrib/keras/python/keras/utils/__init__.py
@@ -23,7 +23,10 @@ from tensorflow.contrib.keras.python.keras.utils import data_utils
from tensorflow.contrib.keras.python.keras.utils import generic_utils
from tensorflow.contrib.keras.python.keras.utils import io_utils
from tensorflow.contrib.keras.python.keras.utils import np_utils
+from tensorflow.contrib.keras.python.keras.utils.data_utils import GeneratorEnqueuer
from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+from tensorflow.contrib.keras.python.keras.utils.data_utils import OrderedEnqueuer
+from tensorflow.contrib.keras.python.keras.utils.data_utils import Sequence
from tensorflow.contrib.keras.python.keras.utils.generic_utils import custom_object_scope
from tensorflow.contrib.keras.python.keras.utils.generic_utils import CustomObjectScope
from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
diff --git a/tensorflow/contrib/keras/python/keras/utils/data_utils.py b/tensorflow/contrib/keras/python/keras/utils/data_utils.py
index 61a11b95e8..9aa477d522 100644
--- a/tensorflow/contrib/keras/python/keras/utils/data_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/data_utils.py
@@ -17,13 +17,21 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from abc import abstractmethod
import hashlib
+import multiprocessing
+import multiprocessing.managers
+from multiprocessing.pool import ThreadPool
import os
+import random
import shutil
import sys
import tarfile
+import threading
+import time
import zipfile
+import numpy as np
import six
from six.moves.urllib.error import HTTPError
from six.moves.urllib.error import URLError
@@ -31,6 +39,11 @@ from six.moves.urllib.request import urlopen
from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
+try:
+ import queue # pylint:disable=g-import-not-at-top
+except ImportError:
+ import Queue as queue # pylint:disable=g-import-not-at-top
+
if sys.version_info[0] == 2:
@@ -300,3 +313,375 @@ def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535):
return True
else:
return False
+
+
+class HolderManager(multiprocessing.managers.BaseManager):
+ """Custom manager to share a Holder object."""
+ pass
+
+
+class Holder(object):
+ """Object to encapsulate a Sequence.
+
+ This allows the Sequence to be shared across multiple workers.
+
+ Arguments:
+ seq: Sequence object to be shared.
+ """
+
+ def __init__(self, seq):
+ self.seq = seq
+
+ def __getitem__(self, idx):
+ return self.seq[idx]
+
+ def __len__(self):
+ return len(self.seq)
+
+
+# Register the Holder class using the ListProxy (allows __len__ and __getitem__)
+HolderManager.register('Holder', Holder, multiprocessing.managers.ListProxy)
+
+
+class Sequence(object):
+ """Base object for fitting to a sequence of data, such as a dataset.
+
+  Every `Sequence` must implement the `__getitem__` and `__len__` methods.
+
+ Examples:
+
+ ```python
+ from skimage.io import imread
+ from skimage.transform import resize
+ import numpy as np
+
+  # Here, `x_set` is a list of paths to the images
+  # and `y_set` are the associated classes.
+
+ class CIFAR10Sequence(Sequence):
+ def __init__(self, x_set, y_set, batch_size):
+ self.X,self.y = x_set,y_set
+ self.batch_size = batch_size
+
+ def __len__(self):
+ return len(self.X) // self.batch_size
+
+ def __getitem__(self,idx):
+ batch_x = self.X[idx*self.batch_size:(idx+1)*self.batch_size]
+ batch_y = self.y[idx*self.batch_size:(idx+1)*self.batch_size]
+
+ return np.array([
+ resize(imread(file_name), (200,200))
+ for file_name in batch_x]), np.array(batch_y)
+ ```
+ """
+
+ @abstractmethod
+ def __getitem__(self, index):
+ """Gets batch at position `index`.
+
+ Arguments:
+ index: position of the batch in the Sequence.
+
+ Returns:
+ A batch.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def __len__(self):
+ """Number of batch in the Sequence.
+
+ Returns:
+ The number of batches in the Sequence.
+ """
+ raise NotImplementedError
+
+
+def get_index(ds, i):
+ """Quick fix for Python2, otherwise, it cannot be pickled.
+
+ Arguments:
+ ds: a Holder or Sequence object.
+ i: index
+
+ Returns:
+ The value at index `i`.
+ """
+ return ds[i]
+
+
+class SequenceEnqueuer(object):
+ """Base class to enqueue inputs.
+
+ The task of an Enqueuer is to use parallelism to speed up preprocessing.
+ This is done with processes or threads.
+
+ Examples:
+
+ ```python
+ enqueuer = SequenceEnqueuer(...)
+ enqueuer.start()
+ datas = enqueuer.get()
+ for data in datas:
+ # Use the inputs; training, evaluating, predicting.
+ # ... stop sometime.
+ enqueuer.close()
+ ```
+
+ The `enqueuer.get()` call should return an infinite stream of data.
+
+ """
+
+ @abstractmethod
+ def is_running(self):
+ raise NotImplementedError
+
+ @abstractmethod
+ def start(self, workers=1, max_queue_size=10):
+ """Starts the handler's workers.
+
+ Arguments:
+ workers: number of worker threads
+ max_queue_size: queue size
+ (when full, threads could block on `put()`).
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def stop(self, timeout=None):
+ """Stop running threads and wait for them to exit, if necessary.
+
+ Should be called by the same thread which called start().
+
+ Arguments:
+ timeout: maximum time to wait on thread.join()
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def get(self):
+ """Creates a generator to extract data from the queue.
+
+ Skip the data if it is `None`.
+
+ Returns:
+ Generator yielding tuples `(inputs, targets)`
+ or `(inputs, targets, sample_weights)`.
+ """
+ raise NotImplementedError
+
+
+class OrderedEnqueuer(SequenceEnqueuer):
+ """Builds a Enqueuer from a Sequence.
+
+ Used in `fit_generator`, `evaluate_generator`, `predict_generator`.
+
+ Arguments:
+ sequence: A `keras.utils.data_utils.Sequence` object.
+ use_multiprocessing: use multiprocessing if True, otherwise threading
+ scheduling: sequential querying of data if 'sequential', random
+ otherwise.
+ """
+
+ def __init__(self,
+ sequence,
+ use_multiprocessing=False,
+ scheduling='sequential'):
+ self.manager = HolderManager()
+ self.manager.start()
+ self.sequence = self.manager.Holder(sequence)
+ self.use_multiprocessing = use_multiprocessing
+ self.scheduling = scheduling
+ self.workers = 0
+ self.executor = None
+ self.queue = None
+ self.run_thread = None
+ self.stop_signal = None
+
+ def is_running(self):
+ return self.stop_signal is not None and not self.stop_signal.is_set()
+
+ def start(self, workers=1, max_queue_size=10):
+ """Start the handler's workers.
+
+ Arguments:
+ workers: number of worker threads
+ max_queue_size: queue size
+ (when full, workers could block on `put()`)
+ """
+ if self.use_multiprocessing:
+ self.executor = multiprocessing.Pool(workers)
+ else:
+ self.executor = ThreadPool(workers)
+ self.queue = queue.Queue(max_queue_size)
+ self.stop_signal = threading.Event()
+ self.run_thread = threading.Thread(target=self._run)
+ self.run_thread.daemon = True
+ self.run_thread.start()
+
+ def _run(self):
+ """Submits requests to the executor and queues the `Future` objects."""
+ sequence = list(range(len(self.sequence)))
+ while True:
+ if self.scheduling != 'sequential':
+ random.shuffle(sequence)
+ for i in sequence:
+ if self.stop_signal.is_set():
+ return
+ self.queue.put(
+ self.executor.apply_async(get_index, (self.sequence, i)),
+ block=True)
+
+ def get(self):
+ """Creates a generator to extract data from the queue.
+
+ Skip the data if it is `None`.
+
+ Yields:
+ Tuples (inputs, targets)
+ or (inputs, targets, sample_weights)
+ """
+ try:
+ while self.is_running():
+ inputs = self.queue.get(block=True).get()
+ if inputs is not None:
+ yield inputs
+ except Exception as e:
+ self.stop()
+ raise StopIteration(e)
+
+ def stop(self, timeout=None):
+ """Stops running threads and wait for them to exit, if necessary.
+
+ Should be called by the same thread which called `start()`.
+
+ Arguments:
+ timeout: maximum time to wait on `thread.join()`
+ """
+ self.stop_signal.set()
+ with self.queue.mutex:
+ self.queue.queue.clear()
+ self.queue.unfinished_tasks = 0
+ self.queue.not_full.notify()
+ self.executor.close()
+ self.executor.join()
+ self.run_thread.join(timeout)
+
+
+class GeneratorEnqueuer(SequenceEnqueuer):
+ """Builds a queue out of a data generator.
+
+ Used in `fit_generator`, `evaluate_generator`, `predict_generator`.
+
+ Arguments:
+ generator: a generator function which endlessly yields data
+ use_multiprocessing: use multiprocessing if True, otherwise threading
+ wait_time: time to sleep in-between calls to `put()`
+ random_seed: Initial seed for workers,
+ will be incremented by one for each worker.
+ """
+
+ def __init__(self,
+ generator,
+ use_multiprocessing=False,
+ wait_time=0.05,
+ random_seed=None):
+ self.wait_time = wait_time
+ self._generator = generator
+ self._use_multiprocessing = use_multiprocessing
+ self._threads = []
+ self._stop_event = None
+ self.queue = None
+ self.random_seed = random_seed
+
+ def start(self, workers=1, max_queue_size=10):
+ """Kicks off threads which add data from the generator into the queue.
+
+ Arguments:
+ workers: number of worker threads
+ max_queue_size: queue size
+ (when full, threads could block on `put()`)
+ """
+
+ def data_generator_task():
+ while not self._stop_event.is_set():
+ try:
+ if self._use_multiprocessing or self.queue.qsize() < max_queue_size:
+ generator_output = next(self._generator)
+ self.queue.put(generator_output)
+ else:
+ time.sleep(self.wait_time)
+ except Exception:
+ self._stop_event.set()
+ raise
+
+ try:
+ if self._use_multiprocessing:
+ self.queue = multiprocessing.Queue(maxsize=max_queue_size)
+ self._stop_event = multiprocessing.Event()
+ else:
+ self.queue = queue.Queue()
+ self._stop_event = threading.Event()
+
+ for _ in range(workers):
+ if self._use_multiprocessing:
+ # Reset the random seed, otherwise all child processes
+ # would share the same seed.
+ np.random.seed(self.random_seed)
+ thread = multiprocessing.Process(target=data_generator_task)
+ thread.daemon = True
+ if self.random_seed is not None:
+ self.random_seed += 1
+ else:
+ thread = threading.Thread(target=data_generator_task)
+ self._threads.append(thread)
+ thread.start()
+ except:
+ self.stop()
+ raise
+
+ def is_running(self):
+ return self._stop_event is not None and not self._stop_event.is_set()
+
+ def stop(self, timeout=None):
+ """Stops running threads and wait for them to exit, if necessary.
+
+ Should be called by the same thread which called `start()`.
+
+ Arguments:
+ timeout: maximum time to wait on `thread.join()`.
+ """
+ if self.is_running():
+ self._stop_event.set()
+
+ for thread in self._threads:
+ if thread.is_alive():
+ if self._use_multiprocessing:
+ thread.terminate()
+ else:
+ thread.join(timeout)
+
+ if self._use_multiprocessing:
+ if self.queue is not None:
+ self.queue.close()
+
+ self._threads = []
+ self._stop_event = None
+ self.queue = None
+
+ def get(self):
+ """Creates a generator to extract data from the queue.
+
+ Skip the data if it is `None`.
+
+ Yields:
+ Data arrays.
+ """
+ while self.is_running():
+ if not self.queue.empty():
+ inputs = self.queue.get()
+ if inputs is not None:
+ yield inputs
+ else:
+ time.sleep(self.wait_time)
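Taken together, the new `Sequence` and `OrderedEnqueuer` are meant to be driven from a loop that pulls batches off the generator returned by `get()`, mirroring the pattern exercised by the tests below. A minimal sketch of that pattern, assuming the illustrative `DummySequence` class (not part of this change):

```python
import numpy as np

from tensorflow.contrib.keras.python.keras.utils.data_utils import OrderedEnqueuer
from tensorflow.contrib.keras.python.keras.utils.data_utils import Sequence


class DummySequence(Sequence):
  """Illustrative Sequence whose batches are filled with their own index."""

  def __init__(self, num_batches, batch_shape):
    self.num_batches = num_batches
    self.batch_shape = batch_shape

  def __len__(self):
    return self.num_batches

  def __getitem__(self, idx):
    return np.full(self.batch_shape, idx, dtype=np.float32)


enqueuer = OrderedEnqueuer(DummySequence(20, (4, 8)), use_multiprocessing=False)
enqueuer.start(workers=2, max_queue_size=10)
batches = enqueuer.get()  # infinite generator over the Sequence
collected = [next(batches) for _ in range(20)]  # batches arrive in order
enqueuer.stop()
```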
diff --git a/tensorflow/contrib/keras/python/keras/utils/data_utils_test.py b/tensorflow/contrib/keras/python/keras/utils/data_utils_test.py
new file mode 100644
index 0000000000..7b73775f46
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/data_utils_test.py
@@ -0,0 +1,172 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for data_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from itertools import cycle
+import threading
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class ThreadsafeIter(object):
+
+ def __init__(self, it):
+ self.it = it
+ self.lock = threading.Lock()
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ return self.next()
+
+ def next(self):
+ with self.lock:
+ return next(self.it)
+
+
+def threadsafe_generator(f):
+
+ def g(*a, **kw):
+ return ThreadsafeIter(f(*a, **kw))
+
+ return g
+
+
+class TestSequence(keras.utils.data_utils.Sequence):
+
+ def __init__(self, shape):
+ self.shape = shape
+
+ def __getitem__(self, item):
+ return np.ones(self.shape, dtype=np.uint8) * item
+
+ def __len__(self):
+ return 100
+
+
+class FaultSequence(keras.utils.data_utils.Sequence):
+
+ def __getitem__(self, item):
+ raise IndexError(item, 'item is not present')
+
+ def __len__(self):
+ return 100
+
+
+@threadsafe_generator
+def create_generator_from_sequence_threads(ds):
+ for i in cycle(range(len(ds))):
+ yield ds[i]
+
+
+def create_generator_from_sequence_pcs(ds):
+ for i in cycle(range(len(ds))):
+ yield ds[i]
+
+
+class TestEnqueuers(test.TestCase):
+
+ def test_generator_enqueuer_threads(self):
+ enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
+ create_generator_from_sequence_threads(TestSequence([3, 200, 200, 3])),
+ use_multiprocessing=False)
+ enqueuer.start(3, 10)
+ gen_output = enqueuer.get()
+ acc = []
+ for _ in range(100):
+ acc.append(int(next(gen_output)[0, 0, 0, 0]))
+
+ self.assertEqual(len(set(acc) - set(range(100))), 0)
+ enqueuer.stop()
+
+ def test_generator_enqueuer_processes(self):
+ enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
+ create_generator_from_sequence_pcs(TestSequence([3, 200, 200, 3])),
+ use_multiprocessing=True)
+ enqueuer.start(3, 10)
+ gen_output = enqueuer.get()
+ acc = []
+ for _ in range(100):
+ acc.append(int(next(gen_output)[0, 0, 0, 0]))
+ self.assertNotEqual(acc, list(range(100)))
+ enqueuer.stop()
+
+ def test_generator_enqueuer_fail_threads(self):
+ enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
+ create_generator_from_sequence_threads(FaultSequence()),
+ use_multiprocessing=False)
+ enqueuer.start(3, 10)
+ gen_output = enqueuer.get()
+ with self.assertRaises(StopIteration):
+ next(gen_output)
+
+ def test_generator_enqueuer_fail_processes(self):
+ enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
+ create_generator_from_sequence_pcs(FaultSequence()),
+ use_multiprocessing=True)
+ enqueuer.start(3, 10)
+ gen_output = enqueuer.get()
+ with self.assertRaises(StopIteration):
+ next(gen_output)
+
+ def test_ordered_enqueuer_threads(self):
+ enqueuer = keras.utils.data_utils.OrderedEnqueuer(
+ TestSequence([3, 200, 200, 3]), use_multiprocessing=False)
+ enqueuer.start(3, 10)
+ gen_output = enqueuer.get()
+ acc = []
+ for _ in range(100):
+ acc.append(next(gen_output)[0, 0, 0, 0])
+ self.assertEqual(acc, list(range(100)))
+ enqueuer.stop()
+
+ def test_ordered_enqueuer_processes(self):
+ enqueuer = keras.utils.data_utils.OrderedEnqueuer(
+ TestSequence([3, 200, 200, 3]), use_multiprocessing=True)
+ enqueuer.start(3, 10)
+ gen_output = enqueuer.get()
+ acc = []
+ for _ in range(100):
+ acc.append(next(gen_output)[0, 0, 0, 0])
+ self.assertEqual(acc, list(range(100)))
+ enqueuer.stop()
+
+ def test_ordered_enqueuer_fail_threads(self):
+ enqueuer = keras.utils.data_utils.OrderedEnqueuer(
+ FaultSequence(), use_multiprocessing=False)
+ enqueuer.start(3, 10)
+ gen_output = enqueuer.get()
+ with self.assertRaises(StopIteration):
+ next(gen_output)
+
+ def test_ordered_enqueuer_fail_processes(self):
+ enqueuer = keras.utils.data_utils.OrderedEnqueuer(
+ FaultSequence(), use_multiprocessing=True)
+ enqueuer.start(3, 10)
+ gen_output = enqueuer.get()
+ with self.assertRaises(StopIteration):
+ next(gen_output)
+
+
+if __name__ == '__main__':
+ test.main()
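The same enqueuer API also wraps plain Python generators, as the tests above exercise. A rough sketch of standalone `GeneratorEnqueuer` usage, with the `batch_generator` function being purely illustrative:

```python
import numpy as np

from tensorflow.contrib.keras.python.keras.utils.data_utils import GeneratorEnqueuer


def batch_generator(batch_shape):
  """Endless stream of random batches (illustrative only)."""
  while True:
    yield np.random.random(batch_shape)


enqueuer = GeneratorEnqueuer(batch_generator((4, 8)), use_multiprocessing=False)
enqueuer.start(workers=1, max_queue_size=10)
stream = enqueuer.get()
batches = [next(stream) for _ in range(5)]
enqueuer.stop()
```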
diff --git a/tensorflow/contrib/keras/python/keras/utils/generic_utils.py b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
index 5cae694d54..ed57144f9c 100644
--- a/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
@@ -227,6 +227,24 @@ def func_load(code, defaults=None, closure=None, globs=None):
code, globs, name=code.co_name, argdefs=defaults, closure=closure)
+def has_arg(fn, name, accept_all=False):
+ """Checks if a callable accepts a given keyword argument.
+
+ Arguments:
+ fn: Callable to inspect.
+ name: Check if `fn` can be called with `name` as a keyword argument.
+ accept_all: What to return if there is no parameter called `name`
+ but the function accepts a `**kwargs` argument.
+
+ Returns:
+ bool, whether `fn` accepts a `name` keyword argument.
+ """
+ arg_spec = tf_inspect.getargspec(fn)
+ if accept_all and arg_spec.keywords is not None:
+ return True
+ return name in arg_spec.args
+
+
class Progbar(object):
"""Displays a progress bar.
diff --git a/tensorflow/contrib/keras/python/keras/utils/generic_utils_test.py b/tensorflow/contrib/keras/python/keras/utils/generic_utils_test.py
new file mode 100644
index 0000000000..8a6519f4cc
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/generic_utils_test.py
@@ -0,0 +1,75 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras generic Python utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class HasArgTest(test.TestCase):
+
+ def test_has_arg(self):
+
+ def f_x(x):
+ return x
+
+ def f_x_args(x, *args):
+ _ = args
+ return x
+
+ def f_x_kwargs(x, **kwargs):
+ _ = kwargs
+ return x
+
+ self.assertTrue(keras.utils.generic_utils.has_arg(
+ f_x, 'x', accept_all=False))
+ self.assertFalse(keras.utils.generic_utils.has_arg(
+ f_x, 'y', accept_all=False))
+ self.assertTrue(keras.utils.generic_utils.has_arg(
+ f_x_args, 'x', accept_all=False))
+ self.assertFalse(keras.utils.generic_utils.has_arg(
+ f_x_args, 'y', accept_all=False))
+ self.assertTrue(keras.utils.generic_utils.has_arg(
+ f_x_kwargs, 'x', accept_all=False))
+ self.assertFalse(keras.utils.generic_utils.has_arg(
+ f_x_kwargs, 'y', accept_all=False))
+ self.assertTrue(keras.utils.generic_utils.has_arg(
+ f_x_kwargs, 'y', accept_all=True))
+
+
+class TestCustomObjectScope(test.TestCase):
+
+ def test_custom_object_scope(self):
+
+ def custom_fn():
+ pass
+
+ class CustomClass(object):
+ pass
+
+ with keras.utils.generic_utils.custom_object_scope(
+ {'CustomClass': CustomClass, 'custom_fn': custom_fn}):
+ act = keras.activations.get('custom_fn')
+ self.assertEqual(act, custom_fn)
+ cl = keras.regularizers.get('CustomClass')
+ self.assertEqual(cl.__class__, CustomClass)
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/keras/python/keras/utils/io_utils.py b/tensorflow/contrib/keras/python/keras/utils/io_utils.py
index 55c135b5eb..70b2d96907 100644
--- a/tensorflow/contrib/keras/python/keras/utils/io_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/io_utils.py
@@ -113,8 +113,40 @@ class HDF5Matrix(object):
@property
def shape(self):
+ """Gets a numpy-style shape tuple giving the dataset dimensions.
+
+ Returns:
+ A numpy-style shape tuple.
+ """
return (self.end - self.start,) + self.data.shape[1:]
+ @property
+ def dtype(self):
+ """Gets the datatype of the dataset.
+
+ Returns:
+ A numpy dtype string.
+ """
+ return self.data.dtype
+
+ @property
+ def ndim(self):
+ """Gets the number of dimensions (rank) of the dataset.
+
+ Returns:
+ An integer denoting the number of dimensions (rank) of the dataset.
+ """
+ return self.data.ndim
+
+ @property
+ def size(self):
+ """Gets the total dataset size (number of elements).
+
+ Returns:
+ An integer denoting the number of elements in the dataset.
+ """
+ return np.prod(self.shape)
+
def ask_to_proceed_with_overwrite(filepath):
"""Produces a prompt asking about overwriting a file.
diff --git a/tensorflow/contrib/keras/python/keras/utils/layer_utils.py b/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
index 154070fb93..1c3481fdb8 100644
--- a/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
@@ -24,15 +24,24 @@ from tensorflow.contrib.keras.python.keras import backend as K
from tensorflow.contrib.keras.python.keras.utils.conv_utils import convert_kernel
-def print_summary(model, line_length=None, positions=None):
+def print_summary(model, line_length=None, positions=None, print_fn=None):
"""Prints a summary of a model.
Arguments:
model: Keras model instance.
- line_length: total length of printed lines
- positions: relative or absolute positions of log elements in each line.
+ line_length: Total length of printed lines
+ (e.g. set this to adapt the display to different
+ terminal window sizes).
+ positions: Relative or absolute positions of log elements in each line.
If not provided, defaults to `[.33, .55, .67, 1.]`.
+ print_fn: Print function to use (defaults to `print`).
+ It will be called on each line of the summary.
+ You can set it to a custom function
+ in order to capture the string summary.
"""
+ if print_fn is None:
+ print_fn = print
+
if model.__class__.__name__ == 'Sequential':
sequential_like = True
else:
@@ -70,11 +79,11 @@ def print_summary(model, line_length=None, positions=None):
line += str(fields[i])
line = line[:positions[i]]
line += ' ' * (positions[i] - len(line))
- print(line)
+ print_fn(line)
- print('_' * line_length)
+ print_fn('_' * line_length)
print_row(to_display, positions)
- print('=' * line_length)
+ print_fn('=' * line_length)
def print_layer_summary(layer):
try:
@@ -131,19 +140,19 @@ def print_summary(model, line_length=None, positions=None):
else:
print_layer_summary_with_connections(layers[i])
if i == len(layers) - 1:
- print('=' * line_length)
+ print_fn('=' * line_length)
else:
- print('_' * line_length)
+ print_fn('_' * line_length)
trainable_count = int(
np.sum([K.count_params(p) for p in set(model.trainable_weights)]))
non_trainable_count = int(
np.sum([K.count_params(p) for p in set(model.non_trainable_weights)]))
- print('Total params: {:,}'.format(trainable_count + non_trainable_count))
- print('Trainable params: {:,}'.format(trainable_count))
- print('Non-trainable params: {:,}'.format(non_trainable_count))
- print('_' * line_length)
+ print_fn('Total params: {:,}'.format(trainable_count + non_trainable_count))
+ print_fn('Trainable params: {:,}'.format(trainable_count))
+ print_fn('Non-trainable params: {:,}'.format(non_trainable_count))
+ print_fn('_' * line_length)
def convert_all_kernels_in_model(model):
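The new `print_fn` hook makes it straightforward to capture a model summary as a string instead of writing it to stdout. A minimal sketch, with the toy model being illustrative:

```python
from tensorflow.contrib.keras.python import keras
from tensorflow.contrib.keras.python.keras.utils.layer_utils import print_summary

# Toy model (illustrative); any built Keras model would do.
model = keras.models.Sequential()
model.add(keras.layers.Dense(4, input_shape=(8,)))

# Collect the summary lines instead of printing them.
lines = []
print_summary(model, print_fn=lines.append)
summary_text = '\n'.join(lines)
```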