34 files changed, 2195 insertions, 798 deletions
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 4056818a95..df409d2aa5 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -704,6 +704,17 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "training_gpu_test",
+    size = "small",
+    srcs = ["engine/training_gpu_test.py"],
+    additional_deps = [
+        ":keras",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "imagenet_utils_test",
     size = "small",
@@ -721,7 +732,6 @@ py_test(
     size = "medium",
     srcs = ["preprocessing/image_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["nomsan"],  # TODO(b/110990716) reenable
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -793,6 +803,19 @@ py_test(
 )
 
 py_test(
+    name = "training_utils_test",
+    size = "medium",
+    srcs = ["engine/training_utils_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
     name = "model_subclassing_test",
     size = "medium",
     srcs = ["model_subclassing_test.py"],
diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py
index f608dea430..99645de736 100644
--- a/tensorflow/python/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -128,20 +128,26 @@ def softsign(x):
 
 
 @tf_export('keras.activations.relu')
-def relu(x, alpha=0., max_value=None):
+def relu(x, alpha=0., max_value=None, threshold=0):
   """Rectified Linear Unit.
 
+  With default values, it returns element-wise `max(x, 0)`.
+
+  Otherwise, it follows:
+  `f(x) = max_value` for `x >= max_value`,
+  `f(x) = x` for `threshold <= x < max_value`,
+  `f(x) = alpha * (x - threshold)` otherwise.
+
   Arguments:
-      x: Input tensor.
-      alpha: Slope of the negative part. Defaults to zero.
-      max_value: Maximum value for the output.
+      x: A tensor or variable.
+      alpha: A scalar, slope of negative section (default=`0.`).
+      max_value: float. Saturation threshold.
+      threshold: float. Threshold value for thresholded activation.
 
   Returns:
-      The (leaky) rectified linear unit activation: `x` if `x > 0`,
-        `alpha * x` if `x < 0`. If `max_value` is defined, the result
-        is truncated to this value.
+      A tensor.
   """
-  return K.relu(x, alpha=alpha, max_value=max_value)
+  return K.relu(x, alpha=alpha, max_value=max_value, threshold=threshold)
 
 
 @tf_export('keras.activations.tanh')
diff --git a/tensorflow/python/keras/applications/mobilenet.py b/tensorflow/python/keras/applications/mobilenet.py
index e56c695a28..7285e03963 100644
--- a/tensorflow/python/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/applications/mobilenet.py
@@ -72,13 +72,9 @@ from __future__ import print_function
 import os
 
 from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import constraints
-from tensorflow.python.keras import initializers
-from tensorflow.python.keras import regularizers
 from tensorflow.python.keras.applications import imagenet_utils
 from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.layers import Activation
 from tensorflow.python.keras.layers import BatchNormalization
 from tensorflow.python.keras.layers import Conv2D
@@ -87,10 +83,10 @@ from tensorflow.python.keras.layers import Dropout
 from tensorflow.python.keras.layers import GlobalAveragePooling2D
 from tensorflow.python.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras.layers import Input
+from tensorflow.python.keras.layers import ReLU
 from tensorflow.python.keras.layers import Reshape
 from tensorflow.python.keras.layers import ZeroPadding2D
 from tensorflow.python.keras.models import Model
-from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
@@ -100,10 +96,6 @@ from tensorflow.python.util.tf_export import tf_export
 BASE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.6/'
 
 
-def relu6(x):
-  return K.relu(x, max_value=6)
-
-
 @tf_export('keras.applications.mobilenet.preprocess_input')
 def preprocess_input(x):
   """Preprocesses a numpy array encoding a batch of images.
@@ -130,12 +122,6 @@ def MobileNet(input_shape=None,
               classes=1000):
   """Instantiates the MobileNet architecture.
 
-  To load a MobileNet model via `load_model`, import the custom
-  objects `relu6` and pass them to the `custom_objects` parameter.
-  E.g.
-  model = load_model('mobilenet.h5', custom_objects={
-                     'relu6': mobilenet.relu6})
-
   Arguments:
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
@@ -412,7 +398,7 @@ def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
       strides=strides,
       name='conv1')(x)
   x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
-  return Activation(relu6, name='conv1_relu')(x)
+  return ReLU(6, name='conv1_relu')(x)
 
 
 def _depthwise_conv_block(inputs,
@@ -479,7 +465,7 @@ def _depthwise_conv_block(inputs,
       use_bias=False,
       name='conv_dw_%d' % block_id)(x)
   x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
-  x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)
+  x = ReLU(6, name='conv_dw_%d_relu' % block_id)(x)
 
   x = Conv2D(
       pointwise_conv_filters, (1, 1),
@@ -489,4 +475,4 @@ def _depthwise_conv_block(inputs,
       name='conv_pw_%d' % block_id)(
           x)
   x = BatchNormalization(axis=channel_axis, name='conv_pw_%d_bn' % block_id)(x)
-  return Activation(relu6, name='conv_pw_%d_relu' % block_id)(x)
+  return ReLU(6, name='conv_pw_%d_relu' % block_id)(x)
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index cb3423598b..38794f1612 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -3372,26 +3372,48 @@ def in_test_phase(x, alt, training=None):
 
 
 @tf_export('keras.backend.relu')
-def relu(x, alpha=0., max_value=None):
+def relu(x, alpha=0., max_value=None, threshold=0):
   """Rectified linear unit.
 
   With default values, it returns element-wise `max(x, 0)`.
 
+  Otherwise, it follows:
+  `f(x) = max_value` for `x >= max_value`,
+  `f(x) = x` for `threshold <= x < max_value`,
+  `f(x) = alpha * (x - threshold)` otherwise.
+
   Arguments:
       x: A tensor or variable.
       alpha: A scalar, slope of negative section (default=`0.`).
-      max_value: Saturation threshold.
+      max_value: float. Saturation threshold.
+      threshold: float. Threshold value for thresholded activation.
 
   Returns:
       A tensor.
   """
+  clip_max = max_value is not None
+
   if alpha != 0.:
-    negative_part = nn.relu(-x)
-  x = nn.relu(x)
-  if max_value is not None:
+    if threshold != 0:
+      negative_part = nn.relu(-x + threshold)
+    else:
+      negative_part = nn.relu(-x)
+
+  if threshold != 0:
+    # computes x for x > threshold else 0
+    x = x * math_ops.cast(math_ops.greater(x, threshold), floatx())
+  elif max_value == 6:
+    # if no threshold, then can use nn.relu6 native TF op for performance
+    x = nn.relu6(x)
+    clip_max = False
+  else:
+    x = nn.relu(x)
+
+  if clip_max:
     max_value = _to_tensor(max_value, x.dtype.base_dtype)
     zero = _to_tensor(0., x.dtype.base_dtype)
     x = clip_ops.clip_by_value(x, zero, max_value)
+
   if alpha != 0.:
     alpha = _to_tensor(alpha, x.dtype.base_dtype)
     x -= alpha * negative_part
@@ -3458,7 +3480,7 @@ def softsign(x):
 
 
 @tf_export('keras.backend.categorical_crossentropy')
-def categorical_crossentropy(target, output, from_logits=False):
+def categorical_crossentropy(target, output, from_logits=False, axis=-1):
   """Categorical crossentropy between an output tensor and a target tensor.
 
   Arguments:
@@ -3468,28 +3490,33 @@ def categorical_crossentropy(target, output, from_logits=False):
           case `output` is expected to be the logits).
       from_logits: Boolean, whether `output` is the
           result of a softmax, or is a tensor of logits.
+      axis: Int specifying the channels axis. `axis=-1` corresponds to data
+          format `channels_last', and `axis=1` corresponds to data format
+          `channels_first`.
 
   Returns:
       Output tensor.
+
+  Raises:
+      ValueError: if `axis` is neither -1 nor one of the axes of `output`.
   """
+  rank = len(output.get_shape())
+  axis = axis % rank
   # Note: nn.softmax_cross_entropy_with_logits_v2
   # expects logits, Keras expects probabilities.
   if not from_logits:
     # scale preds so that the class probas of each sample sum to 1
-    output = output / math_ops.reduce_sum(  # pylint: disable=g-no-augmented-assignment
-        output, len(output.get_shape()) - 1, True)
+    output = output / math_ops.reduce_sum(output, axis, True)
     # manual computation of crossentropy
     epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
     output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
-    return -math_ops.reduce_sum(
-        target * math_ops.log(output),
-        axis=len(output.get_shape()) - 1)
+    return -math_ops.reduce_sum(target * math_ops.log(output), axis)
   else:
     return nn.softmax_cross_entropy_with_logits_v2(labels=target, logits=output)
 
 
 @tf_export('keras.backend.sparse_categorical_crossentropy')
-def sparse_categorical_crossentropy(target, output, from_logits=False):
+def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1):
   """Categorical crossentropy with integer targets.
 
   Arguments:
@@ -3499,10 +3526,22 @@ def sparse_categorical_crossentropy(target, output, from_logits=False):
           case `output` is expected to be the logits).
       from_logits: Boolean, whether `output` is the
           result of a softmax, or is a tensor of logits.
+      axis: Int specifying the channels axis. `axis=-1` corresponds to data
+          format `channels_last', and `axis=1` corresponds to data format
+          `channels_first`.
 
   Returns:
       Output tensor.
+
+  Raises:
+      ValueError: if `axis` is neither -1 nor one of the axes of `output`.
   """
+  rank = len(output.get_shape())
+  axis = axis % rank
+  if axis != rank - 1:
+    permutation = list(range(axis)) + list(range(axis + 1, rank)) + [axis]
+    output = array_ops.transpose(output, perm=permutation)
+
   # Note: nn.sparse_softmax_cross_entropy_with_logits
   # expects logits, Keras expects probabilities.
   if not from_logits:
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index 36478ea089..40e7910061 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -23,6 +23,7 @@ import scipy.sparse
 
 from tensorflow.python import keras
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -490,6 +491,66 @@ class BackendLinearAlgebraTest(test.TestCase):
                                        input_shape_a=(4, 7),
                                        input_shape_b=(4, 7))
 
+  def test_relu(self):
+    x = ops.convert_to_tensor([[-4, 0], [2, 7]], 'float32')
+    with self.test_session():
+      # standard relu
+      relu_op = keras.backend.relu(x)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+
+      # alpha
+      relu_op = keras.backend.relu(x, alpha=0.5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[-2, 0], [2, 7]])
+
+      # max_value < some elements
+      relu_op = keras.backend.relu(x, max_value=5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 5]])
+
+      # nn.relu6 used
+      relu_op = keras.backend.relu(x, max_value=6)
+      self.assertTrue('Relu6' in relu_op.name)  # uses tf.nn.relu6
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 6]])
+
+      # max value > 6
+      relu_op = keras.backend.relu(x, max_value=10)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+
+      # max value is float
+      relu_op = keras.backend.relu(x, max_value=4.3)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 4.3]])
+
+      # max value == 0
+      relu_op = keras.backend.relu(x, max_value=0)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 0]])
+
+      # alpha and max_value
+      relu_op = keras.backend.relu(x, alpha=0.25, max_value=3)
+      self.assertAllClose(keras.backend.eval(relu_op), [[-1, 0], [2, 3]])
+
+      # threshold
+      relu_op = keras.backend.relu(x, threshold=3)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 7]])
+
+      # threshold is float
+      relu_op = keras.backend.relu(x, threshold=1.5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+
+      # threshold is negative
+      relu_op = keras.backend.relu(x, threshold=-5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[-4, 0], [2, 7]])
+
+      # threshold and max_value
+      relu_op = keras.backend.relu(x, threshold=3, max_value=5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 5]])
+
+      # threshold and alpha
+      relu_op = keras.backend.relu(x, alpha=0.25, threshold=4)
+      self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 7]])
+
+      # threshold, alpha, and max_value
+      relu_op = keras.backend.relu(x, alpha=0.25, threshold=4, max_value=5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 5]])
+
 
 class BackendShapeOpsTest(test.TestCase):
 
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 5d66db232a..d1b9dc27bd 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -31,13 +31,16 @@ import time
 import numpy as np
 import six
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.engine.training_utils import standardize_input_data
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Variable
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary as tf_summary
+from tensorflow.python.training import saver
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -644,35 +647,17 @@ class LearningRateScheduler(Callback):
     self.verbose = verbose
 
   def on_epoch_begin(self, epoch, logs=None):
-    # TODO(yashkatariya): Change the property checking when the learning
-    # rate attribute is unified across all TF Optimizers.
-    if isinstance(self.model.optimizer, optimizers.TFOptimizer):
-      if not hasattr(self.model.optimizer.optimizer, '_lr') and not hasattr(
-          self.model.optimizer.optimizer, '_learning_rate'):
-        raise ValueError(
-            'TF Optimizer must have a "_lr" or "_learning_rate" attribute.')
-      else:
-        opt = self.model.optimizer.optimizer
-        if hasattr(opt, '_lr'):
-          opt_lr = Variable(opt._lr)  # pylint: disable=protected-access
-        elif hasattr(opt, '_learning_rate'):
-          opt_lr = Variable(opt._learning_rate)  # pylint: disable=protected-access
-    else:
-      if not hasattr(self.model.optimizer, 'lr'):
-        raise ValueError('Optimizer must have a "lr" attribute.')
-      else:
-        opt = self.model.optimizer
-        opt_lr = opt.lr
-
+    if not hasattr(self.model.optimizer, 'lr'):
+      raise ValueError('Optimizer must have a "lr" attribute.')
     try:  # new API
-      lr = float(K.get_value(opt_lr))
+      lr = float(K.get_value(self.model.optimizer.lr))
       lr = self.schedule(epoch, lr)
     except TypeError:  # Support for old API for backward compatibility
       lr = self.schedule(epoch)
     if not isinstance(lr, (float, np.float32, np.float64)):
       raise ValueError('The output of the "schedule" function '
                        'should be float.')
-    K.set_value(opt_lr, lr)
+    K.set_value(self.model.optimizer.lr, lr)
     if self.verbose > 0:
       print('\nEpoch %05d: LearningRateScheduler reducing learning '
             'rate to %s.' % (epoch + 1, lr))
@@ -717,7 +702,9 @@ class TensorBoard(Callback):
       write_images: whether to write model weights to visualize as
           image in TensorBoard.
       embeddings_freq: frequency (in epochs) at which selected embedding
-          layers will be saved.
+          layers will be saved. If set to 0, embeddings won't be computed.
+          Data to be visualized in TensorBoard's Embedding tab must be passed
+          as `embeddings_data`.
       embeddings_layer_names: a list of names of layers to keep eye on. If
           None or empty list all the embedding layer will be watched.
       embeddings_metadata: a dictionary which maps layer name to a file name
@@ -725,6 +712,10 @@ class TensorBoard(Callback):
           [details](https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional)
           about metadata files format. In case if the same metadata file is
           used for all embedding layers, string can be passed.
+      embeddings_data: data to be embedded at layers specified in
+          `embeddings_layer_names`. Numpy array (if the model has a single
+          input) or list of Numpy arrays (if the model has multiple inputs).
+          Learn [more about embeddings](https://www.tensorflow.org/programmers_guide/embedding)
   """
 
   # pylint: enable=line-too-long
@@ -735,7 +726,11 @@ class TensorBoard(Callback):
                batch_size=32,
                write_graph=True,
                write_grads=False,
-               write_images=False):
+               write_images=False,
+               embeddings_freq=0,
+               embeddings_layer_names=None,
+               embeddings_metadata=None,
+               embeddings_data=None):
     super(TensorBoard, self).__init__()
     self.log_dir = log_dir
     self.histogram_freq = histogram_freq
@@ -745,8 +740,13 @@ class TensorBoard(Callback):
     self.write_images = write_images
     self.batch_size = batch_size
     self._current_batch = 0
+    self._total_batches_seen = 0
     # abstracted writer class to be able to stub for testing
     self._writer_class = tf_summary.FileWriter
+    self.embeddings_freq = embeddings_freq
+    self.embeddings_layer_names = embeddings_layer_names
+    self.embeddings_metadata = embeddings_metadata
+    self.embeddings_data = embeddings_data
 
   def set_model(self, model):
     """Sets Keras model and creates summary ops."""
@@ -798,7 +798,11 @@ class TensorBoard(Callback):
             tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
 
         if hasattr(layer, 'output'):
-          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
+          if isinstance(layer.output, list):
+            for i, output in enumerate(layer.output):
+              tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
+          else:
+            tf_summary.histogram('{}_out'.format(layer.name), layer.output)
     self.merged = tf_summary.merge_all()
 
     if self.write_graph:
@@ -806,12 +810,98 @@ class TensorBoard(Callback):
     else:
       self.writer = self._writer_class(self.log_dir)
 
+    # If both embedding_freq and embeddings_data are available, we will
+    # visualize embeddings.
+    if self.embeddings_freq and self.embeddings_data is not None:
+      self.embeddings_data = standardize_input_data(self.embeddings_data,
+                                                    model.input_names)
+
+      # If embedding_layer_names are not provided, get all of the embedding
+      # layers from the model.
+      embeddings_layer_names = self.embeddings_layer_names
+      if not embeddings_layer_names:
+        embeddings_layer_names = [
+            layer.name
+            for layer in self.model.layers
+            if type(layer).__name__ == 'Embedding'
+        ]
+
+      self.assign_embeddings = []
+      embeddings_vars = {}
+
+      self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
+      self.step = step = array_ops.placeholder(dtypes.int32)
+
+      for layer in self.model.layers:
+        if layer.name in embeddings_layer_names:
+          embedding_input = self.model.get_layer(layer.name).output
+          embedding_size = np.prod(embedding_input.shape[1:])
+          embedding_input = array_ops.reshape(embedding_input,
+                                              (step, int(embedding_size)))
+          shape = (self.embeddings_data[0].shape[0], int(embedding_size))
+          embedding = variables.Variable(
+              array_ops.zeros(shape), name=layer.name + '_embedding')
+          embeddings_vars[layer.name] = embedding
+          batch = state_ops.assign(embedding[batch_id:batch_id + step],
+                                   embedding_input)
+          self.assign_embeddings.append(batch)
+
+      self.saver = saver.Saver(list(embeddings_vars.values()))
+
+      # Create embeddings_metadata dictionary
+      if isinstance(self.embeddings_metadata, str):
+        embeddings_metadata = {
+            layer_name: self.embeddings_metadata
+            for layer_name in embeddings_vars.keys()
+        }
+      else:
+        # If embedding_metadata is already a dictionary
+        embeddings_metadata = self.embeddings_metadata
+
+      try:
+        from tensorboard.plugins import projector
+      except ImportError:
+        raise ImportError('Failed to import TensorBoard. Please make sure that '
+                          'TensorBoard integration is complete."')
+
+      # TODO(psv): Add integration tests to test embedding visualization
+      # with TensorBoard callback. We are unable to write a unit test for this
+      # because TensorBoard dependency assumes TensorFlow package is installed.
+      config = projector.ProjectorConfig()
+      for layer_name, tensor in embeddings_vars.items():
+        embedding = config.embeddings.add()
+        embedding.tensor_name = tensor.name
+
+        if (embeddings_metadata is not None and
+            layer_name in embeddings_metadata):
+          embedding.metadata_path = embeddings_metadata[layer_name]
+
+      projector.visualize_embeddings(self.writer, config)
+
   def _fetch_callback(self, summary):
     self.writer.add_summary(
         summary,
         self._epoch + self._current_val_batch / self._validation_batches)
     self._current_val_batch += 1
 
+  def _write_custom_summaries(self, step, logs=None):
+    """Writes metrics out as custom scalar summaries.
+
+    Arguments:
+        step: the global step to use for Tensorboard.
+        logs: dict. Keys are scalar summary names, values are
+            NumPy scalars.
+
+    """
+    logs = logs or {}
+    for name, value in logs.items():
+      summary = tf_summary.Summary()
+      summary_value = summary.value.add()
+      summary_value.simple_value = value.item()
+      summary_value.tag = name
+      self.writer.add_summary(summary, step)
+    self.writer.flush()
+
   def on_train_begin(self, logs=None):
     """Checks if histogram summaries can be run."""
 
@@ -828,6 +918,16 @@ class TensorBoard(Callback):
         raise ValueError(
             'If printing histograms, validation data must have length > 0.')
 
+  def on_batch_end(self, batch, logs=None):
+    """Writes scalar summaries for metrics on every training batch."""
+    # Don't output batch_size and batch number as Tensorboard summaries
+    logs = logs or {}
+    batch_logs = {('batch_' + k): v
+                  for k, v in logs.items()
+                  if k not in ['batch', 'size']}
+    self._write_custom_summaries(self._total_batches_seen, batch_logs)
+    self._total_batches_seen += 1
+
   def on_epoch_begin(self, epoch, logs=None):
     """Add histogram op to Model test_function callbacks, reset batch count."""
 
@@ -844,7 +944,12 @@ class TensorBoard(Callback):
   def on_epoch_end(self, epoch, logs=None):
     """Checks if summary ops should run next epoch, logs scalar summaries."""
 
-    logs = logs or {}
+    # don't output batch_size and
+    # batch number as Tensorboard summaries
+    logs = {('epoch_' + k): v
+            for k, v in logs.items()
+            if k not in ['batch', 'size']}
+    self._write_custom_summaries(epoch, logs)
 
     # pop the histogram summary op after each epoch
     if self.histogram_freq:
@@ -853,15 +958,45 @@ class TensorBoard(Callback):
       if self.merged in self.model.test_function.fetch_callbacks:
         self.model.test_function.fetch_callbacks.pop(self.merged)
 
-    for name, value in logs.items():
-      if name in ['batch', 'size']:
-        continue
-      summary = tf_summary.Summary()
-      summary_value = summary.value.add()
-      summary_value.simple_value = value.item()
-      summary_value.tag = name
-      self.writer.add_summary(summary, epoch)
-    self.writer.flush()
+    if self.embeddings_data is None and self.embeddings_freq:
+      raise ValueError('To visualize embeddings, embeddings_data must '
+                       'be provided.')
+
+    if self.embeddings_freq and self.embeddings_data is not None:
+      if epoch % self.embeddings_freq == 0:
+        # We need a second forward-pass here because we're passing
+        # the `embeddings_data` explicitly. This design allows to pass
+        # arbitrary data as `embeddings_data` and results from the fact
+        # that we need to know the size of the `tf.Variable`s which
+        # hold the embeddings in `set_model`. At this point, however,
+        # the `validation_data` is not yet set.
+
+        embeddings_data = self.embeddings_data
+        n_samples = embeddings_data[0].shape[0]
+        i = 0
+        while i < n_samples:
+          step = min(self.batch_size, n_samples - i)
+          batch = slice(i, i + step)
+
+          if isinstance(self.model.input, list):
+            feed_dict = {
+                model_input: embeddings_data[idx][batch]
+                for idx, model_input in enumerate(self.model.input)
+            }
+          else:
+            feed_dict = {self.model.input: embeddings_data[0][batch]}
+
+          feed_dict.update({self.batch_id: i, self.step: step})
+
+          if self.model.uses_learning_phase:
+            feed_dict[K.learning_phase()] = False
+
+          self.sess.run(self.assign_embeddings, feed_dict=feed_dict)
+          self.saver.save(self.sess,
+                          os.path.join(self.log_dir, 'keras_embedding.ckpt'),
+                          epoch)
+
+          i += self.batch_size
 
   def on_train_end(self, logs=None):
     self.writer.close()
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index 244d48591c..7d830078ce 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -29,16 +29,10 @@ import numpy as np
 
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python import keras
-from tensorflow.python.eager import context
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
-from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Variable
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training.adam import AdamOptimizer
-from tensorflow.python.training.gradient_descent import GradientDescentOptimizer
-
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -376,76 +370,6 @@ class KerasCallbacksTest(test.TestCase):
           float(keras.backend.get_value(
               model.optimizer.lr)) - 0.01 / 4) < keras.backend.epsilon()
 
-  @test_util.run_in_graph_and_eager_modes
-  def test_TF_LearningRateScheduler_Adam(self):
-    with self.test_session():
-      with context.eager_mode():
-        np.random.seed(1337)
-        (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-            train_samples=TRAIN_SAMPLES,
-            test_samples=TEST_SAMPLES,
-            input_shape=(INPUT_DIM,),
-            num_classes=NUM_CLASSES)
-        y_test = keras.utils.to_categorical(y_test)
-        y_train = keras.utils.to_categorical(y_train)
-        model = keras.models.Sequential()
-        model.add(
-            keras.layers.Dense(
-                NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-        model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-        model.compile(
-            loss='categorical_crossentropy',
-            optimizer=AdamOptimizer(),
-            metrics=['accuracy'])
-        cbks = [keras.callbacks.LearningRateScheduler(lambda x: 1. / (1. + x))]
-        model.fit(
-            x_train,
-            y_train,
-            batch_size=BATCH_SIZE,
-            validation_data=(x_test, y_test),
-            callbacks=cbks,
-            epochs=5,
-            verbose=0)
-        opt_lr = model.optimizer.optimizer._lr
-        self.assertLess(
-            float(keras.backend.get_value(
-                Variable(opt_lr))) - 0.2, keras.backend.epsilon())
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_TF_LearningRateScheduler_GradientDescent(self):
-    with self.test_session():
-      with context.eager_mode():
-        np.random.seed(1337)
-        (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-            train_samples=TRAIN_SAMPLES,
-            test_samples=TEST_SAMPLES,
-            input_shape=(INPUT_DIM,),
-            num_classes=NUM_CLASSES)
-        y_test = keras.utils.to_categorical(y_test)
-        y_train = keras.utils.to_categorical(y_train)
-        model = keras.models.Sequential()
-        model.add(
-            keras.layers.Dense(
-                NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-        model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-        model.compile(
-            loss='categorical_crossentropy',
-            optimizer=GradientDescentOptimizer(1e-3),
-            metrics=['accuracy'])
-        cbks = [keras.callbacks.LearningRateScheduler(lambda x: 1. / (1. + x))]
-        model.fit(
-            x_train,
-            y_train,
-            batch_size=BATCH_SIZE,
-            validation_data=(x_test, y_test),
-            callbacks=cbks,
-            epochs=5,
-            verbose=0)
-        opt_lr = model.optimizer.optimizer._learning_rate
-        self.assertLess(
-            float(keras.backend.get_value(
-                Variable(opt_lr))) - 0.2, keras.backend.epsilon())
-
   def test_ReduceLROnPlateau(self):
     with self.test_session():
       np.random.seed(1337)
@@ -1172,6 +1096,74 @@ class KerasCallbacksTest(test.TestCase):
 
       assert os.path.exists(temp_dir)
 
+  def test_Tensorboard_batch_logging(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.batches_logged = []
+        self.summary_values = []
+        self.summary_tags = []
+
+      def add_summary(self, summary, step):
+        self.summary_values.append(summary.value[0].simple_value)
+        self.summary_tags.append(summary.value[0].tag)
+        self.batches_logged.append(step)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    logdir = 'fake_dir'
+
+    # log every batch
+    tb_cbk = keras.callbacks.TensorBoard(logdir)
+    tb_cbk.writer = FileWriterStub(logdir)
+
+    for batch in range(5):
+      tb_cbk.on_batch_end(batch, {'acc': np.float32(batch)})
+    self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4])
+    self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.])
+    self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5)
+
+  def test_Tensorboard_epoch_and_batch_logging(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+
+      def add_summary(self, summary, step):
+        if 'batch_' in summary.value[0].tag:
+          self.batch_summary = (step, summary)
+        elif 'epoch_' in summary.value[0].tag:
+          self.epoch_summary = (step, summary)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    logdir = 'fake_dir'
+
+    tb_cbk = keras.callbacks.TensorBoard(logdir)
+    tb_cbk.writer = FileWriterStub(logdir)
+
+    tb_cbk.on_batch_end(0, {'acc': np.float32(5.0)})
+    tb_cbk.on_epoch_end(0, {'acc': np.float32(10.0)})
+    batch_step, batch_summary = tb_cbk.writer.batch_summary
+    self.assertEqual(batch_step, 0)
+    self.assertEqual(batch_summary.value[0].simple_value, 5.0)
+    epoch_step, epoch_summary = tb_cbk.writer.epoch_summary
+    self.assertEqual(epoch_step, 0)
+    self.assertEqual(epoch_summary.value[0].simple_value, 10.0)
+
   def test_RemoteMonitorWithJsonPayload(self):
     if requests is None:
       self.skipTest('`requests` required to run this test')
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index e02792208b..b41f6ee03b 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -723,9 +723,17 @@ class Layer(checkpointable.CheckpointableBase):
             self._dtype = input_list[0].dtype.base_dtype.name
           except AttributeError:
             pass
+
         if all(hasattr(x, 'shape') for x in input_list):
           input_shapes = nest.map_structure(lambda x: x.shape, inputs)
-        self.build(input_shapes)
+
+        if (not hasattr(self, '_is_graph_network') or
+            self.__class__.__name__ == 'Sequential'):
+          # Only if self is a layer or an instance of a sequential model do we
+          # need to build it.
+          self.build(input_shapes)
+        # We must set self.built since user defined build functions are not
+        # constrained to set self.built.
         self.built = True
 
       # Check input assumptions set after layer building, e.g. input shape.
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index a4d96de74f..752e9963ca 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -318,8 +318,8 @@ class Network(base_layer.Layer):
     else:
       self._expects_training_arg = False
     self._call_convention = self._determine_call_convention(call_argspec)
-    self.outputs = None
-    self.inputs = None
+    self.outputs = []
+    self.inputs = []
     self.built = False
 
   def _determine_call_convention(self, call_argspec):
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 371504a503..41cdfda660 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -213,13 +213,31 @@ class Sequential(Model):
       self.outputs = [self.layers[-1].output]
       self.build()
 
-  @checkpointable.no_automatic_dependency_tracking
   def build(self, input_shape=None):
-    if input_shape and not self.inputs:
-      batch_shape = tuple(input_shape)
+    self._set_inputs_and_outputs(input_shape=input_shape)
+
+  def symbolic_set_inputs(self, inputs):
+    self._set_inputs_and_outputs(tensor=inputs)
+
+  @checkpointable.no_automatic_dependency_tracking
+  def _set_inputs_and_outputs(self, input_shape=None, tensor=None):
+    """Set model's input and output specs based on the input received.
+
+    If `tensor` is provided, `input_shape` is not required.
+
+    Args:
+      input_shape: Optional shape of input.
+      tensor: Optional existing tensor to wrap into the `Input` layer.
+    """
+    if not self.inputs:
       dtype = K.floatx()
-      x = Input(
-          batch_shape=batch_shape, dtype=dtype, name=self.name + '_input')
+      if tensor is not None:
+        batch_shape = (None,) + tuple(tensor.get_shape().as_list()[1:])
+        x = Input(dtype=dtype, name=self.name + '_input', tensor=tensor)
+      elif input_shape is not None:
+        batch_shape = tuple(input_shape)
+        x = Input(
+            batch_shape=batch_shape, dtype=dtype, name=self.name + '_input')
       self.inputs = [x]
       for layer in self._layers:
         x = layer(x)
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index 0f54e29cee..4f4adca333 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -22,7 +22,6 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
@@ -104,9 +103,6 @@ class TestSequential(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_sequential_deferred_build_with_dataset_iterators(self):
-    if not context.executing_eagerly():
-      # TODO(psv/fchollet): Add support for this use case in graph mode.
-      return
     num_hidden = 5
     input_dim = 3
     num_classes = 2
@@ -136,6 +132,48 @@ class TestSequential(test.TestCase):
                      [None, num_classes])
     self.assertEqual(len(model.weights), 2 * 2)
 
+  def test_training_and_eval_methods_on_symbolic_tensors(self):
+    with self.test_session():
+
+      def create_model():
+        model = keras.Sequential()
+        model.add(keras.layers.Dense(10, activation='relu'))
+        model.add(keras.layers.Dense(4, activation='softmax'))
+
+        model.compile(
+            optimizer=rmsprop.RMSPropOptimizer(1e-3),
+            loss='categorical_crossentropy',
+            metrics=['accuracy'])
+        return model
+
+      inputs = keras.backend.zeros(shape=(10, 3))
+      targets = keras.backend.zeros(shape=(10, 4))
+
+      model = create_model()
+      model.fit(inputs, targets, epochs=10, steps_per_epoch=30)
+
+      model = create_model()
+      model.evaluate(inputs, targets, steps=2, verbose=0)
+
+      model = create_model()
+      model.predict(inputs, steps=2)
+
+      model = create_model()
+      model.train_on_batch(inputs, targets)
+
+      model = create_model()
+      model.test_on_batch(inputs, targets)
+
+      model = create_model()
+      model.fit(
+          inputs,
+          targets,
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0,
+          validation_data=(inputs, targets),
+          validation_steps=2)
+
   @tf_test_util.run_in_graph_and_eager_modes
   def test_invalid_use_cases(self):
     # Added objects must be layer instances
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index bd03f4871f..4df739254b 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -27,6 +27,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import losses
@@ -43,6 +44,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -217,10 +219,9 @@ class Model(Network):
       for name in self.output_names:
         if name not in loss:
           logging.warning(
-              'Output "' + name + '" missing from loss dictionary. '
-              'We assume this was done on purpose, '
-              'and we will not be expecting '
-              'any data to be passed to "' + name + '" during training.')
+              'Output "' + name + '" missing from loss dictionary. We assume '
+              'this was done on purpose. The fit and evaluate APIs will not be '
+              'expecting any data to be passed to "' + name + '".')
         loss_functions.append(losses.get(loss.get(name)))
     elif isinstance(loss, list):
       if len(loss) != len(self.outputs):
@@ -561,6 +562,95 @@ class Model(Network):
     trainable_weights = self.trainable_weights
     self._collected_trainable_weights = trainable_weights
 
+  def build(self, input_shape):
+    """Build the model based on input shapes received.
+
+    This is to be used for subclassed models, which do not know at instantiation
+    time what their inputs look like.
+
+    Args:
+     input_shape: Single tuple, TensorShape, or list of shapes, where shapes
+         are tuples, integers, or TensorShapes.
+
+    Raises:
+      ValueError:
+        1. In case of invalid user-provided data (not of type tuple,
+           list, or TensorShape).
+        2. If the model requires call arguments that are agnostic
+           to the input shapes (positional or kwarg in call signature).
+        3. If not all layers were properly built.
+        4. If float type inputs are not supported within the layers.
+
+      In each of these cases, the user should build their model by calling it
+      on real tensor data.
+    """
+    if self._is_graph_network:
+      self.built = True
+      return
+
+    # If subclass network
+    if input_shape is None:
+      raise ValueError('Input shape must be defined when calling build on a '
+                       'model subclass network.')
+    valid_types = (tuple, list, tensor_shape.TensorShape)
+    if not isinstance(input_shape, valid_types):
+      raise ValueError('Specified input shape is not one of the valid types. '
+                       'Please specify a batch input shape of type tuple or '
+                       'list of input shapes. User provided '
+                       'input type: {}'.format(type(input_shape)))
+
+    def _generate_dummy_data_from_shape(shape):
+      if isinstance(shape, tensor_shape.TensorShape):
+        shape = shape.as_list()
+
+      # Replace Nones in input shape with dummy `1` value
+      shape = [x.value if isinstance(x, tensor_shape.Dimension) else x
+               for x in shape]
+      shape = [1 if x is None else x for x in shape]
+      return array_ops.ones(shape, dtype=K.floatx())
+
+    if input_shape and not self.inputs:
+      if isinstance(input_shape, list):
+        # List of input shapes
+        x = [_generate_dummy_data_from_shape(shape) for shape in input_shape]
+      else:
+        x = _generate_dummy_data_from_shape(input_shape)
+
+      kwargs = {}
+      num_call_args = len(tf_inspect.getargspec(self.call).args)
+      if self._expects_training_arg and num_call_args == 3:
+        # Has call signature of call(self, input, training)
+        kwargs['training'] = False
+      elif num_call_args > 2:
+        # Has invalid call signature of call(self, input, *args, **kwargs)
+        raise ValueError('Currently, you cannot build your model if it has '
+                         'positional or keyword arguments that are not '
+                         'inputs to the model, but are required for its '
+                         '`call` method. Instead, in order to instantiate '
+                         'and build your model, `call` your model on real '
+                         'tensor data with all expected call arguments.')
+
+      try:
+        self.call(x, **kwargs)
+      except (errors.InvalidArgumentError, TypeError):
+        raise ValueError('You cannot build your model by calling `build` '
+                         'if your layers do not support float type inputs. '
+                         'Instead, in order to instantiate and build your '
+                         'model, `call` your model on real tensor data (of '
+                         'the correct dtype).')
+
+    if self._layers:
+      self._track_layers(self._layers)
+    if self.layers:
+      for layer in self.layers:
+        if not layer.built:
+          raise ValueError('Layer: {} was not built in your model. Calling '
+                           '`build` manually on a subclassed model is only '
+                           'allowed for models with a static topology. '
+                           'In this case, you can build your model by '
+                           'calling it on real tensor data.'.format(layer))
+    self.built = True
+
   def _check_trainable_weights_consistency(self):
     """Check trainable weights count consistency.
 
@@ -897,7 +987,11 @@ class Model(Network):
         for output_shape, loss_fn in zip(self._feed_output_shapes,
                                          self._feed_loss_fns):
           if loss_fn is losses.sparse_categorical_crossentropy:
-            feed_output_shapes.append(output_shape[:-1] + (1,))
+            if K.image_data_format() == 'channels_first':
+              feed_output_shapes.append(
+                  (output_shape[0], 1) + output_shape[2:])
+            else:
+              feed_output_shapes.append(output_shape[:-1] + (1,))
           elif (not hasattr(loss_fn, '__name__') or
                 getattr(losses, loss_fn.__name__, None) is None):
             # If `loss_fn` is not a function (e.g. callable class)
@@ -988,10 +1082,14 @@ class Model(Network):
         inputs = inputs[0]
 
       if tensor_util.is_tensor(inputs):
-        input_shape = (None,) + tuple(inputs.get_shape().as_list()[1:])
+        if context.executing_eagerly():
+          input_shape = (None,) + tuple(inputs.get_shape().as_list()[1:])
+          self.build(input_shape=input_shape)
+        else:
+          self.symbolic_set_inputs(inputs)
       else:
         input_shape = (None,) + inputs.shape[1:]
-      self.build(input_shape=input_shape)
+        self.build(input_shape=input_shape)
     elif context.executing_eagerly():
       self._eager_set_inputs(inputs)
     else:
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index c78684c9f4..397de42985 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -34,7 +34,6 @@ from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import generic_utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -194,7 +193,8 @@ def iterator_fit_loop(model,
                       callbacks=None,
                       callback_metrics=None,
                       validation_steps=None,
-                      do_validation=False):
+                      do_validation=False,
+                      batch_size=None):
   """Fit function for eager execution when input is given as dataset iterator.
 
   Updates the given epoch logs.
@@ -224,16 +224,23 @@ def iterator_fit_loop(model,
       validation_steps: Number of steps to run validation for (only if doing
         validation from data tensors). Ignored with default value of `None`.
       do_validation: Boolean value indicating whether we should do validation.
+      batch_size: int, val_inputs and val_targets will be evaled batch by
+        batch with size batch_size if they are array.
 
   Raises:
       ValueError: In case of mismatch between given number of inputs and
         expectations of the model.
   """
   assert isinstance(inputs, iterator_ops.EagerIterator)
+
+  # make sure either x,y or x,y,sample_weights is provided
+  if (not isinstance(inputs.output_shapes, (list, tuple)) or
+      len(inputs.output_shapes) not in (2, 3)):
+    raise ValueError('Please provide either inputs and targets'
+                     'or inputs, targets, and sample_weights')
+
   for step_index in range(steps_per_epoch):
-    batch_logs = {}
-    batch_logs['batch'] = step_index
-    batch_logs['size'] = 1
+    batch_logs = {'batch': step_index, 'size': 1}
     callbacks.on_batch_begin(step_index, batch_logs)
 
     # Get data from the iterator.
@@ -247,19 +254,21 @@ def iterator_fit_loop(model,
           'batches (in this case, %d batches).' % steps_per_epoch * epochs)
       break
 
-    if not isinstance(next_element, (list, tuple)) or len(next_element) != 2:
-      raise ValueError('Please provide data as a list or tuple of 2 elements '
-                       ' - input and target pair. Received %s' % next_element)
-    x, y = next_element
+    if len(inputs.output_shapes) == 2:
+      x, y = next_element
+      sample_weights = None
+    else:
+      x, y, sample_weights = next_element
 
     # Validate and standardize data.
     x, y, sample_weights = model._standardize_user_data(
-        x, y, class_weight=class_weight)
+        x, y, sample_weight=sample_weights, class_weight=class_weight)
     x = training_utils.cast_if_floating_dtype(x)
     y = training_utils.cast_if_floating_dtype(y)
     if sample_weights:
       sample_weights = [
-          ops.convert_to_tensor(val, dtype=backend.floatx())
+          training_utils.cast_if_floating_dtype(
+              ops.convert_to_tensor(val, dtype=backend.floatx()))
           if val is not None else None for val in sample_weights
       ]
 
@@ -307,122 +316,8 @@ def iterator_fit_loop(model,
             val_targets,
             sample_weights=val_sample_weights,
             steps=validation_steps,
-            verbose=0)
-        if not isinstance(val_outs, list):
-          val_outs = [val_outs]
-        # Same labels assumed.
-        for l, o in zip(out_labels, val_outs):
-          epoch_logs['val_' + l] = o
-
-
-def batch_fit_loop(model,
-                   inputs,
-                   targets,
-                   epoch_logs,
-                   index_array,
-                   out_labels,
-                   callback_model,
-                   batch_size,
-                   sample_weights=None,
-                   val_inputs=None,
-                   val_targets=None,
-                   val_sample_weights=None,
-                   callbacks=None,
-                   shuffle=True,
-                   num_train_samples=None,
-                   do_validation=False):
-  """Fit function for eager execution when input is given as arrays or tensors.
-
-  Updates the given epoch logs.
-
-  Arguments:
-      model: Instance of the `Model`.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      epoch_logs: Dictionary of logs from every epoch.
-      index_array: Index array generated from number of training samples.
-      out_labels: Output labels generated from model metric names.
-      callback_model: Instance of `Model` to callback.
-      batch_size: Integer batch size or None if unknown.
-      sample_weights: Optional list of sample weight arrays.
-      val_inputs: Input data for validation.
-      val_targets: Target data for validation.
-      val_sample_weights: Sample weight data for validation.
-      callbacks: List of callbacks to be called during training.
-      shuffle: Whether to shuffle the data at the beginning of each epoch.
-      num_train_samples: Integer number of training samples.
-      do_validation: Boolean value indicating whether we should do validation.
-  """
-  # TODO(psv): Create a dataset iterator instead of manually creating batches
-  # here and in batch_test_loop, batch_predict_loop.
-  if shuffle == 'batch':
-    index_array = model._batch_shuffle(index_array, batch_size)
-  elif shuffle:
-    np.random.shuffle(index_array)
-
-  batches = generic_utils.make_batches(num_train_samples, batch_size)
-
-  for batch_index, (batch_start, batch_end) in enumerate(batches):
-    batch_ids = index_array[batch_start:batch_end]
-    inputs_batch = slice_arrays(inputs, batch_ids, contiguous=not shuffle)
-    targets_batch = slice_arrays(targets, batch_ids, contiguous=not shuffle)
-    if sample_weights:
-      sample_weights_batch = slice_arrays(
-          sample_weights, batch_ids, contiguous=not shuffle)
-    else:
-      sample_weights_batch = None
-    batch_logs = {}
-    batch_logs['batch'] = batch_index
-    batch_logs['size'] = len(batch_ids)
-
-    callbacks.on_batch_begin(batch_index, batch_logs)
-
-    inputs_batch = [
-        ops.convert_to_tensor(val, dtype=backend.floatx())
-        for val in inputs_batch
-    ]
-    targets_batch = [
-        ops.convert_to_tensor(val, dtype=backend.floatx())
-        for val in targets_batch
-    ]
-    if sample_weights:
-      sample_weights_batch = [
-          ops.convert_to_tensor(val, dtype=backend.floatx())
-          if val is not None else None for val in sample_weights_batch
-      ]
-
-    outs, loss, loss_metrics = _process_single_batch(
-        model,
-        inputs_batch,
-        targets_batch,
-        sample_weights=sample_weights_batch,
-        training=True)
-
-    if not isinstance(outs, list):
-      outs = [outs]
-
-    for l, o in zip(out_labels, outs):
-      batch_logs[l] = o
-    # Required for eager execution
-    metrics_results = _eager_metrics_fn(model, outs, targets_batch)
-    batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
-
-    for k, v in zip(model.metrics_names,
-                    [backend.mean(loss)] + loss_metrics + metrics_results):
-      batch_logs[k] = tensor_util.constant_value(v)
-    callbacks.on_batch_end(batch_index, batch_logs)
-    if callback_model.stop_training:
-      break
-
-    if batch_index == len(batches) - 1:  # Last batch.
-      if do_validation:
-        val_outs = test_loop(
-            model,
-            val_inputs,
-            val_targets,
-            sample_weights=val_sample_weights,
-            batch_size=batch_size,
-            verbose=0)
+            verbose=0,
+            batch_size=batch_size)
         if not isinstance(val_outs, list):
           val_outs = [val_outs]
         # Same labels assumed.
@@ -451,6 +346,11 @@ def iterator_test_loop(model, inputs, steps, verbose=0):
         expectations of the model.
   """
   assert isinstance(inputs, iterator_ops.EagerIterator)
+  # make sure either x,y or x,y,sample_weights is provided
+  if (not isinstance(inputs.output_shapes, (list, tuple)) or
+      len(inputs.output_shapes) < 2 or len(inputs.output_shapes) > 3):
+    raise ValueError('Please provide either inputs and targets'
+                     'or inputs, targets, and sample_weights')
   outs = []
   num_samples = 0
   if verbose == 1:
@@ -466,10 +366,11 @@ def iterator_test_loop(model, inputs, steps, verbose=0):
           '(in this case, %d batches).', steps)
       break
 
-    if not isinstance(next_element, (list, tuple)) or len(next_element) != 2:
-      raise ValueError('Please provide data as a list or tuple of 2 elements '
-                       ' - input and target pair. Received %s' % next_element)
-    x, y = next_element
+    if len(inputs.output_shapes) == 2:
+      x, y = next_element
+      sample_weights = None
+    else:
+      x, y, sample_weights = next_element
 
     # Validate and standardize data.
     x, y, sample_weights = model._standardize_user_data(x, y)
@@ -512,94 +413,6 @@ def iterator_test_loop(model, inputs, steps, verbose=0):
   return outs
 
 
-def batch_test_loop(model,
-                    inputs,
-                    targets,
-                    batch_size,
-                    sample_weights=None,
-                    verbose=0):
-  """Test function for eager execution when input is given as arrays or tensors.
-
-  Arguments:
-      model: Model instance that is being evaluated in Eager mode.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      batch_size: Integer batch size.
-      sample_weights: Optional list of sample weight arrays.
-      verbose: Verbosity mode.
-
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the scalar outputs.
-  """
-  outs = []
-  feed_data = inputs + targets
-  if sample_weights:
-    feed_data += sample_weights
-  num_samples = training_utils.check_num_samples(
-      feed_data, batch_size=batch_size)
-  if verbose == 1:
-    progbar = generic_utils.Progbar(target=num_samples)
-  batches = generic_utils.make_batches(num_samples, batch_size)
-  index_array = np.arange(num_samples)
-  for batch_index, (batch_start, batch_end) in enumerate(batches):
-    batch_ids = index_array[batch_start:batch_end]
-    inputs_batch = slice_arrays(inputs, batch_ids)
-    targets_batch = slice_arrays(targets, batch_ids)
-    if sample_weights:
-      sample_weights_batch = slice_arrays(sample_weights, batch_ids)
-    else:
-      sample_weights_batch = None
-
-    inputs_batch = [
-        ops.convert_to_tensor(val, dtype=backend.floatx())
-        for val in inputs_batch
-    ]
-    targets_batch = [
-        ops.convert_to_tensor(val, dtype=backend.floatx())
-        for val in targets_batch
-    ]
-    if sample_weights:
-      sample_weights_batch = [
-          ops.convert_to_tensor(val, dtype=backend.floatx())
-          if val is not None else None for val in sample_weights_batch
-      ]
-
-    loss_outs, loss, loss_metrics = _model_loss(
-        model,
-        inputs_batch,
-        targets_batch,
-        sample_weights=sample_weights_batch,
-        training=False)
-    metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch)
-    batch_outs = []
-    for _, v in zip(model.metrics_names,
-                    [backend.mean(loss)] + loss_metrics + metrics_results):
-      batch_outs.append(tensor_util.constant_value(v))
-
-    if isinstance(batch_outs, list):
-      if batch_index == 0:
-        for _ in enumerate(batch_outs):
-          outs.append(0.)
-      for i, batch_out in enumerate(batch_outs):
-        outs[i] += batch_out * len(batch_ids)
-    else:
-      if batch_index == 0:
-        outs.append(0.)
-      outs[0] += batch_outs * len(batch_ids)
-
-    if verbose == 1:
-      progbar.update(batch_end)
-
-  for i in range(len(outs)):
-    outs[i] /= num_samples
-  if len(outs) == 1:
-    return outs[0]
-  return outs
-
-
 def iterator_predict_loop(model, inputs, steps, verbose=0):
   """Predict function for eager execution when input is dataset iterator.
 
@@ -619,6 +432,12 @@ def iterator_predict_loop(model, inputs, steps, verbose=0):
         expectations of the model.
   """
   assert isinstance(inputs, iterator_ops.EagerIterator)
+  if not isinstance(inputs.output_shapes,
+                    (list, tuple)) or len(inputs.output_shapes) > 2:
+    raise ValueError(
+        'Please provide data as a list or tuple of 1 or 2 elements '
+        ' - input or input and target pair. Received %s. We do not use the '
+        '`target` value here.' % inputs.output_shapes)
   outs = []
   if verbose == 1:
     progbar = generic_utils.Progbar(target=steps)
@@ -634,12 +453,8 @@ def iterator_predict_loop(model, inputs, steps, verbose=0):
           'batches (in this case, %d batches).', steps)
       break
 
-    if not isinstance(next_element, (list, tuple)) or len(next_element) != 2:
-      raise ValueError(
-          'Please provide data as a list or tuple of 2 elements '
-          ' - input and target pair. Received %s. We do not use the '
-          '`target` value here.' % next_element)
-    x, _ = next_element
+    # expects a tuple, where first element of tuple represents inputs
+    x = next_element[0]
 
     # Validate and standardize data.
     x, _, _ = model._standardize_user_data(x)
@@ -670,99 +485,6 @@ def iterator_predict_loop(model, inputs, steps, verbose=0):
   return outs
 
 
-def batch_predict_loop(model, inputs, batch_size, verbose=0):
-  """Predict function for eager execution when input is arrays or tensors.
-
-  Arguments:
-      model: Instance of `Model`.
-      inputs: List of input arrays.
-      batch_size: Integer batch size.
-      verbose: Verbosity mode.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions (if the model has multiple outputs).
-  """
-  outs = []
-  num_samples = training_utils.check_num_samples(inputs, batch_size)
-  if verbose == 1:
-    progbar = generic_utils.Progbar(target=num_samples)
-  batches = generic_utils.make_batches(num_samples, batch_size)
-  index_array = np.arange(num_samples)
-  for batch_index, (batch_start, batch_end) in enumerate(batches):
-    batch_ids = index_array[batch_start:batch_end]
-    inputs_batch = slice_arrays(inputs, batch_ids)
-
-    inputs_batch = [
-        ops.convert_to_tensor(val, dtype=backend.floatx())
-        for val in inputs_batch
-    ]
-
-    if len(inputs_batch) == 1:
-      if model._expects_training_arg:
-        batch_outs = model.call(inputs_batch[0], training=False)
-      else:
-        batch_outs = model.call(inputs_batch[0])
-    else:
-      if model._expects_training_arg:
-        batch_outs = model.call(inputs_batch, training=False)
-      else:
-        batch_outs = model.call(inputs_batch)
-
-    if not isinstance(batch_outs, list):
-      batch_outs = [batch_outs]
-    if batch_index == 0:
-      # Pre-allocate the results arrays.
-      for batch_out in batch_outs:
-        dims = batch_out.shape[1:].dims
-        dims_list = [d.value for d in dims]
-        shape = (num_samples,) + tuple(dims_list)
-        outs.append(np.zeros(shape, dtype=batch_out.dtype.as_numpy_dtype))
-    for i, batch_out in enumerate(batch_outs):
-      outs[i][batch_start:batch_end] = batch_out
-    if verbose == 1:
-      progbar.update(batch_end)
-
-  if len(outs) == 1:
-    return outs[0]
-  return outs
-
-
-def slice_arrays(arrays, indices, contiguous=True):
-  """Slices batches out of provided arrays (workaround for eager tensors).
-
-  Unfortunately eager tensors don't have the same slicing behavior as
-  Numpy arrays (they follow the same slicing behavior as symbolic TF tensors),
-  hence we cannot use `generic_utils.slice_arrays` directly
-  and we have to implement this workaround based on `concat`. This has a
-  performance cost.
-
-  Arguments:
-    arrays: Single array or list of arrays.
-    indices: List of indices in the array that should be included in the output
-      batch.
-    contiguous: Boolean flag indicating whether the indices are contiguous.
-
-  Returns:
-    Slice of data (either single array or list of arrays).
-  """
-  if any(tensor_util.is_tensor(x) for x in arrays):
-    converted_to_list = False
-    if not isinstance(arrays, list):
-      converted_to_list = True
-      arrays = [arrays]
-    if not contiguous:
-      entries = [[x[i:i + 1] for i in indices] for x in arrays]
-      slices = [array_ops.concat(x, axis=0) for x in entries]
-    else:
-      slices = [x[indices[0]:indices[-1] + 1] for x in arrays]
-    if converted_to_list:
-      slices = slices[0]
-    return slices
-  else:
-    return generic_utils.slice_arrays(arrays, indices)
-
-
 def _process_single_batch(model,
                           inputs,
                           targets,
@@ -935,19 +657,24 @@ def fit_loop(model,
   Raises:
     ValueError: In case of invalid argument values.
   """
+  # Convert training inputs to an EagerIterator
+  inputs, steps_per_epoch = training_utils.convert_to_iterator(
+      x=inputs,
+      y=targets,
+      sample_weights=sample_weights,
+      batch_size=batch_size,
+      steps_per_epoch=steps_per_epoch,
+      epochs=epochs,
+      shuffle=shuffle)
   # Required for eager execution
   with backend.learning_phase_scope(1):
     do_validation = False
     if val_inputs:
       do_validation = True
-      if (steps_per_epoch is None and verbose and inputs and
-          hasattr(inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')):
-        print('Train on %d samples, validate on %d samples' %
-              (inputs[0].shape[0], val_inputs[0].shape[0]))
 
     num_train_samples = None
     out_labels = None
-    if steps_per_epoch is None or model._is_compiled:
+    if model._is_compiled:
       out_labels = model.metrics_names
       if do_validation:
         callback_metrics = copy.copy(out_labels) + [
@@ -956,28 +683,10 @@ def fit_loop(model,
       else:
         callback_metrics = copy.copy(out_labels)
 
-    if steps_per_epoch is None:
-      if sample_weights:
-        feed_data = inputs + targets + sample_weights
-      else:
-        feed_data = inputs + targets
-      num_train_samples = training_utils.check_num_samples(
-          feed_data,
-          batch_size=batch_size,
-          steps=steps_per_epoch,
-          steps_name='steps_per_epoch')
-
-      if num_train_samples is not None:
-        index_array = np.arange(num_train_samples)
-
     model.history = cbks.History()
     callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history]
     if verbose:
-      if steps_per_epoch is not None:
-        count_mode = 'steps'
-      else:
-        count_mode = 'samples'
-      callbacks += [cbks.ProgbarLogger(count_mode)]
+      callbacks += [cbks.ProgbarLogger('steps')]
     callbacks = cbks.CallbackList(callbacks)
 
     # it's possible to callback a different model than self
@@ -1019,43 +728,24 @@ def fit_loop(model,
     for epoch in range(initial_epoch, epochs):
       callbacks.on_epoch_begin(epoch)
       epoch_logs = {}
-
-      if steps_per_epoch is not None:
-        iterator_fit_loop(
-            model,
-            inputs,
-            class_weight,
-            steps_per_epoch=steps_per_epoch,
-            callback_model=callback_model,
-            out_labels=out_labels,
-            epoch_logs=epoch_logs,
-            val_inputs=val_inputs,
-            val_targets=val_targets,
-            val_sample_weights=val_sample_weights,
-            epochs=epochs,
-            verbose=verbose,
-            callbacks=callbacks,
-            callback_metrics=callback_metrics,
-            validation_steps=validation_steps,
-            do_validation=do_validation)
-      else:
-        batch_fit_loop(
-            model,
-            inputs,
-            targets,
-            epoch_logs=epoch_logs,
-            index_array=index_array,
-            out_labels=out_labels,
-            callback_model=callback_model,
-            batch_size=batch_size,
-            sample_weights=sample_weights,
-            val_inputs=val_inputs,
-            val_targets=val_targets,
-            val_sample_weights=val_sample_weights,
-            callbacks=callbacks,
-            shuffle=shuffle,
-            num_train_samples=num_train_samples,
-            do_validation=do_validation)
+      iterator_fit_loop(
+          model,
+          inputs,
+          class_weight,
+          steps_per_epoch=steps_per_epoch,
+          callback_model=callback_model,
+          out_labels=out_labels,
+          epoch_logs=epoch_logs,
+          val_inputs=val_inputs,
+          val_targets=val_targets,
+          val_sample_weights=val_sample_weights,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          callback_metrics=callback_metrics,
+          validation_steps=validation_steps,
+          do_validation=do_validation,
+          batch_size=batch_size)
       callbacks.on_epoch_end(epoch, epoch_logs)
       if callback_model.stop_training:
         break
@@ -1087,17 +777,14 @@ def test_loop(model, inputs, targets,
       and/or metrics). The attribute `model.metrics_names` will give you
       the display labels for the scalar outputs.
   """
+  inputs, steps = training_utils.convert_to_iterator(
+      x=inputs,
+      y=targets,
+      sample_weights=sample_weights,
+      batch_size=batch_size,
+      steps_per_epoch=steps)
   with backend.learning_phase_scope(0):
-    if steps is not None:
-      return iterator_test_loop(model, inputs, steps, verbose=verbose)
-    else:
-      return batch_test_loop(
-          model,
-          inputs,
-          targets,
-          batch_size=batch_size,
-          sample_weights=sample_weights,
-          verbose=verbose)
+    return iterator_test_loop(model, inputs, steps, verbose=verbose)
 
 
 def predict_loop(model, inputs,
@@ -1121,8 +808,6 @@ def predict_loop(model, inputs,
       (if the model has multiple outputs).
   """
   with backend.learning_phase_scope(0):
-    if steps is not None:
-      return iterator_predict_loop(model, inputs, steps, verbose=verbose)
-    else:
-      return batch_predict_loop(
-          model, inputs, batch_size=batch_size, verbose=verbose)
+    inputs, steps = training_utils.convert_to_iterator(
+        x=inputs, batch_size=batch_size, steps_per_epoch=steps)
+    return iterator_predict_loop(model, inputs, steps, verbose=verbose)
diff --git a/tensorflow/python/keras/engine/training_gpu_test.py b/tensorflow/python/keras/engine/training_gpu_test.py
new file mode 100644
index 0000000000..5825ce814f
--- /dev/null
+++ b/tensorflow/python/keras/engine/training_gpu_test.py
@@ -0,0 +1,125 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training routines."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.layers.convolutional import Conv2D
+from tensorflow.python.platform import test
+from tensorflow.python.training import rmsprop
+
+
+class TrainingGPUTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_model_with_crossentropy_losses_channels_first(self):
+    """Tests use of all crossentropy losses with `channels_first`.
+
+    Tests `sparse_categorical_crossentropy`, `categorical_crossentropy`,
+    and `binary_crossentropy`.
+    Verifies that evaluate gives the same result with either `channels_first`
+    or `channels_last` image_data_format.
+    """
+    def prepare_simple_model(input_tensor, loss_name, target):
+      axis = 1 if K.image_data_format() == 'channels_first' else -1
+      loss = None
+      num_channels = None
+      activation = None
+      if loss_name == 'sparse_categorical_crossentropy':
+        loss = lambda y_true, y_pred: K.sparse_categorical_crossentropy(  # pylint: disable=g-long-lambda
+            y_true, y_pred, axis=axis)
+        num_channels = np.amax(target) + 1
+        activation = 'softmax'
+      elif loss_name == 'categorical_crossentropy':
+        loss = lambda y_true, y_pred: K.categorical_crossentropy(  # pylint: disable=g-long-lambda
+            y_true, y_pred, axis=axis)
+        num_channels = target.shape[axis]
+        activation = 'softmax'
+      elif loss_name == 'binary_crossentropy':
+        loss = lambda y_true, y_pred: K.binary_crossentropy(y_true, y_pred)  # pylint: disable=unnecessary-lambda
+        num_channels = target.shape[axis]
+        activation = 'sigmoid'
+      predictions = Conv2D(num_channels,
+                           1,
+                           activation=activation,
+                           kernel_initializer='ones',
+                           bias_initializer='ones')(input_tensor)
+      simple_model = keras.models.Model(inputs=input_tensor,
+                                        outputs=predictions)
+      simple_model.compile(optimizer=rmsprop.RMSPropOptimizer(1e-3), loss=loss)
+      return simple_model
+
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        losses_to_test = ['sparse_categorical_crossentropy',
+                          'categorical_crossentropy', 'binary_crossentropy']
+
+        data_channels_first = np.array([[[[8., 7.1, 0.], [4.5, 2.6, 0.55],
+                                          [0.9, 4.2, 11.2]]]], dtype=np.float32)
+        # Labels for testing 4-class sparse_categorical_crossentropy, 4-class
+        # categorical_crossentropy, and 2-class binary_crossentropy:
+        labels_channels_first = [np.array([[[[0, 1, 3], [2, 1, 0], [2, 2, 1]]]], dtype=np.float32),  # pylint: disable=line-too-long
+                                 np.array([[[[0, 1, 0], [0, 1, 0], [0, 0, 0]],
+                                            [[1, 0, 0], [0, 0, 1], [0, 1, 0]],
+                                            [[0, 0, 0], [1, 0, 0], [0, 0, 1]],
+                                            [[0, 0, 1], [0, 0, 0], [1, 0, 0]]]], dtype=np.float32),  # pylint: disable=line-too-long
+                                 np.array([[[[0, 1, 0], [0, 1, 0], [0, 0, 1]],
+                                            [[1, 0, 1], [1, 0, 1], [1, 1, 0]]]], dtype=np.float32)]  # pylint: disable=line-too-long
+        # Compute one loss for each loss function in the list `losses_to_test`:
+        loss_channels_last = [0., 0., 0.]
+        loss_channels_first = [0., 0., 0.]
+
+        old_data_format = K.image_data_format()
+
+        # Evaluate a simple network with channels last, with all three loss
+        # functions:
+        K.set_image_data_format('channels_last')
+        data = np.moveaxis(data_channels_first, 1, -1)
+        for index, loss_function in enumerate(losses_to_test):
+          labels = np.moveaxis(labels_channels_first[index], 1, -1)
+          inputs = keras.Input(shape=(3, 3, 1))
+          model = prepare_simple_model(inputs, loss_function, labels)
+          loss_channels_last[index] = model.evaluate(x=data, y=labels,
+                                                     batch_size=1, verbose=0)
+
+        # Evaluate the same network with channels first, with all three loss
+        # functions:
+        K.set_image_data_format('channels_first')
+        data = data_channels_first
+        for index, loss_function in enumerate(losses_to_test):
+          labels = labels_channels_first[index]
+          inputs = keras.Input(shape=(1, 3, 3))
+          model = prepare_simple_model(inputs, loss_function, labels)
+          loss_channels_first[index] = model.evaluate(x=data, y=labels,
+                                                      batch_size=1, verbose=0)
+
+        K.set_image_data_format(old_data_format)
+
+        np.testing.assert_allclose(loss_channels_first,
+                                   loss_channels_last,
+                                   err_msg='{}{}'.format(
+                                       'Computed different losses for ',
+                                       'channels_first and channels_last'))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index d9e548f01f..301a6ca866 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import logging
 import os
 import unittest
 
@@ -415,6 +416,28 @@ class TrainingTest(test.TestCase):
       x2 = model.predict(val_a)
       self.assertAllClose(x1, x2, atol=1e-7)
 
+  def test_compile_warning_for_loss_missing_output(self):
+    with self.test_session():
+      inp = keras.layers.Input(shape=(16,), name='input_a')
+      out_1 = keras.layers.Dense(8, name='dense_1')(inp)
+      out_2 = keras.layers.Dense(3, activation='softmax', name='dense_2')(out_1)
+      model = keras.models.Model(inputs=[inp], outputs=[out_1, out_2])
+
+      with test.mock.patch.object(logging, 'warning') as mock_log:
+        model.compile(
+            loss={
+                'dense_2': 'categorical_crossentropy',
+            },
+            optimizer='rmsprop',
+            metrics={
+                'dense_2': 'categorical_accuracy',
+                'dense_1': 'categorical_accuracy',
+            })
+        msg = ('Output "dense_1" missing from loss dictionary. We assume this '
+               'was done on purpose. The fit and evaluate APIs will not be '
+               'expecting any data to be passed to "dense_1".')
+        self.assertRegexpMatches(str(mock_log.call_args), msg)
+
 
 class LossWeightingTest(test.TestCase):
 
@@ -744,6 +767,22 @@ class LossMaskingTest(test.TestCase):
               keras.backend.variable(weights), keras.backend.variable(mask)))
 
 
+class LearningPhaseTest(test.TestCase):
+
+  def test_empty_model_no_learning_phase(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      self.assertFalse(model.uses_learning_phase)
+
+  def test_dropout_has_learning_phase(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_dim=3))
+      model.add(keras.layers.Dropout(0.5))
+      model.add(keras.layers.Dense(2))
+      self.assertTrue(model.uses_learning_phase)
+
+
 class TestDynamicTrainability(test.TestCase):
 
   def test_trainable_warning(self):
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index 728a2b493b..dbbc87daf9 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -19,9 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
+import math
 
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_util
@@ -31,6 +33,135 @@ from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.ops import math_ops
 
 
+def _map_nested(data, func):
+  """Maps each nested element using func."""
+  if isinstance(data, list):
+    return [_map_nested(nested_data, func) for nested_data in data]
+  elif isinstance(data, tuple):
+    return tuple(_map_nested(nested_data, func) for nested_data in data)
+  elif isinstance(data, dict):
+    return {
+        k: _map_nested(nested_data, func) for k, nested_data in data.items()
+    }
+  else:
+    return func(data)
+
+
+def _nested_all(data, cond_func):
+  """Checks if all elements in a nested structure satisfy cond_func."""
+  if isinstance(data, (tuple, list)):
+    return all([_nested_all(nested_data, cond_func) for nested_data in data])
+  elif isinstance(data, dict):
+    return all(
+        [_nested_all(nested_data, cond_func) for nested_data in data.values()])
+  else:
+    return cond_func(data)
+
+
+def _nested_any(data, cond_func):
+  """Checks if any nested_elements in a nested structure satisfy cond_func."""
+  if isinstance(data, (tuple, list)):
+    return any([_nested_any(nested_data, cond_func) for nested_data in data])
+  elif isinstance(data, dict):
+    return any(
+        [_nested_any(nested_data, cond_func) for nested_data in data.values()])
+  else:
+    return cond_func(data)
+
+
+def _convert_lists_to_tuples(data):
+  """Converts all lists to tuples, since Datasets expect tuples."""
+  if isinstance(data, (tuple, list)):
+    return tuple(_convert_lists_to_tuples(nested_data) for nested_data in data)
+  elif isinstance(data, dict):
+    return {
+        k: _convert_lists_to_tuples(nested_data)
+        for k, nested_data in data.items()
+    }
+  else:
+    return data
+
+
+def _get_batch_axis_size(data):
+  """Returns batch axis shape for nested data."""
+  if isinstance(data, (tuple, list)):
+    return _get_batch_axis_size(data[0])
+  elif isinstance(data, dict):
+    return _get_batch_axis_size(list(data.values()))
+  else:
+    return int(data.shape[0])
+
+
+def convert_to_iterator(x=None,
+                        y=None,
+                        sample_weights=None,
+                        batch_size=None,
+                        steps_per_epoch=None,
+                        epochs=1,
+                        shuffle=False):
+  """Converts NumPy arrays or EagerTensors to an EagerIterator.
+
+  Combines all provided data into a single EagerIterator.
+
+  Arguments:
+      x: NumPy array or EagerTensor,  or list of Numpy arrays or EagerTensors
+        representing inputs to a model.
+      y: Optional. NumPy array or EagerTensor, or list of Numpy arrays or
+        EagerTensors representing targets of a model.
+      sample_weights: Optional NumPy array or EagerTensor representing sample
+        weights.
+      batch_size: Used to batch data and calculate how many steps EagerIterator
+        should take per epoch.
+      steps_per_epoch: If provided, how many steps EagerIterator should take per
+        epoch.
+      epochs: Epochs to repeat iterator for.
+      shuffle: Whether to shuffle data after each epoch.
+
+  Raises:
+      ValueError: if steps_per_epoch cannot be calculated from the data
+      provided.
+
+  Returns:
+      (Iterator, steps_per_epoch).
+
+  """
+  if isinstance(x, iterator_ops.EagerIterator):
+    return x, steps_per_epoch
+
+  if not _nested_any(sample_weights, lambda x: x is None):
+    data = (x, y, sample_weights)
+  elif not _nested_any(y, lambda x: x is None):
+    data = (x, y)
+  else:
+    # always wrap in a tuple, so we know y, sample_weights weren't set
+    # even when x has multiple elements
+    data = (x,)
+
+  data = _convert_lists_to_tuples(data)
+  if steps_per_epoch is None and batch_size is not None:
+    num_samples = _get_batch_axis_size(data)
+    steps_per_epoch = int(math.ceil(num_samples / batch_size))
+
+  if steps_per_epoch is None:
+    raise ValueError('Could not determine steps_per_epoch.'
+                     'Please provide either batch_size or'
+                     'steps_per_epoch.')
+
+  # TODO(omalleyt) for NumPy arrays in graph mode
+  # placeholder ops should be used
+  # this is only ideal for eager mode
+  dataset = dataset_ops.Dataset.from_tensor_slices(data)
+
+  if batch_size is not None:
+    dataset = dataset.batch(batch_size)
+  if shuffle:
+    dataset = dataset.shuffle(buffer_size=10000)
+  dataset = dataset.repeat(epochs)
+  iterator = dataset.make_one_shot_iterator()
+
+  return iterator, steps_per_epoch
+
+
 def check_num_samples(ins,
                       batch_size=None,
                       steps=None,
@@ -128,8 +259,8 @@ def standardize_input_data(data,
     except KeyError as e:
       raise ValueError('No data provided for "' + e.args[0] + '". Need data '
                        'for each key in: ' + str(names))
-  elif isinstance(data, list):
-    if isinstance(data[0], list):
+  elif isinstance(data, (list, tuple)):
+    if isinstance(data[0], (list, tuple)):
       data = [np.asarray(d) for d in data]
     elif len(names) == 1 and isinstance(data[0], (float, int)):
       data = [np.asarray(data)]
@@ -482,6 +613,9 @@ def standardize_weights(y,
   Raises:
       ValueError: In case of invalid user-provided arguments.
   """
+  # Iterator may return sample_weight as 1-tuple
+  if isinstance(sample_weight, tuple):
+    sample_weight = sample_weight[0]
   if sample_weight_mode is not None:
     if sample_weight_mode != 'temporal':
       raise ValueError('"sample_weight_mode '
diff --git a/tensorflow/python/keras/engine/training_utils_test.py b/tensorflow/python/keras/engine/training_utils_test.py
new file mode 100644
index 0000000000..297a1ae494
--- /dev/null
+++ b/tensorflow/python/keras/engine/training_utils_test.py
@@ -0,0 +1,150 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training utility functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.platform import test
+
+
+class TrainingUtilTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_convert_to_iterator_single_numpy(self):
+    batch_size = 2
+    a = np.ones([10, 10])
+    iterator, steps_per_epoch = training_utils.convert_to_iterator(
+        x=a, batch_size=batch_size)
+    self.assertEquals(steps_per_epoch, 5)
+
+    expected_batch = a[:batch_size, :]
+    actual_batch, = iterator.get_next()
+    self.assertAllEqual(expected_batch, actual_batch)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_convert_to_iterator_single_tensor(self):
+    batch_size = 2
+    a = ops.convert_to_tensor(np.ones([10, 10]))
+    iterator, steps_per_epoch = training_utils.convert_to_iterator(
+        x=a, batch_size=batch_size)
+    self.assertEquals(steps_per_epoch, 5)
+
+    expected_batch = a[:batch_size, :]
+    actual_batch, = iterator.get_next()
+    self.assertAllEqual(expected_batch, actual_batch)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_convert_to_iterator_y(self):
+    batch_size = 2
+    a = np.ones([10, 100])
+    b = np.ones([10, 10])
+    iterator, steps_per_epoch = training_utils.convert_to_iterator(
+        x=a, y=b, batch_size=batch_size)
+    self.assertEquals(steps_per_epoch, 5)
+
+    expected_x = a[:batch_size, :]
+    expected_y = b[:batch_size, :]
+    actual_x, actual_y = iterator.get_next()
+    self.assertAllEqual(expected_x, actual_x)
+    self.assertAllEqual(expected_y, actual_y)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_convert_to_iterator_sample_weights(self):
+    batch_size = 2
+    a = ops.convert_to_tensor(np.ones([10, 100]))
+    b = ops.convert_to_tensor(np.ones([10, 10]))
+    sw = ops.convert_to_tensor(np.ones([10]))
+    iterator, steps_per_epoch = training_utils.convert_to_iterator(
+        x=a, y=b, sample_weights=sw, batch_size=batch_size)
+    self.assertEquals(steps_per_epoch, 5)
+
+    expected_x = a[:batch_size, :]
+    expected_y = b[:batch_size, :]
+    expected_sw = sw[:batch_size]
+    actual_x, actual_y, actual_sw = iterator.get_next()
+    self.assertAllEqual(expected_x, actual_x)
+    self.assertAllEqual(expected_y, actual_y)
+    self.assertAllEqual(expected_sw, actual_sw)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_convert_to_iterator_nested(self):
+    batch_size = 2
+    x = {'1': np.ones([10, 100]), '2': [np.zeros([10, 10]), np.ones([10, 20])]}
+    iterator, steps_per_epoch = training_utils.convert_to_iterator(
+        x=x, batch_size=batch_size)
+    self.assertEquals(steps_per_epoch, 5)
+
+    expected_x1 = x['1'][:batch_size, :]
+    expected_x2_0 = x['2'][0][:batch_size, :]
+    expected_x2_1 = x['2'][1][:batch_size, :]
+
+    actual_x, = iterator.get_next()
+    actual_x1 = actual_x['1'][:batch_size, :]
+    actual_x2_0 = actual_x['2'][0][:batch_size, :]
+    actual_x2_1 = actual_x['2'][1][:batch_size, :]
+
+    self.assertAllEqual(expected_x1, actual_x1)
+    self.assertAllEqual(expected_x2_0, actual_x2_0)
+    self.assertAllEqual(expected_x2_1, actual_x2_1)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_convert_to_iterator_epochs(self):
+    batch_size = 2
+    a = np.ones([10, 10])
+    iterator, steps_per_epoch = training_utils.convert_to_iterator(
+        x=a, batch_size=batch_size, epochs=2)
+    self.assertEquals(steps_per_epoch, 5)
+
+    expected_batch = a[:batch_size, :]
+    # loop through one whole epoch
+    for _ in range(6):
+      actual_batch, = iterator.get_next()
+    self.assertAllEqual(expected_batch, actual_batch)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_convert_to_iterator_insufficient_info(self):
+    # with batch_size and steps_per_epoch not set
+    with self.assertRaises(ValueError):
+      a = np.ones([10, 10])
+      _ = training_utils.convert_to_iterator(x=a)
+
+  def test_nested_all(self):
+    nested_data = {'a': True, 'b': [True, True, (False, True)]}
+    all_true = training_utils._nested_all(nested_data, lambda x: x)
+    self.assertEquals(all_true, False)
+
+    nested_data = {'a': True, 'b': [True, True, (True, True)]}
+    all_true = training_utils._nested_all(nested_data, lambda x: x)
+    self.assertEquals(all_true, True)
+
+  def test_nested_any(self):
+    nested_data = [False, {'a': False, 'b': (False, True)}]
+    any_true = training_utils._nested_any(nested_data, lambda x: x)
+    self.assertEquals(any_true, True)
+
+    nested_data = [False, {'a': False, 'b': (False, False)}]
+    any_true = training_utils._nested_any(nested_data, lambda x: x)
+    self.assertEquals(any_true, False)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/initializers.py b/tensorflow/python/keras/initializers.py
index 28beb6760d..b9d856efa8 100644
--- a/tensorflow/python/keras/initializers.py
+++ b/tensorflow/python/keras/initializers.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras initializer classes (soon to be replaced with core TF initializers).
+"""Keras initializer serialization / deserialization.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -22,107 +22,27 @@ import six
 
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
+
+# These imports are brought in so that keras.initializers.deserialize
+# has them available in module_objects.
 from tensorflow.python.ops.init_ops import Constant
 from tensorflow.python.ops.init_ops import glorot_normal_initializer
 from tensorflow.python.ops.init_ops import glorot_uniform_initializer
-
+from tensorflow.python.ops.init_ops import he_normal  # pylint: disable=unused-import
+from tensorflow.python.ops.init_ops import he_uniform  # pylint: disable=unused-import
 from tensorflow.python.ops.init_ops import Identity
 from tensorflow.python.ops.init_ops import Initializer  # pylint: disable=unused-import
+from tensorflow.python.ops.init_ops import lecun_normal  # pylint: disable=unused-import
+from tensorflow.python.ops.init_ops import lecun_uniform  # pylint: disable=unused-import
 from tensorflow.python.ops.init_ops import Ones
 from tensorflow.python.ops.init_ops import Orthogonal
 from tensorflow.python.ops.init_ops import RandomNormal
 from tensorflow.python.ops.init_ops import RandomUniform
 from tensorflow.python.ops.init_ops import TruncatedNormal
-from tensorflow.python.ops.init_ops import VarianceScaling
+from tensorflow.python.ops.init_ops import VarianceScaling  # pylint: disable=unused-import
 from tensorflow.python.ops.init_ops import Zeros
-from tensorflow.python.util.tf_export import tf_export
-
-
-@tf_export('keras.initializers.lecun_normal')
-def lecun_normal(seed=None):
-  """LeCun normal initializer.
-
-  It draws samples from a truncated normal distribution centered on 0
-  with `stddev = sqrt(1 / fan_in)`
-  where `fan_in` is the number of input units in the weight tensor.
-
-  Arguments:
-      seed: A Python integer. Used to seed the random generator.
-
-  Returns:
-      An initializer.
-
-  References:
-      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-      - [Efficient
-      Backprop](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
-  """
-  return VarianceScaling(
-      scale=1., mode='fan_in', distribution='normal', seed=seed)
-
-
-@tf_export('keras.initializers.lecun_uniform')
-def lecun_uniform(seed=None):
-  """LeCun uniform initializer.
-
-  It draws samples from a uniform distribution within [-limit, limit]
-  where `limit` is `sqrt(3 / fan_in)`
-  where `fan_in` is the number of input units in the weight tensor.
-
-  Arguments:
-      seed: A Python integer. Used to seed the random generator.
-
-  Returns:
-      An initializer.
 
-  References:
-      LeCun 98, Efficient Backprop,
-      http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
-  """
-  return VarianceScaling(
-      scale=1., mode='fan_in', distribution='uniform', seed=seed)
-
-
-@tf_export('keras.initializers.he_normal')
-def he_normal(seed=None):
-  """He normal initializer.
-
-  It draws samples from a truncated normal distribution centered on 0
-  with `stddev = sqrt(2 / fan_in)`
-  where `fan_in` is the number of input units in the weight tensor.
-
-  Arguments:
-      seed: A Python integer. Used to seed the random generator.
-
-  Returns:
-      An initializer.
-
-  References:
-      He et al., http://arxiv.org/abs/1502.01852
-  """
-  return VarianceScaling(
-      scale=2., mode='fan_in', distribution='normal', seed=seed)
-
-
-@tf_export('keras.initializers.he_uniform')
-def he_uniform(seed=None):
-  """He uniform variance scaling initializer.
-
-  It draws samples from a uniform distribution within [-limit, limit]
-  where `limit` is `sqrt(6 / fan_in)`
-  where `fan_in` is the number of input units in the weight tensor.
-
-  Arguments:
-      seed: A Python integer. Used to seed the random generator.
-
-  Returns:
-      An initializer.
-
-  References:
-      He et al., http://arxiv.org/abs/1502.01852
-  """
-  return VarianceScaling(
-      scale=2., mode='fan_in', distribution='uniform', seed=seed)
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Compatibility aliases
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py
index c519e194bd..51725e03f2 100644
--- a/tensorflow/python/keras/initializers_test.py
+++ b/tensorflow/python/keras/initializers_test.py
@@ -31,16 +31,6 @@ class KerasInitializersTest(test.TestCase):
               target_max=None, target_min=None):
     variable = keras.backend.variable(init(shape))
     output = keras.backend.get_value(variable)
-    lim = 3e-2
-    if target_std is not None:
-      self.assertGreater(lim, abs(output.std() - target_std))
-    if target_mean is not None:
-      self.assertGreater(lim, abs(output.mean() - target_mean))
-    if target_max is not None:
-      self.assertGreater(lim, abs(output.max() - target_max))
-    if target_min is not None:
-      self.assertGreater(lim, abs(output.min() - target_min))
-
     # Test serialization (assumes deterministic behavior).
     config = init.get_config()
     reconstructed_init = init.__class__.from_config(config)
diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py
index eba10da6f3..61ab69c16f 100644
--- a/tensorflow/python/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/layers/advanced_activations.py
@@ -284,6 +284,13 @@ class Softmax(Layer):
 class ReLU(Layer):
   """Rectified Linear Unit activation function.
 
+  With default values, it returns element-wise `max(x, 0)`.
+
+  Otherwise, it follows:
+  `f(x) = max_value` for `x >= max_value`,
+  `f(x) = x` for `threshold <= x < max_value`,
+  `f(x) = negative_slope * (x - threshold)` otherwise.
+
   Input shape:
       Arbitrary. Use the keyword argument `input_shape`
       (tuple of integers, does not include the samples axis)
@@ -294,21 +301,39 @@ class ReLU(Layer):
 
   Arguments:
       max_value: float >= 0. Maximum activation value.
+      negative_slope: float >= 0. Negative slope coefficient.
+      threshold: float. Threshold value for thresholded activation.
   """
 
-  def __init__(self, max_value=None, **kwargs):
+  def __init__(self, max_value=None, negative_slope=0, threshold=0, **kwargs):
     super(ReLU, self).__init__(**kwargs)
-    self.support_masking = True
-    self.max_value = K.cast_to_floatx(max_value)
-    if self.max_value < 0.:
+    if max_value is not None and max_value < 0.:
       raise ValueError('max_value of Relu layer '
                        'cannot be negative value: ' + str(max_value))
+    if negative_slope < 0.:
+      raise ValueError('negative_slope of Relu layer '
+                       'cannot be negative value: ' + str(negative_slope))
+
+    self.support_masking = True
+    self.max_value = K.cast_to_floatx(max_value)
+    self.negative_slope = K.cast_to_floatx(negative_slope)
+    self.threshold = K.cast_to_floatx(threshold)
 
   def call(self, inputs):
-    return activations.relu(inputs, max_value=self.max_value)
+    # alpha is used for leaky relu slope in activations instead of
+    # negative_slope.
+    return activations.relu(
+        inputs,
+        alpha=self.negative_slope,
+        max_value=self.max_value,
+        threshold=self.threshold)
 
   def get_config(self):
-    config = {'max_value': self.max_value}
+    config = {
+        'max_value': self.max_value,
+        'negative_slope': self.negative_slope,
+        'threshold': self.threshold
+    }
     base_config = super(ReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
diff --git a/tensorflow/python/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/layers/advanced_activations_test.py
index 9e1f15b1bc..53c1baa2bb 100644
--- a/tensorflow/python/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/layers/advanced_activations_test.py
@@ -75,6 +75,14 @@ class AdvancedActivationsTest(test.TestCase):
         testing_utils.layer_test(keras.layers.ReLU,
                                  kwargs={'max_value': -10},
                                  input_shape=(2, 3, 4))
+    with self.assertRaisesRegexp(
+        ValueError,
+        'negative_slope of Relu layer cannot be negative value: -2'):
+      with self.test_session():
+        testing_utils.layer_test(
+            keras.layers.ReLU,
+            kwargs={'negative_slope': -2},
+            input_shape=(2, 3, 4))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py
index 84d794cada..e61dd3043d 100644
--- a/tensorflow/python/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/layers/convolutional_recurrent.py
@@ -788,7 +788,7 @@ class ConvLSTM2D(ConvRNN2D):
 
   Arguments:
     filters: Integer, the dimensionality of the output space
-        (i.e. the number output of filters in the convolution).
+        (i.e. the number of output filters in the convolution).
     kernel_size: An integer or tuple/list of n integers, specifying the
         dimensions of the convolution window.
     strides: An integer or tuple/list of n integers,
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
index 8fd970239f..2ed0aa8f26 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent_test.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
@@ -220,7 +220,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
         self.assertNotEqual(out4.max(), out5.max())
 
   @parameterized.named_parameters(
-      *testing_utils.generate_combinations_with_testcase_name(
+      *test_util.generate_combinations_with_testcase_name(
           rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False],
           bidirectional=[True, False], implementation=[1, 2],
           model_nest_level=[1, 2], model_type=['seq', 'func']))
@@ -301,7 +301,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
     os.remove(fname)
 
   @parameterized.named_parameters(
-      *testing_utils.generate_combinations_with_testcase_name(
+      *test_util.generate_combinations_with_testcase_name(
           rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False]))
   def test_load_weights_between_noncudnn_rnn_time_distributed(self, rnn_type,
                                                               to_cudnn):
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index 58c8a8a66d..a7835bc0a2 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -370,7 +370,7 @@ class BatchNormalization(Layer):
         decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
         if decay.dtype != variable.dtype.base_dtype:
           decay = math_ops.cast(decay, variable.dtype.base_dtype)
-        update_delta = (variable - value) * decay
+        update_delta = (variable - math_ops.cast(value, variable.dtype)) * decay
         return state_ops.assign_sub(variable, update_delta, name=scope)
 
   def _fused_batch_norm(self, inputs, training):
@@ -619,6 +619,10 @@ class BatchNormalization(Layer):
     else:
       mean, variance = self.moving_mean, self.moving_variance
 
+    mean = math_ops.cast(mean, inputs.dtype)
+    variance = math_ops.cast(variance, inputs.dtype)
+    if offset is not None:
+      offset = math_ops.cast(offset, inputs.dtype)
     outputs = nn.batch_normalization(inputs,
                                      _broadcast(mean),
                                      _broadcast(variance),
diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index b22f3bd152..a97b4cac46 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -95,6 +95,24 @@ class NormalizationLayersTest(test.TestCase):
       np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
       np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
 
+  def test_batchnorm_mixed_precision(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
+      model.add(norm)
+      model.compile(loss='mse', optimizer='sgd')
+
+      # centered on 5.0, variance 10.0
+      x = np.random.normal(
+          loc=5.0, scale=10.0, size=(1000, 10)).astype(np.float16)
+      model.fit(x, x, epochs=4, verbose=0)
+      out = model.predict(x)
+      out -= keras.backend.eval(norm.beta)
+      out /= keras.backend.eval(norm.gamma)
+
+      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+
   def test_batchnorm_convnet(self):
     if test.is_gpu_available(cuda_only=True):
       with self.test_session(use_gpu=True):
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index 61775da47b..534c0eca08 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -37,6 +37,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -403,6 +404,8 @@ class RNN(Layer):
                        'one integer per RNN state).')
     super(RNN, self).__init__(**kwargs)
     self.cell = cell
+    if isinstance(cell, checkpointable.CheckpointableBase):
+      self._track_checkpointable(self.cell, name='cell')
     self.return_sequences = return_sequences
     self.return_state = return_state
     self.go_backwards = go_backwards
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index 802374d2d2..fefb92826b 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
+from tensorflow.python.training.checkpointable import util as checkpointable_util
 
 
 class RNNTest(test.TestCase):
@@ -556,5 +557,22 @@ class RNNTest(test.TestCase):
         [tuple(o.as_list()) for o in output_shape],
         expected_output_shape)
 
+  def test_checkpointable_dependencies(self):
+    rnn = keras.layers.SimpleRNN
+    with self.test_session():
+      x = np.random.random((2, 2, 2))
+      y = np.random.random((2, 2))
+      model = keras.models.Sequential()
+      model.add(rnn(2))
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.fit(x, y, epochs=1, batch_size=1)
+
+      # check whether the model variables are present in the
+      # checkpointable list of objects
+      checkpointed_objects = set(checkpointable_util.list_objects(model))
+      for v in model.variables:
+        self.assertIn(v, checkpointed_objects)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index f651e03874..f0c1e76156 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -47,7 +47,6 @@ class Wrapper(Layer):
   def __init__(self, layer, **kwargs):
     assert isinstance(layer, Layer)
     self.layer = layer
-    self._track_checkpointable(layer, name='layer')
     # Tracks mapping of Wrapper inputs to inner layer inputs. Useful when
     # the inner layer has update ops that depend on its inputs (as opposed
     # to the inputs to the Wrapper layer).
@@ -168,6 +167,7 @@ class TimeDistributed(Wrapper):
           '`Layer` instance. You passed: {input}'.format(input=layer))
     super(TimeDistributed, self).__init__(layer, **kwargs)
     self.supports_masking = True
+    self._track_checkpointable(layer, name='layer')
 
   def _get_shape_tuple(self, init_tuple, tensor, start_idx, int_shape=None):
     """Finds non-specific dimensions in the static shapes.
@@ -417,6 +417,8 @@ class Bidirectional(Wrapper):
     self._num_constants = None
     super(Bidirectional, self).__init__(layer, **kwargs)
     self.input_spec = layer.input_spec
+    self._track_checkpointable(self.forward_layer, name='forward_layer')
+    self._track_checkpointable(self.backward_layer, name='backward_layer')
 
   @property
   def trainable(self):
@@ -526,7 +528,8 @@ class Bidirectional(Wrapper):
     else:
       return super(Bidirectional, self).__call__(inputs, **kwargs)
 
-  def call(self, inputs,
+  def call(self,
+           inputs,
            training=None,
            mask=None,
            initial_state=None,
diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
index 3f268acf5c..0cd774ef0f 100644
--- a/tensorflow/python/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -87,6 +87,8 @@ class TimeDistributedTest(test.TestCase):
     # test config
     model.get_config()
 
+    # check whether the model variables are present in the
+    # checkpointable list of objects
     checkpointed_objects = set(checkpointable_util.list_objects(model))
     for v in model.variables:
       self.assertIn(v, checkpointed_objects)
@@ -278,6 +280,12 @@ class BidirectionalTest(test.TestCase):
         model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse')
         model.fit(x, y, epochs=1, batch_size=1)
 
+        # check whether the model variables are present in the
+        # checkpointable list of objects
+        checkpointed_objects = set(checkpointable_util.list_objects(model))
+        for v in model.variables:
+          self.assertIn(v, checkpointed_objects)
+
         # test compute output shape
         ref_shape = model.layers[-1].output.get_shape()
         shape = model.layers[-1].compute_output_shape(
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index e03d7dfe93..7d8b1fec45 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -19,9 +19,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from abc import ABCMeta
+from abc import abstractmethod
+
+import types
 import six
 
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.losses import binary_crossentropy
 from tensorflow.python.keras.losses import categorical_crossentropy
 from tensorflow.python.keras.losses import cosine_proximity
@@ -37,14 +46,471 @@ from tensorflow.python.keras.losses import sparse_categorical_crossentropy
 from tensorflow.python.keras.losses import squared_hinge
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import confusion_matrix
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import tf_export
 
 
+def check_is_tensor_or_operation(x, name):
+  """Raises type error if the given input is not a tensor or operation."""
+  if not (isinstance(x, ops.Tensor) or isinstance(x, ops.Operation)):
+    raise TypeError('{0} must be a Tensor or Operation, given: {1}'.format(
+        name, x))
+
+
+def update_state_wrapper(update_state_fn):
+  """Decorator to wrap metric `update_state()` with `defun()`, `add_update()`.
+
+  Args:
+    update_state_fn: function that accumulates metric statistics.
+
+  Returns:
+    If eager execution is enabled, returns None.
+    If graph execution is enabled, returns an update op. This op should be
+      executed to update the metric state with the given inputs.
+  """
+
+  def decorated(metric_obj, *args, **kwargs):
+    """Decorated function with `defun()` and `add_update()`."""
+
+    # Converting update_state_fn() into a graph function, so that
+    # we can return a single op that performs all of the variable updates.
+    # Assigning to a different method name to avoid reference cycle.
+    defuned_update_state_fn = function.defun(update_state_fn)
+    update_op = defuned_update_state_fn(*args, **kwargs)
+    if update_op is not None:  # update_op will be None in eager execution.
+      metric_obj.add_update(update_op, inputs=True)
+      check_is_tensor_or_operation(
+          update_op, 'Metric {0}\'s update'.format(metric_obj.name))
+    return update_op
+
+  return tf_decorator.make_decorator(update_state_fn, decorated)
+
+
+def result_wrapper(result_fn):
+  """Decorator to wrap metric `result()` function in `merge_call()`.
+
+  Result computation is an idempotent operation that simply calculates the
+  metric value using the state variables.
+
+  If metric state variables are distributed across towers/devices and
+  `result()` is requested from the context of one device - This function wraps
+  `result()` in a distribution strategy `merge_call()`. With this,
+  the metric state variables will be aggregated across devices.
+
+  Args:
+    result_fn: function that computes the metric result.
+
+  Returns:
+    The metric result tensor.
+  """
+
+  def decorated(metric_obj, *args):
+    """Decorated function with merge_call."""
+    tower_context = distribute_lib.get_tower_context()
+    if tower_context is None:  # if in cross tower context already
+      result_t = result_fn(*args)
+    else:
+      # TODO(psv): Test distribution of metrics using different distribution
+      # strategies.
+
+      # Creating a wrapper for merge_fn. merge_call invokes the given merge_fn
+      # with distribution object as the first parameter. We create a wrapper
+      # here so that the result function need not have that parameter.
+      def merge_fn_wrapper(distribution, merge_fn, *args):
+        # We will get `PerDevice` merge function. Taking the first one as all
+        # are identical copies of the function that we had passed below.
+        return distribution.unwrap(merge_fn)[0](*args)
+
+      # Wrapping result in merge_call. merge_call is used when we want to leave
+      # tower mode and compute a value in cross tower mode.
+      result_t = tower_context.merge_call(merge_fn_wrapper, result_fn, *args)
+    check_is_tensor_or_operation(result_t,
+                                 'Metric {0}\'s result'.format(metric_obj.name))
+    return result_t
+
+  return tf_decorator.make_decorator(result_fn, decorated)
+
+
+def _safe_div(numerator, denominator):
+  """Divides two tensors element-wise, returning 0 if the denominator is <= 0.
+
+  Args:
+    numerator: A `Tensor`.
+    denominator: A `Tensor`, with dtype matching `numerator`.
+
+  Returns:
+    0 if `denominator` <= 0, else `numerator` / `denominator`
+  """
+  t = math_ops.truediv(numerator, denominator)
+  zero = array_ops.zeros_like(t, dtype=denominator.dtype)
+  condition = math_ops.greater(denominator, zero)
+  zero = math_ops.cast(zero, t.dtype)
+  return array_ops.where(condition, t, zero)
+
+
+def _squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
+  """Squeeze or expand last dimension if needed.
+
+  1. Squeezes last dim of `y_pred` or `y_true` if their rank differs by 1
+  (using `confusion_matrix.remove_squeezable_dimensions`).
+  2. Squeezes or expands last dim of `sample_weight` if its rank differs by 1
+  from the new rank of `y_pred`.
+  If `sample_weight` is scalar, it is kept scalar.
+
+  This will use static shape if available. Otherwise, it will add graph
+  operations, which could result in a performance hit.
+
+  Args:
+    y_pred: Predicted values, a `Tensor` of arbitrary dimensions.
+    y_true: Optional label `Tensor` whose dimensions match `y_pred`.
+    sample_weight: Optional weight scalar or `Tensor` whose dimensions match
+      `y_pred`.
+
+  Returns:
+    Tuple of `y_pred`, `y_true` and `sample_weight`. Each of them possibly has
+    the last dimension squeezed,
+    `sample_weight` could be extended by one dimension.
+  """
+  if y_true is not None:
+    # squeeze last dim of `y_pred` or `y_true` if their rank differs by 1
+    y_true, y_pred = confusion_matrix.remove_squeezable_dimensions(
+        y_true, y_pred)
+    y_pred.get_shape().assert_is_compatible_with(y_true.get_shape())
+
+  if sample_weight is None:
+    return y_pred, y_true, None
+
+  sample_weight = ops.convert_to_tensor(sample_weight)
+  weights_shape = sample_weight.get_shape()
+  weights_rank = weights_shape.ndims
+  if weights_rank == 0:  # If weights is scalar, do nothing.
+    return y_pred, y_true, sample_weight
+
+  y_pred_shape = y_pred.get_shape()
+  y_pred_rank = y_pred_shape.ndims
+  if (y_pred_rank is not None) and (weights_rank is not None):
+    # Use static rank.
+    if weights_rank - y_pred_rank == 1:
+      sample_weight = array_ops.squeeze(sample_weight, [-1])
+    elif y_pred_rank - weights_rank == 1:
+      sample_weight = array_ops.expand_dims(sample_weight, [-1])
+    return y_pred, y_true, sample_weight
+
+  # Use dynamic rank.
+  weights_rank_tensor = array_ops.rank(sample_weight)
+  rank_diff = weights_rank_tensor - array_ops.rank(y_pred)
+  maybe_squeeze_weights = lambda: array_ops.squeeze(sample_weight, [-1])
+
+  def _maybe_expand_weights():
+    return control_flow_ops.cond(
+        math_ops.equal(rank_diff,
+                       -1), lambda: array_ops.expand_dims(sample_weight, [-1]),
+        lambda: sample_weight)
+
+  def _maybe_adjust_weights():
+    return control_flow_ops.cond(
+        math_ops.equal(rank_diff, 1), maybe_squeeze_weights,
+        _maybe_expand_weights)
+
+  # squeeze or expand last dim of `sample_weight` if its rank differs by 1
+  # from the new rank of `y_pred`.
+  sample_weight = control_flow_ops.cond(
+      math_ops.equal(weights_rank_tensor, 0), lambda: sample_weight,
+      _maybe_adjust_weights)
+  return y_pred, y_true, sample_weight
+
+
+class Metric(Layer):
+  """Encapsulates metric logic and state.
+
+  Usage with eager execution:
+
+  ```python
+  m = SomeMetric(...)
+  for input in ...:
+    m.update_state(input)
+  print('Final result: ', m.result().numpy())
+  ```
+
+  Usage with graph execution:
+
+  ```python
+  m = SomeMetric(...)
+  init_op = tf.global_variables_initializer()  # Initialize variables
+  with tf.Session() as sess:
+    sess.run(init_op)
+    for input in ...:
+      update_op = m.update_state(input)
+      sess.run(update_op)
+    print('Final result: ', sess.run(m.result()))
+  ```
+
+  To be implemented by subclasses:
+  * `__init__()`: All state variables should be created in this method by
+    calling `self.add_weight()` like: `self.var = self.add_weight(...)`
+  * `update_state()`: Has all updates to the state variables like:
+    self.var.assign_add(...).
+  * `result()`: Computes and returns a value for the metric
+    from the state variables.
+
+  Example subclass implementation:
+
+  ```
+  class BinaryTruePositives(Metric):
+    def __init__(self, name='binary-true-positives', dtype=None):
+      super(BinaryTruePositives, self).__init__(name=name, dtype=dtype)
+      self.true_positives = self.add_weight(
+          'true_positives', initializer=init_ops.zeros_initializer)
+
+    def update_state(self, y_true, y_pred, sample_weight=None):
+      y_true = math_ops.cast(y_true, dtypes.bool)
+      y_pred = math_ops.cast(y_pred, dtypes.bool)
+      y_pred, y_true, sample_weight = _squeeze_or_expand_dimensions(
+          y_pred, y_true, sample_weight)
+
+      values = math_ops.logical_and(
+          math_ops.equal(y_true, True), math_ops.equal(y_pred, True))
+      values = math_ops.cast(values, self._dtype)
+      if sample_weight is not None:
+        sample_weight = math_ops.cast(sample_weight, self._dtype)
+        values = math_ops.multiply(values, sample_weight)
+      state_ops.assign_add(self.true_positives, math_ops.reduce_sum(values))
+
+    def result(self):
+      return array_ops.identity(self.true_positives)
+  ```
+  """
+  __metaclass__ = ABCMeta
+
+  def __init__(self, name=None, dtype=None):
+    super(Metric, self).__init__(name=name, dtype=dtype)
+    self.stateful = True  # All metric layers are stateful.
+    self.built = True
+    self._dtype = K.floatx() if dtype is None else dtypes.as_dtype(dtype).name
+
+  def __new__(cls, *args, **kwargs):
+    obj = super(Metric, cls).__new__(cls, *args, **kwargs)
+    obj.update_state = types.MethodType(
+        update_state_wrapper(obj.update_state), obj)
+    obj.result = types.MethodType(result_wrapper(obj.result), obj)
+    return obj
+
+  def __call__(self, *args, **kwargs):
+    """Accumulates statistics and then computes metric result value.
+
+    Args:
+      *args:
+      **kwargs: A mini-batch of inputs to the Metric,
+        passed on to `update_state()`.
+
+    Returns:
+      The metric value tensor.
+    """
+    update_op = self.update_state(*args, **kwargs)  # pylint: disable=not-callable
+    with ops.control_dependencies([update_op]):
+      return self.result()  # pylint: disable=not-callable
+
+  def reset_states(self):
+    """Resets all of the metric state variables.
+
+    This function is called between epochs/steps,
+    when a metric is evaluated during training.
+    """
+    for v in self.variables:
+      K.set_value(v, 0)
+
+  @abstractmethod
+  def update_state(self, *args, **kwargs):
+    """Accumulates statistics for the metric.
+
+    Note: This function is executed as a graph function in graph mode.
+    This means:
+      a) Operations on the same resource are executed in textual order.
+         This should make it easier to do things like add the updated
+         value of a variable to another, for example.
+      b) You don't need to worry about collecting the update ops to execute.
+         All update ops added to the graph by this function will be executed.
+      As a result, code should generally work the same way with graph or
+      eager execution.
+    and adds the update op to the metric layer.
+
+    Args:
+      *args:
+      **kwargs: A mini-batch of inputs to the Metric.
+    """
+    NotImplementedError('Must be implemented in subclasses.')
+
+  @abstractmethod
+  def result(self):
+    """Computes and returns the metric value tensor.
+
+    Result computation is an idempotent operation that simply calculates the
+    metric value using the state variables.
+    """
+    NotImplementedError('Must be implemented in subclasses.')
+
+  ### For use by subclasses ###
+  def add_weight(self,
+                 name,
+                 shape=(),
+                 aggregation=vs.VariableAggregation.SUM,
+                 synchronization=vs.VariableSynchronization.ON_READ,
+                 initializer=None):
+    """Adds state variable. Only for use by subclasses."""
+    return super(Metric, self).add_weight(
+        name=name,
+        shape=shape,
+        dtype=self._dtype,
+        trainable=False,
+        initializer=initializer,
+        synchronization=synchronization,
+        aggregation=aggregation)
+
+  ### End: For use by subclasses ###
+
+
+class Mean(Metric):
+  """Computes the (weighted) mean of the given values.
+
+  This metric creates two variables, `total` and `count` that are used to
+  compute the average of `values`. This average is ultimately returned as `mean`
+  which is an idempotent operation that simply divides `total` by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+  """
+
+  def __init__(self, name='mean', dtype=None):
+    """Creates a `Mean` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Mean, self).__init__(name=name, dtype=dtype)
+    # Create new state variables
+    self.total = self.add_weight(
+        'total', initializer=init_ops.zeros_initializer)
+    self.count = self.add_weight(
+        'count', initializer=init_ops.zeros_initializer)
+
+  def update_state(self, values, sample_weight=None):
+    """Accumulates statistics for computing the mean.
+
+    For example, if `values` is [1, 3, 5, 7] then the mean is 4. If
+    the `sample_weight` is specified as [1, 1, 0, 0] then the mean would be 2.
+
+    Args:
+      values: Per-example value.
+      sample_weight: Optional weighting of each example. Defaults to 1.
+    """
+    values = math_ops.cast(values, self._dtype)
+    if sample_weight is None:
+      num_values = math_ops.cast(array_ops.size(values), self._dtype)
+    else:
+      sample_weight = math_ops.cast(sample_weight, self._dtype)
+
+      # Update dimensions of weights to match with values.
+      values, _, sample_weight = _squeeze_or_expand_dimensions(
+          values, None, sample_weight)
+      sample_weight = weights_broadcast_ops.broadcast_weights(
+          sample_weight, values)
+      num_values = math_ops.reduce_sum(sample_weight)
+      values = math_ops.multiply(values, sample_weight)
+    values = math_ops.reduce_sum(values)
+
+    # Update state variables
+    state_ops.assign_add(self.total, values)
+    state_ops.assign_add(self.count, num_values)
+
+  def result(self):
+    return _safe_div(self.total, self.count)
+
+
+class MeanMetricWrapper(Mean):
+  """Wraps a stateless metric function with the Mean metric."""
+
+  def __init__(self, fn, name=None, dtype=None, **kwargs):
+    """Creates a `MeanMetricWrapper` instance.
+
+    Args:
+      fn: The metric function to wrap, with signature
+        `fn(y_true, y_pred, **kwargs)`.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      **kwargs: The keyword arguments that are passed on to `fn`.
+    """
+    super(MeanMetricWrapper, self).__init__(name=name, dtype=dtype)
+    self._fn = fn
+    self._fn_kwargs = kwargs
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates metric statistics.
+
+    `y_true` and `y_pred` should have the same shape.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be
+        a `Tensor` whose rank is either 0, or the same rank as `y_true`,
+        and must be broadcastable to `y_true`.
+    """
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+    y_pred, y_true, sample_weight = _squeeze_or_expand_dimensions(
+        y_pred, y_true, sample_weight)
+
+    matches = self._fn(y_true, y_pred, **self._fn_kwargs)
+    super(MeanMetricWrapper, self).update_state(
+        matches, sample_weight=sample_weight)
+
+  def get_config(self):
+    config = self._fn_kwargs
+    base_config = super(MeanMetricWrapper, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class BinaryAccuracy(MeanMetricWrapper):
+  """Calculates how often predictions matches labels.
+
+  This metric creates two local variables, `total` and `count` that are used to
+  compute the frequency with which `y_pred` matches `y_true`. This frequency is
+  ultimately returned as `binary accuracy`: an idempotent operation that simply
+  divides `total` by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+  """
+
+  def __init__(self, name='binary-accuracy', dtype=None, threshold=0.5):
+    """Creates a `BinaryAccuracy` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      threshold: (Optional) Float representing the threshold for deciding
+      whether prediction values are 1 or 0.
+    """
+    super(BinaryAccuracy, self).__init__(
+        binary_accuracy, name, dtype=dtype, threshold=threshold)
+
+
 @tf_export('keras.metrics.binary_accuracy')
-def binary_accuracy(y_true, y_pred):
-  return K.mean(math_ops.equal(y_true, math_ops.round(y_pred)), axis=-1)
+def binary_accuracy(y_true, y_pred, threshold=0.5):
+  threshold = math_ops.cast(threshold, y_pred.dtype)
+  y_pred = math_ops.cast(y_pred > threshold, y_pred.dtype)
+  return K.mean(math_ops.equal(y_true, y_pred), axis=-1)
 
 
 @tf_export('keras.metrics.categorical_accuracy')
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 15e793f5fc..d583379708 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -18,67 +18,72 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import numpy as np
 
-from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import layers
+from tensorflow.python.keras import metrics
+from tensorflow.python.keras.engine.training import Model
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 
 class KerasMetricsTest(test.TestCase):
 
   def test_metrics(self):
     with self.test_session():
-      y_a = keras.backend.variable(np.random.random((6, 7)))
-      y_b = keras.backend.variable(np.random.random((6, 7)))
-      for metric in [keras.metrics.binary_accuracy,
-                     keras.metrics.categorical_accuracy]:
+      y_a = K.variable(np.random.random((6, 7)))
+      y_b = K.variable(np.random.random((6, 7)))
+      for metric in [metrics.binary_accuracy, metrics.categorical_accuracy]:
         output = metric(y_a, y_b)
-        self.assertEqual(keras.backend.eval(output).shape, (6,))
+        self.assertEqual(K.eval(output).shape, (6,))
 
   def test_sparse_categorical_accuracy(self):
     with self.test_session():
-      metric = keras.metrics.sparse_categorical_accuracy
-      y_a = keras.backend.variable(np.random.randint(0, 7, (6,)))
-      y_b = keras.backend.variable(np.random.random((6, 7)))
-      self.assertEqual(keras.backend.eval(metric(y_a, y_b)).shape, (6,))
+      metric = metrics.sparse_categorical_accuracy
+      y_a = K.variable(np.random.randint(0, 7, (6,)))
+      y_b = K.variable(np.random.random((6, 7)))
+      self.assertEqual(K.eval(metric(y_a, y_b)).shape, (6,))
 
   def test_sparse_top_k_categorical_accuracy(self):
     with self.test_session():
-      y_pred = keras.backend.variable(np.array([[0.3, 0.2, 0.1],
-                                                [0.1, 0.2, 0.7]]))
-      y_true = keras.backend.variable(np.array([[1], [0]]))
-      result = keras.backend.eval(
-          keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
+      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
+      y_true = K.variable(np.array([[1], [0]]))
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
       self.assertEqual(result, 1)
-      result = keras.backend.eval(
-          keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
       self.assertEqual(result, 0.5)
-      result = keras.backend.eval(
-          keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
       self.assertEqual(result, 0.)
 
   def test_top_k_categorical_accuracy(self):
     with self.test_session():
-      y_pred = keras.backend.variable(np.array([[0.3, 0.2, 0.1],
-                                                [0.1, 0.2, 0.7]]))
-      y_true = keras.backend.variable(np.array([[0, 1, 0], [1, 0, 0]]))
-      result = keras.backend.eval(
-          keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=3))
+      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
+      y_true = K.variable(np.array([[0, 1, 0], [1, 0, 0]]))
+      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=3))
       self.assertEqual(result, 1)
-      result = keras.backend.eval(
-          keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=2))
+      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=2))
       self.assertEqual(result, 0.5)
-      result = keras.backend.eval(
-          keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=1))
+      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=1))
       self.assertEqual(result, 0.)
 
   def test_stateful_metrics(self):
     with self.test_session():
       np.random.seed(1334)
 
-      class BinaryTruePositives(keras.layers.Layer):
+      class BinaryTruePositives(layers.Layer):
         """Stateful Metric to count the total true positives over all batches.
 
         Assumes predictions and targets of shape `(samples, 1)`.
@@ -91,11 +96,11 @@ class KerasMetricsTest(test.TestCase):
 
         def __init__(self, name='true_positives', **kwargs):
           super(BinaryTruePositives, self).__init__(name=name, **kwargs)
-          self.true_positives = keras.backend.variable(value=0, dtype='int32')
+          self.true_positives = K.variable(value=0, dtype='int32')
           self.stateful = True
 
         def reset_states(self):
-          keras.backend.set_value(self.true_positives, 0)
+          K.set_value(self.true_positives, 0)
 
         def __call__(self, y_true, y_pred):
           """Computes the number of true positives in a batch.
@@ -120,14 +125,14 @@ class KerasMetricsTest(test.TestCase):
           return current_true_pos + true_pos
 
       metric_fn = BinaryTruePositives()
-      config = keras.metrics.serialize(metric_fn)
-      metric_fn = keras.metrics.deserialize(
+      config = metrics.serialize(metric_fn)
+      metric_fn = metrics.deserialize(
           config, custom_objects={'BinaryTruePositives': BinaryTruePositives})
 
       # Test on simple model
-      inputs = keras.Input(shape=(2,))
-      outputs = keras.layers.Dense(1, activation='sigmoid')(inputs)
-      model = keras.Model(inputs, outputs)
+      inputs = layers.Input(shape=(2,))
+      outputs = layers.Dense(1, activation='sigmoid')(inputs)
+      model = Model(inputs, outputs)
       model.compile(optimizer='sgd',
                     loss='binary_crossentropy',
                     metrics=['acc', metric_fn])
@@ -184,6 +189,214 @@ class KerasMetricsTest(test.TestCase):
       self.assertAllClose(
           val_outs[2], history.history['val_true_positives'][-1], atol=1e-5)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_mean(self):
+    m = metrics.Mean(name='my_mean')
+
+    # check config
+    self.assertEqual(m.name, 'my_mean')
+    self.assertTrue(m.stateful)
+    self.assertEqual(m.dtype, dtypes.float32)
+    self.assertEqual(len(m.variables), 2)
+    self.evaluate(variables.global_variables_initializer())
+
+    # check initial state
+    self.assertEqual(self.evaluate(m.total), 0)
+    self.assertEqual(self.evaluate(m.count), 0)
+
+    # check __call__()
+    self.assertEqual(self.evaluate(m(100)), 100)
+    self.assertEqual(self.evaluate(m.total), 100)
+    self.assertEqual(self.evaluate(m.count), 1)
+
+    # check update_state() and result() + state accumulation + tensor input
+    update_op = m.update_state(ops.convert_n_to_tensor([1, 5]))
+    self.evaluate(update_op)
+    self.assertAlmostEqual(self.evaluate(m.result()), 106 / 3, 2)
+    self.assertEqual(self.evaluate(m.total), 106)  # 100 + 1 + 5
+    self.assertEqual(self.evaluate(m.count), 3)
+
+    # check reset_states()
+    m.reset_states()
+    self.assertEqual(self.evaluate(m.total), 0)
+    self.assertEqual(self.evaluate(m.count), 0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_mean_with_sample_weight(self):
+    m = metrics.Mean(dtype=dtypes.float64)
+    self.assertEqual(m.dtype, dtypes.float64)
+    self.evaluate(variables.global_variables_initializer())
+
+    # check scalar weight
+    result_t = m(100, sample_weight=0.5)
+    self.assertEqual(self.evaluate(result_t), 50 / 0.5)
+    self.assertEqual(self.evaluate(m.total), 50)
+    self.assertEqual(self.evaluate(m.count), 0.5)
+
+    # check weights not scalar and weights rank matches values rank
+    result_t = m([1, 5], sample_weight=[1, 0.2])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 52 / 1.7, 2)
+    self.assertAlmostEqual(self.evaluate(m.total), 52, 2)  # 50 + 1 + 5 * 0.2
+    self.assertAlmostEqual(self.evaluate(m.count), 1.7, 2)  # 0.5 + 1.2
+
+    # check weights broadcast
+    result_t = m([1, 2], sample_weight=0.5)
+    self.assertAlmostEqual(self.evaluate(result_t), 53.5 / 2.7, 2)
+    self.assertAlmostEqual(self.evaluate(m.total), 53.5, 2)  # 52 + 0.5 + 1
+    self.assertAlmostEqual(self.evaluate(m.count), 2.7, 2)  # 1.7 + 0.5 + 0.5
+
+    # check weights squeeze
+    result_t = m([1, 5], sample_weight=[[1], [0.2]])
+    self.assertAlmostEqual(self.evaluate(result_t), 55.5 / 3.9, 2)
+    self.assertAlmostEqual(self.evaluate(m.total), 55.5, 2)  # 53.5 + 1 + 1
+    self.assertAlmostEqual(self.evaluate(m.count), 3.9, 2)  # 2.7 + 1.2
+
+    # check weights expand
+    result_t = m([[1], [5]], sample_weight=[1, 0.2])
+    self.assertAlmostEqual(self.evaluate(result_t), 57.5 / 5.1, 2)
+    self.assertAlmostEqual(self.evaluate(m.total), 57.5, 2)  # 55.5 + 1 + 1
+    self.assertAlmostEqual(self.evaluate(m.count), 5.1, 2)  # 3.9 + 1.2
+
+  def test_mean_graph_with_placeholder(self):
+    with context.graph_mode(), self.test_session() as sess:
+      m = metrics.Mean()
+      v = array_ops.placeholder(dtypes.float32)
+      w = array_ops.placeholder(dtypes.float32)
+      sess.run(variables.global_variables_initializer())
+
+      # check __call__()
+      result_t = m(v, sample_weight=w)
+      result = sess.run(result_t, feed_dict=({v: 100, w: 0.5}))
+      self.assertEqual(sess.run(m.total), 50)
+      self.assertEqual(sess.run(m.count), 0.5)
+      self.assertEqual(result, 50 / 0.5)
+
+      # check update_state() and result()
+      result = sess.run(result_t, feed_dict=({v: [1, 5], w: [1, 0.2]}))
+      self.assertAlmostEqual(sess.run(m.total), 52, 2)  # 50 + 1 + 5 * 0.2
+      self.assertAlmostEqual(sess.run(m.count), 1.7, 2)  # 0.5 + 1.2
+      self.assertAlmostEqual(result, 52 / 1.7, 2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_save_restore(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+    m = metrics.Mean()
+    checkpoint = checkpointable_utils.Checkpoint(mean=m)
+    self.evaluate(variables.global_variables_initializer())
+
+    # update state
+    self.evaluate(m(100.))
+    self.evaluate(m(200.))
+
+    # save checkpoint and then add an update
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.evaluate(m(1000.))
+
+    # restore to the same checkpoint mean object
+    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+    self.evaluate(m(300.))
+    self.assertEqual(200., self.evaluate(m.result()))
+
+    # restore to a different checkpoint mean object
+    restore_mean = metrics.Mean()
+    restore_checkpoint = checkpointable_utils.Checkpoint(mean=restore_mean)
+    status = restore_checkpoint.restore(save_path)
+    restore_update = restore_mean(300.)
+    status.assert_consumed().run_restore_ops()
+    self.evaluate(restore_update)
+    self.assertEqual(200., self.evaluate(restore_mean.result()))
+    self.assertEqual(3, self.evaluate(restore_mean.count))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_binary_accuracy(self):
+    acc_obj = metrics.BinaryAccuracy(name='my acc')
+
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.global_variables_initializer())
+
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([[1], [0]], [[1], [0]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
+
+    # check y_pred squeeze
+    update_op = acc_obj.update_state([[1], [1]], [[[1]], [[0]]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertAlmostEqual(result, 0.75, 2)  # 3/4
+
+    # check y_true squeeze
+    result_t = acc_obj([[[1]], [[1]]], [[1], [0]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.67, 2)  # 4/6
+
+    # check with sample_weight
+    result_t = acc_obj([[1], [1]], [[1], [0]], [[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.67, 2)  # 4.5/6.7
+
+    # check incompatible shapes
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Shapes \(1,\) and \(2,\) are incompatible'):
+      acc_obj.update_state([1, 1], [1])
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_binary_accuracy_threshold(self):
+    acc_obj = metrics.BinaryAccuracy(threshold=0.7)
+    self.evaluate(variables.global_variables_initializer())
+    result_t = acc_obj([[1], [1], [0], [0]], [[0.9], [0.6], [0.4], [0.8]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.5, 2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_invalid_result(self):
+
+    class InvalidResult(metrics.Metric):
+
+      def __init__(self, name='invalid-result', dtype=dtypes.float64):
+        super(InvalidResult, self).__init__(name=name, dtype=dtype)
+
+      def update_state(self, *args, **kwargs):
+        pass
+
+      def result(self):
+        return 1
+
+    invalid_result_obj = InvalidResult()
+    with self.assertRaisesRegexp(
+        TypeError,
+        'Metric invalid-result\'s result must be a Tensor or Operation, given:'
+    ):
+      invalid_result_obj.result()
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_invalid_update(self):
+
+    class InvalidUpdate(metrics.Metric):
+
+      def __init__(self, name='invalid-update', dtype=dtypes.float64):
+        super(InvalidUpdate, self).__init__(name=name, dtype=dtype)
+
+      def update_state(self, *args, **kwargs):
+        return [1]
+
+      def result(self):
+        pass
+
+    invalid_update_obj = InvalidUpdate()
+    with self.assertRaisesRegexp(
+        TypeError,
+        'Metric invalid-update\'s update must be a Tensor or Operation, given:'
+    ):
+      invalid_update_obj.update_state()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index 3ac4852eff..5fbc191e78 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -29,6 +29,8 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training.checkpointable import data_structures
@@ -65,6 +67,22 @@ class SimpleTestModel(keras.Model):
     return self.dense2(x)
 
 
+class SimpleConvTestModel(keras.Model):
+
+  def __init__(self, num_classes=10):
+    super(SimpleConvTestModel, self).__init__(name='test_model')
+    self.num_classes = num_classes
+
+    self.conv1 = keras.layers.Conv2D(32, (3, 3), activation='relu')
+    self.flatten = keras.layers.Flatten()
+    self.dense1 = keras.layers.Dense(num_classes, activation='softmax')
+
+  def call(self, x):
+    x = self.conv1(x)
+    x = self.flatten(x)
+    return self.dense1(x)
+
+
 class MultiIOTestModel(keras.Model):
 
   def __init__(self, use_bn=False, use_dp=False, num_classes=(2, 3)):
@@ -174,6 +192,213 @@ def get_nested_model_3(input_dim, num_classes):
 class ModelSubclassingTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  def test_invalid_input_shape_build(self):
+    num_classes = 2
+    input_dim = 50
+
+    model = SimpleTestModel(num_classes=num_classes,
+                            use_dp=True,
+                            use_bn=True)
+
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    with self.assertRaisesRegexp(
+        ValueError, 'input shape is not one of the valid types'):
+      model.build(input_shape=tensor_shape.Dimension(input_dim))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_embed_dtype_with_subclass_build(self):
+    class Embedding(keras.layers.Layer):
+      """An Embedding layer."""
+
+      def __init__(self, vocab_size, embedding_dim, **kwargs):
+        super(Embedding, self).__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+
+      def build(self, _):
+        self.embedding = self.add_variable(
+            'embedding_kernel',
+            shape=[self.vocab_size, self.embedding_dim],
+            dtype=np.float32,
+            initializer=init_ops.random_uniform_initializer(-0.1, 0.1),
+            trainable=True)
+
+      def call(self, x):
+        return embedding_ops.embedding_lookup(self.embedding, x)
+
+    class EmbedModel(keras.Model):
+
+      def __init__(self, vocab_size, embed_size):
+        super(EmbedModel, self).__init__()
+        self.embed1 = Embedding(vocab_size, embed_size)
+
+      def call(self, inputs):
+        return self.embed1(inputs)
+
+    model = EmbedModel(100, 20)
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    with self.assertRaisesRegexp(
+        ValueError, 'if your layers do not support float type inputs'):
+      model.build(input_shape=(35, 20))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_single_time_step_rnn_build(self):
+    dim = 4
+    timesteps = 1
+    batch_input_shape = (None, timesteps, dim)
+    units = 3
+
+    class SimpleRNNModel(keras.Model):
+
+      def __init__(self):
+        super(SimpleRNNModel, self).__init__()
+        self.lstm = keras.layers.LSTM(units)
+
+      def call(self, inputs):
+        return self.lstm(inputs)
+
+    model = SimpleRNNModel()
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build(batch_input_shape)
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+    model(array_ops.ones((32, timesteps, dim)))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_single_io_subclass_build(self):
+    num_classes = 2
+    input_dim = 50
+    batch_size = None
+
+    model = SimpleTestModel(num_classes=num_classes,
+                            use_dp=True,
+                            use_bn=True)
+
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build(input_shape=(batch_size, input_dim))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+    model(array_ops.ones((32, input_dim)))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_single_io_dimension_subclass_build(self):
+    num_classes = 2
+    input_dim = tensor_shape.Dimension(50)
+    batch_size = tensor_shape.Dimension(None)
+
+    model = SimpleTestModel(num_classes=num_classes,
+                            use_dp=True,
+                            use_bn=True)
+
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build(input_shape=(batch_size, input_dim))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+    model(array_ops.ones((32, input_dim)))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_multidim_io_subclass_build(self):
+    num_classes = 10
+    # Input size, e.g. image
+    batch_size = 32
+    input_shape = (32, 32, 3)
+
+    model = SimpleConvTestModel(num_classes)
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    batch_input_shape = (batch_size,) + input_shape
+    model.build(input_shape=batch_input_shape)
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+
+    model(array_ops.ones(batch_input_shape))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_tensorshape_io_subclass_build(self):
+    num_classes = 10
+    # Input size, e.g. image
+    batch_size = None
+    input_shape = (32, 32, 3)
+
+    model = SimpleConvTestModel(num_classes)
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build(
+        input_shape=tensor_shape.TensorShape((batch_size,) + input_shape))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+
+    model(array_ops.ones((32,) + input_shape))
+
+  def test_subclass_save_model(self):
+    num_classes = 10
+    # Input size, e.g. image
+    batch_size = None
+    input_shape = (32, 32, 3)
+
+    model = SimpleConvTestModel(num_classes)
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build(
+        input_shape=tensor_shape.TensorShape((batch_size,) + input_shape))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+    weights = model.get_weights()
+
+    tf_format_name = os.path.join(self.get_temp_dir(), 'ckpt')
+    model.save_weights(tf_format_name)
+    if h5py is not None:
+      hdf5_format_name = os.path.join(self.get_temp_dir(), 'weights.h5')
+      model.save_weights(hdf5_format_name)
+
+    model = SimpleConvTestModel(num_classes)
+    model.build(
+        input_shape=tensor_shape.TensorShape((batch_size,) + input_shape))
+    if h5py is not None:
+      model.load_weights(hdf5_format_name)
+      self.assertAllClose(weights, model.get_weights())
+    model.load_weights(tf_format_name)
+    self.assertAllClose(weights, model.get_weights())
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_multi_io_subclass_build(self):
+    batch_size = None
+    num_samples = 1000
+    input_dim = 50
+    model = MultiIOTestModel()
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    batch_input_shape = tensor_shape.TensorShape((batch_size, input_dim))
+    model.build(
+        input_shape=[batch_input_shape, batch_input_shape])
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+    x1 = array_ops.ones((num_samples, input_dim))
+    x2 = array_ops.ones((num_samples, input_dim))
+    model([x1, x2])
+
+  @test_util.run_in_graph_and_eager_modes
   def test_single_io_workflow_with_np_arrays(self):
     num_classes = 2
     num_samples = 100
@@ -750,6 +975,16 @@ class CustomCallModel(keras.Model):
       return combined
 
 
+class TrainingNoDefaultModel(keras.Model):
+
+  def __init__(self):
+    super(TrainingNoDefaultModel, self).__init__()
+    self.dense1 = keras.layers.Dense(1)
+
+  def call(self, x, training):
+    return self.dense1(x)
+
+
 class CustomCallSignatureTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -767,6 +1002,32 @@ class CustomCallSignatureTests(test.TestCase):
     self.assertAllClose(expected_output, self.evaluate(output))
 
   @test_util.run_in_graph_and_eager_modes
+  def test_training_args_call_build(self):
+    input_dim = 2
+
+    model = TrainingNoDefaultModel()
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build((None, input_dim))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_custom_call_kwargs_and_build(self):
+    first_input_shape = (2, 3)
+    second_input_shape = (2, 5)
+
+    model = CustomCallModel()
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    with self.assertRaisesRegexp(
+        ValueError, 'cannot build your model if it has positional'):
+      model.build(input_shape=[first_input_shape, second_input_shape])
+
+  @test_util.run_in_graph_and_eager_modes
   def test_inputs_in_signature(self):
 
     class HasInputsAndOtherPositional(keras.Model):
@@ -829,14 +1090,9 @@ class CustomCallSignatureTests(test.TestCase):
 
   def test_training_no_default(self):
 
-    class TrainingNoDefault(keras.Model):
-
-      def call(self, x, training):
-        return x
-
     with context.graph_mode():
-      model = TrainingNoDefault()
-      arg = array_ops.ones([])
+      model = TrainingNoDefaultModel()
+      arg = array_ops.ones([1, 1])
       model(arg, True)
       six.assertCountEqual(self, [arg], model.inputs)
 
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index 17aba7d86c..6e8ee06ff5 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from collections import OrderedDict
 import numpy as np
 
 from tensorflow.python import keras
@@ -185,75 +184,3 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   # for further checks in the caller function
   return actual_output
 
-
-def _combine_named_parameters(**kwargs):
-  """Generate combinations based on its keyword arguments.
-
-  Two sets of returned combinations can be concatenated using +.  Their product
-  can be computed using `times()`.
-
-  Args:
-    **kwargs: keyword arguments of form `option=[possibilities, ...]`
-         or `option=the_only_possibility`.
-
-  Returns:
-    a list of dictionaries for each combination. Keys in the dictionaries are
-    the keyword argument names.  Each key has one value - one of the
-    corresponding keyword argument values.
-  """
-  if not kwargs:
-    return [OrderedDict()]
-
-  sort_by_key = lambda k: k[0][0]
-  kwargs = OrderedDict(sorted(kwargs.items(), key=sort_by_key))
-  first = list(kwargs.items())[0]
-
-  rest = dict(list(kwargs.items())[1:])
-  rest_combined = _combine_named_parameters(**rest)
-
-  key = first[0]
-  values = first[1]
-  if not isinstance(values, list):
-    values = [values]
-
-  combinations = [
-      OrderedDict(sorted(list(combined.items()) + [(key, v)], key=sort_by_key))
-      for v in values
-      for combined in rest_combined
-  ]
-  return combinations
-
-
-def generate_combinations_with_testcase_name(**kwargs):
-  """Generate combinations based on its keyword arguments using combine().
-
-  This function calls combine() and appends a testcase name to the list of
-  dictionaries returned. The 'testcase_name' key is a required for named
-  parameterized tests.
-
-  Args:
-    **kwargs: keyword arguments of form `option=[possibilities, ...]`
-         or `option=the_only_possibility`.
-
-  Returns:
-    a list of dictionaries for each combination. Keys in the dictionaries are
-    the keyword argument names.  Each key has one value - one of the
-    corresponding keyword argument values.
-  """
-  combinations = _combine_named_parameters(**kwargs)
-  named_combinations = []
-  for combination in combinations:
-    assert isinstance(combination, OrderedDict)
-    name = ''.join([
-        '_{}_{}'.format(
-            ''.join(filter(str.isalnum, key)),
-            ''.join(filter(str.isalnum, str(value))))
-        for key, value in combination.items()
-    ])
-    named_combinations.append(
-        OrderedDict(
-            list(combination.items()) + [('testcase_name',
-                                          '_test{}'.format(name))]))
-
-  return named_combinations
-
diff --git a/tensorflow/python/keras/utils/np_utils.py b/tensorflow/python/keras/utils/np_utils.py
index 9d9c72b162..c24e87308b 100644
--- a/tensorflow/python/keras/utils/np_utils.py
+++ b/tensorflow/python/keras/utils/np_utils.py
@@ -33,7 +33,8 @@ def to_categorical(y, num_classes=None):
       num_classes: total number of classes.
 
   Returns:
-      A binary matrix representation of the input.
+      A binary matrix representation of the input. The classes axis is placed
+      last.
   """
   y = np.array(y, dtype='int')
   input_shape = y.shape